In [57]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv")

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [58]:
#Null values in each dataset
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [59]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Handle Missing Values

In [60]:
train['Age'] = train['Age'].fillna(train['Age'].median())

train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

train['HasCabin'] = train['Cabin'].notnull().astype(int)

train = train.drop(columns=['Cabin'])


Encoding variables

In [61]:
# Encode Categorical Variables
train['Sex'] = train['Sex'].map({'male':0, 'female':1})

# One-hot encode Embarked
train = pd.get_dummies(train, columns=['Embarked'], drop_first=True)


In [62]:
# Feature Engineering
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

# IsAlone
train['IsAlone'] = (train['FamilySize'] == 1).astype(int)


Extract Title from Name

In [63]:
# Extract title
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Group rare titles
train['Title'] = train['Title'].replace(['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')

# Map titles to numeric
title_mapping = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
train['Title'] = train['Title'].map(title_mapping)

# Fill missing titles with 0 (if any)
train['Title'] = train['Title'].fillna(0)


AgeGroup mapping

In [64]:
train['AgeGroup'] = pd.cut(train['Age'], bins=[0, 12, 18, 35, 60, 80], labels=[0,1,2,3,4])

# 0: child
# 1: teenager
# 2: adult
# 3: middle-aged
# 4: senior

Fare Groups

In [65]:
train['FareGroup'] = pd.qcut(train['Fare'], 4, labels=[0,1,2,3])

FamilySize Groups

In [66]:
train['FamilySizeGroup'] = pd.cut(train['FamilySize'], bins=[0,1,4,11], labels=[0,1,2])

Drop unused columns

In [67]:
train = train.drop(columns=['Name', 'Ticket', 'PassengerId', 'Age', 'Fare', 'SibSp', 'Parch'])

In [68]:
train.head()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Survived         891 non-null    int64   
 1   Pclass           891 non-null    int64   
 2   Sex              891 non-null    int64   
 3   HasCabin         891 non-null    int64   
 4   Embarked_Q       891 non-null    bool    
 5   Embarked_S       891 non-null    bool    
 6   FamilySize       891 non-null    int64   
 7   IsAlone          891 non-null    int64   
 8   Title            891 non-null    float64 
 9   AgeGroup         891 non-null    category
 10  FareGroup        891 non-null    category
 11  FamilySizeGroup  891 non-null    category
dtypes: bool(2), category(3), float64(1), int64(6)
memory usage: 53.7 KB


In [69]:
train.to_csv("train_cleaned.csv", index=False)