In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('titanic.csv')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,0,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,1,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,0,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,0,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [45]:
# Checking missing value in each column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [46]:
# Replace missing values in 'Age' with mean age and drop rows with missing 'Embarked'
mean_age = df['Age'].mean()
df.fillna({'Age': mean_age}, inplace = True)

df.dropna(subset = ['Embarked'], inplace = True)
# no missing values in 'Embarked' column already so no effect would take place
print("Missing values after replacing 'Age' and dropping 'Embarked':")
df.isna().sum()

Missing values after replacing 'Age' and dropping 'Embarked':


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [47]:
# Convert 'Sex' column categories into numerical values
df['Sex'] = df['Sex'].replace({'male':0 , 'female': 1}) # Alternatively: df['Sex'] = df['Sex'].astype('category').cat.codes
df.head()

  df['Sex'] = df['Sex'].replace({'male':0 , 'female': 1}) # Alternatively: df['Sex'] = df['Sex'].astype('category').cat.codes


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


In [48]:
# Applying One-Hot Encoding to 'Embarked' column
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True, prefix='Embarked')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S
0,892,0,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,True,False
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,False,True
2,894,0,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,True,False
3,895,0,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,False,True
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,False,True
5,897,0,3,"Svensson, Mr. Johan Cervin",0,14.0,0,0,7538,9.225,,False,True
6,898,1,3,"Connolly, Miss. Kate",1,30.0,0,0,330972,7.6292,,True,False
7,899,0,2,"Caldwell, Mr. Albert Francis",0,26.0,1,1,248738,29.0,,False,True
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",1,18.0,0,0,2657,7.2292,,False,False
9,901,0,3,"Davies, Mr. John Samuel",0,21.0,2,0,A/4 48871,24.15,,False,True


In [49]:
# Feature Selection and Scaling 
features = ['Age', 'Fare', 'Sex', 'Pclass']
X = df[features]
Y = df['Survived']
scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=features)
X_scaled_df.head(10)

Unnamed: 0,Age,Fare,Sex,Pclass
0,0.334993,-0.497811,-0.755929,0.873482
1,1.32553,-0.51266,1.322876,0.873482
2,2.514175,-0.464532,-0.755929,-0.315819
3,-0.25933,-0.482888,-0.755929,0.873482
4,-0.655545,-0.417971,1.322876,0.873482
5,-1.289489,-0.472814,-0.755929,0.873482
6,-0.021601,-0.501392,1.322876,0.873482
7,-0.338573,-0.118681,-0.755929,-0.315819
8,-0.972517,-0.508555,1.322876,0.873482
9,-0.734788,-0.205535,-0.755929,0.873482


In [50]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled_df, # normally distributed features after scaling
    Y, # target
    test_size = 0.2, # 20% for testing and 80% for training
    random_state=42, # for reproducibility
    stratify=Y # similar proportion of 'Survived' in both sets
)

print("X_train shape (80%):", X_train.shape)
print("X_test shape (20%):", X_test.shape)    
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

X_train shape (80%): (334, 4)
X_test shape (20%): (84, 4)
Y_train shape: (334,)
Y_test shape: (84,)


In [32]:
X_train

Unnamed: 0,Age,Fare,Sex,Pclass
163,0.000000,-0.512660,-0.755929,0.873482
223,-0.734788,-0.498409,-0.755929,0.873482
183,0.000000,-0.499229,-0.755929,0.873482
21,-1.685704,-0.581234,-0.755929,0.873482
111,0.000000,-0.498706,1.322876,0.873482
...,...,...,...,...
58,0.000000,-0.349696,-0.755929,0.873482
287,-0.497059,0.835227,-0.755929,-1.505120
307,-2.333119,-0.470576,-0.755929,0.873482
93,0.000000,-0.493856,-0.755929,0.873482


In [35]:
Y_train

163    0
223    0
183    0
21     0
111    1
      ..
58     0
287    0
307    0
93     0
228    0
Name: Survived, Length: 334, dtype: int64

In [33]:
X_test

Unnamed: 0,Age,Fare,Sex,Pclass
410,0.000000,-0.499229,1.322876,0.873482
364,-0.417816,0.354841,1.322876,-1.505120
334,-0.259330,-0.496618,-0.755929,0.873482
176,-0.814031,0.020107,1.322876,-0.315819
94,-0.417816,-0.172405,-0.755929,-1.505120
...,...,...,...,...
200,0.000000,-0.360441,1.322876,0.873482
178,0.453857,0.060401,1.322876,-0.315819
351,-0.417816,-0.449981,-0.755929,-0.315819
298,-0.021601,0.176804,-0.755929,-1.505120


In [34]:
Y_test

410    1
364    1
334    0
176    1
94     0
      ..
200    1
178    1
351    0
298    0
118    0
Name: Survived, Length: 84, dtype: int64