In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

titanic = sns.load_dataset('titanic')

In [2]:
df= pd.DataFrame(titanic)

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [12]:
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [4]:
df.isnull().sum() 

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [14]:
df_imputed= df.copy()
from sklearn.impute import KNNImputer
imputer= KNNImputer()
df_imputed[['age']]= imputer.fit_transform(df[['age']])


In [15]:
df_imputed.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [16]:
from sklearn.impute import SimpleImputer
imputer= SimpleImputer(strategy='most_frequent')
df_imputed[['embarked','deck','embark_town']]= imputer.fit_transform(df_imputed[['embarked','deck','embark_town']])

In [18]:
df_imputed.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [22]:
from sklearn.preprocessing import StandardScaler
numerical_cols= df_imputed.select_dtypes(include=['int64','float64']).columns
scaler= StandardScaler()
scaled_data= scaler.fit_transform(df_imputed[numerical_cols])
df_imputed[numerical_cols]= pd.DataFrame(scaled_data,columns=numerical_cols)

In [23]:
df_imputed.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,-0.789272,0.827377,male,-0.592481,0.432793,-0.473674,-0.502445,S,Third,man,True,C,Southampton,no,False
1,1.26699,-1.566107,female,0.638789,0.432793,-0.473674,0.786845,C,First,woman,False,C,Cherbourg,yes,False
2,1.26699,0.827377,female,-0.284663,-0.474545,-0.473674,-0.488854,S,Third,woman,False,C,Southampton,yes,True
3,1.26699,-1.566107,female,0.407926,0.432793,-0.473674,0.42073,S,First,woman,False,C,Southampton,yes,False
4,-0.789272,0.827377,male,0.407926,-0.474545,-0.473674,-0.486337,S,Third,man,True,C,Southampton,no,True


In [25]:
df= df_imputed.copy()

In [28]:
df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [33]:
embarked_dummies = pd.get_dummies(df['embarked'], prefix='embarked')

df = pd.concat([df, embarked_dummies], axis=1)
df.drop('embarked', axis=1, inplace=True)

In [34]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alive,alone,embarked_C,embarked_Q,embarked_S
0,-0.789272,0.827377,male,-0.592481,0.432793,-0.473674,-0.502445,Third,man,True,C,Southampton,no,False,False,False,True
1,1.26699,-1.566107,female,0.638789,0.432793,-0.473674,0.786845,First,woman,False,C,Cherbourg,yes,False,True,False,False
2,1.26699,0.827377,female,-0.284663,-0.474545,-0.473674,-0.488854,Third,woman,False,C,Southampton,yes,True,False,False,True
3,1.26699,-1.566107,female,0.407926,0.432793,-0.473674,0.42073,First,woman,False,C,Southampton,yes,False,False,False,True
4,-0.789272,0.827377,male,0.407926,-0.474545,-0.473674,-0.486337,Third,man,True,C,Southampton,no,True,False,False,True


In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

In [36]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alive,alone,embarked_C,embarked_Q,embarked_S
0,-0.789272,0.827377,male,-0.592481,0.432793,-0.473674,-0.502445,2,man,True,C,Southampton,no,False,False,False,True
1,1.26699,-1.566107,female,0.638789,0.432793,-0.473674,0.786845,0,woman,False,C,Cherbourg,yes,False,True,False,False
2,1.26699,0.827377,female,-0.284663,-0.474545,-0.473674,-0.488854,2,woman,False,C,Southampton,yes,True,False,False,True
3,1.26699,-1.566107,female,0.407926,0.432793,-0.473674,0.42073,0,woman,False,C,Southampton,yes,False,False,False,True
4,-0.789272,0.827377,male,0.407926,-0.474545,-0.473674,-0.486337,2,man,True,C,Southampton,no,True,False,False,True


In [41]:
df['fare']= pd.cut(df['fare'],bins=3,labels=['Low','Moderate','High'])

In [47]:
df.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alive,alone,embarked_C,embarked_Q,embarked_S
0,-0.789272,0.827377,male,-0.592481,0.432793,-0.473674,Low,2,man,True,C,Southampton,no,False,False,False,True
1,1.26699,-1.566107,female,0.638789,0.432793,-0.473674,Low,0,woman,False,C,Cherbourg,yes,False,True,False,False
2,1.26699,0.827377,female,-0.284663,-0.474545,-0.473674,Low,2,woman,False,C,Southampton,yes,True,False,False,True
3,1.26699,-1.566107,female,0.407926,0.432793,-0.473674,Low,0,woman,False,C,Southampton,yes,False,False,False,True
4,-0.789272,0.827377,male,0.407926,-0.474545,-0.473674,Low,2,man,True,C,Southampton,no,True,False,False,True
5,-0.789272,0.827377,male,0.0,-0.474545,-0.473674,Low,2,man,True,C,Queenstown,no,True,False,True,False
6,-0.789272,-1.566107,male,1.870059,-0.474545,-0.473674,Low,0,man,True,E,Southampton,no,True,False,False,True
7,-0.789272,0.827377,male,-2.131568,2.24747,0.76763,Low,2,child,False,C,Southampton,no,False,False,False,True
8,1.26699,0.827377,female,-0.207709,-0.474545,2.008933,Low,2,woman,False,C,Southampton,yes,False,False,False,True
9,1.26699,-0.369365,female,-1.208115,0.432793,-0.473674,Low,1,child,False,C,Cherbourg,yes,False,True,False,False
