In [40]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error
from sklearn.preprocessing import LabelEncoder

# remove warning
import warnings
warnings.filterwarnings('ignore')

In [41]:
# Fetch seaborn's bundled Titanic sample and preview the first five rows.
df = sns.load_dataset('titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [42]:
# Missing-value count per column, largest first.
(
    df.isna()
      .sum()
      .sort_values(ascending=False)
)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [43]:
# 'deck' is missing for 688 of the 891 rows, so it is dropped rather than imputed.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['deck'], errors='ignore')

In [44]:
# Re-count the missing values per column (largest first) after dropping 'deck'.
(
    df.isna()
      .sum()
      .sort_values(ascending=False)
)

age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [45]:
# Preparation for imputing the missing ages with a random forest regressor:
# the model needs numeric inputs, so every text/categorical column is replaced
# by integer codes.

cat_columns = ['sex','embarked','who','class','embark_town','alive']

# One dedicated encoder per column, kept so the codes can be mapped back later
# via label_encoders[<column>].inverse_transform(...).
label_encoders = {name: LabelEncoder() for name in cat_columns}

for name, encoder in label_encoders.items():
    # Fit on the column and overwrite it with its integer codes in one step.
    df[name] = encoder.fit_transform(df[name])

In [46]:
# Split the dataset into two complementary parts based on 'age' only:
# rows whose age must be imputed, and rows whose age is known (training data).
age_is_missing = df['age'].isna()

# .copy() gives each part its own data, so filling in ages later does not write
# into a slice of df (which triggers pandas' SettingWithCopyWarning).
df_with_missing = df.loc[age_is_missing].copy()

# Using the inverse of the same mask (instead of a blanket dropna()) guarantees
# the two parts together cover every row of df exactly once.
df_without_missing = df.loc[~age_is_missing].copy()

In [47]:
# Sanity check: the two parts should add up to the original number of rows.
shape_report = [
    ("The shape of the original dataset is: ", df),
    ("The shape of the dataset with missing values removed is: ", df_without_missing),
    ("The shape of the dataset with missing values is: ", df_with_missing),
]
for label, frame in shape_report:
    print(label, frame.shape)

The shape of the original dataset is:  (891, 14)
The shape of the dataset with missing values removed is:  (714, 14)
The shape of the dataset with missing values is:  (177, 14)


In [48]:
# Fit a random forest regressor that predicts 'age' from all other columns,
# using only the rows where age is known, and score it on a 20% hold-out.
X = df_without_missing.drop(columns=['age'])
y = df_without_missing['age']

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Seed the forest as well as the split: without random_state the bootstrap
# samples differ on every run, so the metrics below (and the ages imputed from
# this model) would not be reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Hold-out metrics. Note that r2_score is a goodness-of-fit score (higher is
# better), not an error; the printed label is kept as-is.
print("mean_absolute_error  :" ,mean_absolute_error(y_test,y_pred))
print("mean_squared_error :" ,mean_squared_error(y_test,y_pred))
print("root_mean_squared_error  :" ,root_mean_squared_error(y_test,y_pred))
print("r2 error  :" ,r2_score(y_test,y_pred))

mean_absolute_error  : 8.71243412157477
mean_squared_error : 125.43389839664319
root_mean_squared_error  : 11.199727603680511
r2 error  : 0.32345708496978665


In [49]:
# Predict an age for every row that lacks one, feeding the model the same
# feature layout it was trained on (all columns except 'age').
predict_missing_values = model.predict(
    df_with_missing.drop(columns=['age'])
)

In [50]:
# replace the missing values with the predicted values
# assign() builds a new frame instead of writing into df_with_missing in place.
# The in-place column assignment targets a filtered slice of df and triggers
# pandas' SettingWithCopyWarning; this form avoids that and is safe to re-run.
df_with_missing = df_with_missing.assign(age=predict_missing_values)

In [51]:
# Stack the rows with imputed ages on top of the rows whose age was known.
# The original index labels are preserved, so the result is not in the
# original row order.
frames_to_stack = [df_with_missing, df_without_missing]
df_complete = pd.concat(frames_to_stack, axis=0)

# The row count should match the original dataset.
print("The shape of the complete dataframe is: ", df_complete.shape)

# Preview the first five rows (these are imputed rows, since they come first).
df_complete.head()

The shape of the complete dataframe is:  (891, 14)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
5,0,3,1,31.22131,0,0,8.4583,1,2,1,True,1,0,True
17,1,2,1,35.146622,0,0,13.0,2,1,1,True,2,1,True
19,1,3,0,18.979333,0,0,7.225,0,2,2,False,0,1,True
26,0,3,1,35.34781,0,0,7.225,0,2,1,True,0,0,True
28,1,3,0,21.056818,0,0,7.8792,1,2,2,False,1,1,True
