In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df1=df.drop(columns=['alone','alive','adult_male','embark_town','who','class'])
df1.sample()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
789,0,1,male,46.0,0,0,79.2,C,B


In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   survived  891 non-null    int64   
 1   pclass    891 non-null    int64   
 2   sex       891 non-null    object  
 3   age       714 non-null    float64 
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   fare      891 non-null    float64 
 7   embarked  889 non-null    object  
 8   deck      203 non-null    category
dtypes: category(1), float64(2), int64(4), object(2)
memory usage: 57.0+ KB


In [6]:
df1.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
deck,688


In [7]:
# # 1️⃣ Make sure column is categorical
# df['deck'] = df['deck'].astype('category')

# # 2️⃣ Add the new category FIRST
# df['deck'] = df['deck'].cat.add_categories(['Unknown'])

# # 3️⃣ Now fill missing values
# df['deck'] = df['deck'].fillna('Unknown')

df1['deck'] = df1['deck'].astype('object')
df1['deck'] = df1['deck'].fillna('Unknown')   # ✅ works

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      891 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [9]:
df1.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
deck,0


In [10]:
df2=df1.fillna(df1.groupby(['sex','pclass'])['age'].transform('mean'))
df2.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
deck,0


In [11]:
df2['age'].fillna(df2['age'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2['age'].fillna(df2['age'].mean(),inplace=True)


In [12]:
df2.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,2
deck,0


In [13]:
df2['embarked'].fillna(df2['embarked'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2['embarked'].fillna(df2['embarked'].mode()[0],inplace=True)


In [14]:
df2.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,0
deck,0


In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df2.drop('survived',axis=1),df['survived'],test_size=0.3,random_state=1)

In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 114 to 37
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    623 non-null    int64  
 1   sex       623 non-null    object 
 2   age       623 non-null    float64
 3   sibsp     623 non-null    int64  
 4   parch     623 non-null    int64  
 5   fare      623 non-null    float64
 6   embarked  623 non-null    object 
 7   deck      623 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 43.8+ KB


In [17]:
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,deck
114,3,female,17.000000,0,0,14.4583,C,Unknown
874,2,female,28.000000,1,0,24.0000,C,Unknown
76,3,male,29.699118,0,0,7.8958,S,Unknown
876,3,male,20.000000,0,0,9.8458,S,Unknown
674,2,male,29.699118,0,0,0.0000,S,Unknown
...,...,...,...,...,...,...,...,...
715,3,male,19.000000,0,0,7.6500,S,F
767,3,female,30.500000,0,0,7.7500,Q,Unknown
72,2,male,21.000000,0,0,73.5000,S,Unknown
235,3,female,29.699118,0,0,7.5500,S,Unknown


In [18]:
# standardization
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train[['age','fare']])
sc_trained=scaler.transform(X_train[['age','fare']])
sc_test=scaler.transform(X_test[['age','fare']])
std_train_df=pd.DataFrame(sc_trained,columns=['age','fare']).reset_index(drop=True)
std_test_df=pd.DataFrame(sc_test,columns=['age','fare']).reset_index(drop=True)



In [19]:
encoding_train=X_train[['pclass']]
encoding_test=X_test[['pclass']]


In [20]:
#ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
encoder=OrdinalEncoder(categories=[[1,2,3]])
encoder.fit(encoding_train)
encoding1=encoder.transform(encoding_train)
encoding2=encoder.transform(encoding_test)

In [21]:
encoded_train_pclass=pd.DataFrame(encoding1,columns=['pclass']).reset_index(drop=True)
encoded_test_pclass=pd.DataFrame(encoding2,columns=['pclass']).reset_index(drop=True)

In [22]:
X_train.sample()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,deck
398,2,male,23.0,0,0,10.5,S,Unknown


In [23]:
#One hot encoding
from sklearn.preprocessing import OneHotEncoder
hot_enc=OneHotEncoder(handle_unknown='ignore',sparse_output=False)
hot_enc_train_col=hot_enc.fit_transform(X_train[['sex','embarked','deck']])
hot_enc_test_col=hot_enc.fit_transform(X_test[['sex','embarked','deck']])
one_hot_train_df=pd.DataFrame(
    hot_enc_train_col,
    columns=hot_enc.get_feature_names_out(['sex','embarked','deck'])
)
one_hot_test_df=pd.DataFrame(
    hot_enc_test_col,
    columns=hot_enc.get_feature_names_out(['sex','embarked','deck'])
)


In [33]:
one_hot_train_df.shape

(623, 13)

In [24]:
final_X_train=pd.concat([std_train_df,encoded_train_pclass,one_hot_train_df],axis=1)
final_X_test=pd.concat([std_test_df,encoded_test_pclass,one_hot_test_df],axis=1)



In [25]:
y_train.head()

Unnamed: 0,survived
114,0
874,1
76,0
876,0
674,0


In [26]:
from sklearn.tree import DecisionTreeClassifier
dec_tree=DecisionTreeClassifier()
dec_tree.fit(final_X_train,y_train)

In [27]:
final_X_train.head(2)

Unnamed: 0,age,fare,pclass,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_Unknown
0,-0.980483,-0.357932,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.151129,-0.152797,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
final_X_test.head(2).shape

(2, 16)

In [29]:
y_pred=dec_tree.predict(final_X_test)

In [30]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7276119402985075

In [31]:
import pickle

In [32]:
pickle.dump(scaler, open('scaler.pkl', 'wb'))
pickle.dump(hot_enc,open('ohe.pkl','wb'))
pickle.dump(encoder,open('enc.pkl','wb'))
pickle.dump(dec_tree,open('clf.pkl','wb'))

