In [4]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Location of the Titanic training CSV. Set the TITANIC_TRAIN_CSV environment
# variable to point at a local copy; when it is unset, the original
# machine-specific path is used so existing setups keep working unchanged.
TRAIN_CSV_PATH = os.environ.get("TITANIC_TRAIN_CSV", r"E:\Jupyter Notebooks\train.csv")
df = pd.read_csv(TRAIN_CSV_PATH)

In [6]:
df['Cabin'].value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
# Drop the identifier / free-text columns and the mostly-empty Cabin column;
# the remaining columns are the modelling features plus the Survived target.
dftrimmed = df.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [11]:
dftrimmed

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


**Train test split as the first step**

In [12]:
# Hold out 20% of the rows before any imputer/encoder is fitted, so the test
# split never influences the preprocessing statistics.
xtrain, xtest, ytrain, ytest = train_test_split(
    dftrimmed.drop(columns = ['Survived']),
    dftrimmed['Survived'],
    test_size = 0.2,
    random_state = 42,
)

**applying imputation on age and embarked**

In [13]:
dftrimmed['Age'].isnull().sum()

177

In [14]:
dftrimmed['Embarked'].isnull().sum()

2

In [15]:
# Age and Embarked need different fill rules, so each column gets its own imputer:
# Age is filled with the column mean, Embarked with its most frequent value.
si_age = SimpleImputer(strategy = 'mean')
si_embarked = SimpleImputer(strategy = 'most_frequent')

# Fit on the training split only, then reuse the learned fill value on the test split.
xtrainAge = si_age.fit_transform(xtrain[['Age']])
xtestAge = si_age.transform(xtest[['Age']])

xtrainEmbarked = si_embarked.fit_transform(xtrain[['Embarked']])
xtestEmbarked = si_embarked.transform(xtest[['Embarked']])

In [16]:
xtrainEmbarked.shape, xtestEmbarked.shape

((712, 1), (179, 1))

In [17]:
xtrainAge.shape

(712, 1)

**applying OHE on sex and embarked**

In [18]:
"""
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output=False)

# if we would have used original embarked column with missing values , we would have to make separate objects for OHE
xtrainSex = ohe.fit_transform(xtrain[['Sex']])
xtrainEmbarked = ohe.fit_transform(xtrainEmbarked)

xtestSex = ohe.transform(xtest[['Sex']])
xtestEmbarked = ohe.transform(xtestEmbarked)
"""

"\nohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output=False)\n\n# if we would have used original embarked column with missing values , we would have to make separate objects for OHE\nxtrainSex = ohe.fit_transform(xtrain[['Sex']])\nxtrainEmbarked = ohe.fit_transform(xtrainEmbarked)\n\nxtestSex = ohe.transform(xtest[['Sex']])\nxtestEmbarked = ohe.transform(xtestEmbarked)\n"

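<a href="https://chatgpt.com/share/bc28e495-6645-4317-9b4f-9c82cf07285b">Here is the reason the above cell does not encode the Sex column correctly.</a> In short: the single `ohe` object is re-fitted on Embarked before the test Sex column is transformed, so the test Sex column ends up being encoded against Embarked's categories.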

In [19]:
"""
this is correct code
neeche vala cell bhi correct version h iska
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output=False)

# if we would have used original embarked column with missing values , we would have to make separate objects for OHE
xtrainSex = ohe.fit_transform(xtrain[['Sex']])
xtestSex = ohe.transform(xtest[['Sex']])

xtrainEmbarked = ohe.fit_transform(xtrainEmbarked)
xtestEmbarked = ohe.transform(xtestEmbarked)

"""

"\nthis is correct code\nneeche vala cell bhi correct version h iska\nohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output=False)\n\n# if we would have used original embarked column with missing values , we would have to make separate objects for OHE\nxtrainSex = ohe.fit_transform(xtrain[['Sex']])\nxtestSex = ohe.transform(xtest[['Sex']])\n\nxtrainEmbarked = ohe.fit_transform(xtrainEmbarked)\nxtestEmbarked = ohe.transform(xtestEmbarked)\n\n"

In [21]:
def _make_dense_ohe():
    """Return a dense-output OneHotEncoder that ignores categories unseen during fit.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name. The new keyword is tried first; releases that
    predate it raise TypeError for the unknown argument, in which case the
    legacy keyword is used instead.
    """
    try:
        return OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
    except TypeError:
        return OneHotEncoder(handle_unknown = 'ignore', sparse = False)


# One encoder per column: each keeps its own fitted categories, which is what
# allows both to be pickled and reused independently later.
oheSex = _make_dense_ohe()
oheEmbarked = _make_dense_ohe()

xtrainSex = oheSex.fit_transform(xtrain[['Sex']])
xtestSex = oheSex.transform(xtest[['Sex']])

# Re-derive the imputed Embarked values from the already-fitted imputer instead
# of reading xtrainEmbarked / xtestEmbarked: those names are overwritten below
# with the encoded arrays, so reading them would make a second run of this cell
# fit the encoder on already-encoded data.
xtrainEmbarked = oheEmbarked.fit_transform(si_embarked.transform(xtrain[['Embarked']]))
xtestEmbarked = oheEmbarked.transform(si_embarked.transform(xtest[['Embarked']]))

In [22]:
xtrain['Sex'].value_counts()

male      467
female    245
Name: Sex, dtype: int64

In [23]:
xtest['Sex'].value_counts()

male      110
female     69
Name: Sex, dtype: int64

**combining the preprocessed data**

In [24]:
xtrainEmbarked

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [25]:
dftrimmed.loc[:,['Sex','Age','Embarked']]

Unnamed: 0,Sex,Age,Embarked
0,male,22.0,S
1,female,38.0,C
2,female,26.0,S
3,female,35.0,S
4,male,35.0,S
...,...,...,...
886,male,27.0,S
887,female,19.0,S
888,female,,S
889,male,26.0,C


In [26]:
# Columns of the full (unsplit) frame that need no imputation or encoding.
# NOTE(review): not referenced by any later cell visible in this notebook.
dfremaining = dftrimmed.drop(columns = ['Sex', 'Age', 'Embarked'])

In [27]:
ytrain

331    0
733    0
382    0
704    0
813    0
      ..
106    1
270    0
860    0
435    1
102    0
Name: Survived, Length: 712, dtype: int64

In [28]:
# Training columns that pass through untouched; Sex, Age and Embarked are
# replaced by their separately preprocessed arrays.
xtrainrem = xtrain.drop(columns = ['Sex', 'Age', 'Embarked'])

In [29]:
xtrainEmbarked.shape

(712, 3)

In [30]:
xtrainEmbarked


array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [31]:
xtrainAge.shape

(712, 1)

In [32]:
xtrainSex.shape

(712, 2)

In [33]:
xtest.shape

(179, 7)

In [34]:
xtest['Sex'].nunique()

2

In [35]:
xtestAge.shape, xtestEmbarked.shape, xtestSex.shape

((179, 1), (179, 3), (179, 2))

In [36]:
# Test-split counterpart of xtrainrem: the same three preprocessed columns are removed.
xtestrem = xtest.drop(columns = ['Age', 'Embarked', 'Sex'])

In [37]:
# Stitch the pass-through columns and the preprocessed pieces back together
# column-wise. The piece order (remaining, Age, Sex, Embarked) is the same for
# both splits so every column position means the same feature in train and test.
xtraintransformed = np.hstack([xtrainrem, xtrainAge, xtrainSex, xtrainEmbarked])
xtesttransformed = np.hstack([xtestrem, xtestAge, xtestSex, xtestEmbarked])

In [38]:
xtraintransformed.shape, xtesttransformed.shape

((712, 10), (179, 10))

In [39]:
"""
dfEmbarked = pd.DataFrame(xtrainEmbarked, columns = ['Embarked_S', 'Embarked_C', 'Embarked_Q'])
dfSex = pd.DataFrame(xtrainSex, columns = ['Sex_male','Sex_female'])
dfAge = pd.DataFrame(xtrainAge, columns = ['Age'])
pd.concat((xtrainrem, dfAge, dfEmbarked , dfSex), axis = 1)
"""

"\ndfEmbarked = pd.DataFrame(xtrainEmbarked, columns = ['Embarked_S', 'Embarked_C', 'Embarked_Q'])\ndfSex = pd.DataFrame(xtrainSex, columns = ['Sex_male','Sex_female'])\ndfAge = pd.DataFrame(xtrainAge, columns = ['Age'])\npd.concat((xtrainrem, dfAge, dfEmbarked , dfSex), axis = 1)\n"

**now fitting model on the preprocessed data**

In [40]:
ytrain

331    0
733    0
382    0
704    0
813    0
      ..
106    1
270    0
860    0
435    1
102    0
Name: Survived, Length: 712, dtype: int64

In [41]:
# Seed the tree so that Restart & Run All reproduces the same model and score;
# an unseeded DecisionTreeClassifier can build a different tree on each run.
# The seed matches the one used for train_test_split. Predictions and accuracy
# may differ slightly from a run made without a seed.
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(xtraintransformed, ytrain)
pred = clf.predict(xtesttransformed)

In [42]:
pred

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1], dtype=int64)

**evaluating the model accuracy**

In [43]:
accuracy_score(ytest, pred)

0.7877094972067039

In [44]:
# Persist the fitted model and both encoders. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes.
# NOTE(review): the fitted imputers (si_age, si_embarked) are not saved here;
# confirm whether inference code is expected to handle missing Age/Embarked.
for artifact, filename in (
    (clf, 'model.pkl'),
    (oheSex, 'oheSex.pkl'),
    (oheEmbarked, 'oheEmbarked.pkl'),
):
    with open(filename, 'wb') as fh:
        pickle.dump(artifact, fh)

<a href="https://github.com/campusx-official/100-days-of-machine-learning/tree/main/day29-sklearn-pipelines">CampusX reference code on GitHub (day 29: sklearn pipelines)</a>