In [141]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


In [142]:
# Load the passenger data from train.csv (path is relative to the working directory).
df = pd.read_csv('train.csv')

In [143]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [144]:
# Drop the columns that are not used as model inputs.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: with inplace=True a second run raised KeyError
# because the columns were already gone.
df = df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Embarked'],
    errors='ignore',
)

In [145]:
# Preview again to confirm which columns remain after the drop.
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [146]:
# Step 1: split the data into train and test sets.
# 'Survived' is the target; every other remaining column is a feature.
features = df.drop(columns=['Survived'])
target = df['Survived']

# 20% of the rows are held out for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [147]:
# Peek at the first two training rows (note the non-sequential index labels after the split).
X_train.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
331,1,45.5,0,0,28.5
733,2,23.0,0,0,13.0


In [148]:
# Peek at the first two test rows; the first one has a missing Age in the output below.
X_test.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
709,3,,1,1,15.2458
439,2,31.0,0,0,10.5


In [149]:
# Training labels; their index labels line up with the rows of X_train.
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64

In [150]:
# Count missing values per column; in the output below only Age has any.
df.isnull().sum()

Survived      0
Pclass        0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [151]:
# Impute missing values.
# The Age imputer is fitted on the training split only and then reused on the
# test split, so the test rows are filled with the statistic learned from the
# training rows.
age_column = ['Age']
si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')  # not applied: the Embarked lines below are commented out

si_age.fit(X_train[age_column])
X_train_age = si_age.transform(X_train[age_column])
X_test_age = si_age.transform(X_test[age_column])

# X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])
# X_test_embarked = si_embarked.transform(X_test[['Embarked']])



In [152]:
# One-hot encoding for the 'Sex' and 'Embarked' columns.
# NOTE: this code is disabled by wrapping it in a string literal, so the cell
# only evaluates to that string. 'Sex' and 'Embarked' are dropped from df
# earlier in the notebook, so the code would fail if it were simply re-enabled.
# NOTE(review): X_test_sex below is computed from X_train[['Sex']] rather than
# X_test[['Sex']] -- correct this before re-enabling the block.
''' 

ohe_sex =  OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_embarked =  OneHotEncoder(sparse_output=False, handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)

X_test_sex = ohe_sex.transform(X_train[['Sex']])
X_test_embarked = ohe_embarked.transform(X_test_embarked)

'''

" \n\nohe_sex =  OneHotEncoder(sparse_output=False, handle_unknown='ignore')\nohe_embarked =  OneHotEncoder(sparse_output=False, handle_unknown='ignore')\n\nX_train_sex = ohe_sex.fit_transform(X_train[['Sex']])\nX_train_embarked = ohe_embarked.fit_transform(X_train_embarked)\n\nX_test_sex = ohe_sex.transform(X_train[['Sex']])\nX_test_embarked = ohe_embarked.transform(X_test_embarked)\n\n"

In [154]:
# X_train itself is not modified by the imputation step; the imputed Age values live in X_train_age.
X_train.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
331,1,45.5,0,0,28.5
733,2,23.0,0,0,13.0


In [156]:
# Training features without 'Age'; the imputed Age column is re-attached later.
X_train_rem = X_train.drop(columns='Age')

In [157]:
# Test features without 'Age'; the imputed Age column is re-attached later.
X_test_rem = X_test.drop(columns='Age')


In [158]:
# Assemble the training matrix column-wise: the untouched features first, then
# the imputed Age column last -> [Pclass, SibSp, Parch, Fare, Age].
train_parts = (X_train_rem, X_train_age)
X_train_transformed = np.concatenate(train_parts, axis=1)


In [159]:
# Assemble the test matrix in the same column order as the training matrix:
# the untouched features first, then the imputed Age column last.
test_parts = (X_test_rem, X_test_age)
X_test_transformed = np.concatenate(test_parts, axis=1)

In [160]:
# Show the shapes of the two test-side pieces that are concatenated column-wise;
# their row counts must match.
test_part_shapes = (X_test_rem.shape, X_test_age.shape)
print(*test_part_shapes)

(179, 4) (179, 1)


In [161]:
# Fit a decision tree on the preprocessed training matrix.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the predictions and accuracy reported further down, are
# reproducible from run to run; without it the tree can differ between runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed, y_train)

In [177]:
# Inspect the preprocessed test matrix (columns: Pclass, SibSp, Parch, Fare, Age).
X_test_transformed

array([[  3.        ,   1.        ,   1.        ,  15.2458    ,
         29.49884615],
       [  2.        ,   0.        ,   0.        ,  10.5       ,
         31.        ],
       [  3.        ,   0.        ,   0.        ,   7.925     ,
         20.        ],
       [  2.        ,   0.        ,   1.        ,  33.        ,
          6.        ],
       [  3.        ,   1.        ,   0.        ,  11.2417    ,
         14.        ],
       [  1.        ,   0.        ,   0.        ,  78.85      ,
         26.        ],
       [  3.        ,   0.        ,   0.        ,   7.75      ,
         29.49884615],
       [  3.        ,   2.        ,   0.        ,  18.        ,
         16.        ],
       [  3.        ,   0.        ,   0.        ,   7.75      ,
         16.        ],
       [  1.        ,   0.        ,   2.        ,  26.2833    ,
         19.        ],
       [  1.        ,   1.        ,   0.        ,  53.1       ,
         37.        ],
       [  3.        ,   0.        ,   0.   

In [162]:
# Inspect the preprocessed training matrix (columns: Pclass, SibSp, Parch, Fare, Age).
X_train_transformed

array([[  1.    ,   0.    ,   0.    ,  28.5   ,  45.5   ],
       [  2.    ,   0.    ,   0.    ,  13.    ,  23.    ],
       [  3.    ,   0.    ,   0.    ,   7.925 ,  32.    ],
       ...,
       [  3.    ,   2.    ,   0.    ,  14.1083,  41.    ],
       [  1.    ,   1.    ,   2.    , 120.    ,  14.    ],
       [  1.    ,   0.    ,   1.    ,  77.2875,  21.    ]])

In [163]:
# Predict the 'Survived' label for every row of the preprocessed test matrix.
y_pred = clf.predict(X_test_transformed)


In [176]:
# Show the predicted labels, one per test row.
y_pred

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1], dtype=int64)

In [175]:
# Fraction of test rows whose predicted label matches the true label.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

0.6536312849162011

In [166]:
import pickle


In [168]:
# Persist the trained classifier so it can be reloaded for predictions.
# The 'models' directory must already exist.
# NOTE(review): only the classifier is saved; the fitted Age imputer (si_age)
# is not, so inputs at prediction time must already have Age filled in.
# pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb'))
# pickle.dump(ohe_embarked,open('models/ohe_embarked.pkl','wb'))
# The context manager closes (and flushes) the file even if dump() raises;
# the previous bare open() call never closed its handle.
with open('models/clf.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

## Using the pickle file created above (models/clf.pkl) for predictions


In [169]:
import pickle
import numpy as np

In [170]:
# Reload the trained classifier from disk.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load files that were produced by this notebook or another trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open('models/clf.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)

In [189]:
# One hand-written sample in the column order the model was trained on:
# [Pclass, SibSp, Parch, Fare, Age] (the imputed Age column is appended last).
# The values are purely numeric, so a float array is built directly as a
# (1, 5) matrix; dtype=object would hold boxed Python ints instead of a
# numeric buffer and gives no benefit here.
test_input = np.array([[2, 0, 1, 13, 23]], dtype=float)

In [190]:
# Confirm the sample is a single row with five features.
test_input.shape

(1, 5)

In [191]:
# Predict with the reloaded classifier; the result is an array with one label per input row.
predictedValue = clf.predict(test_input)

In [192]:
# Display the predicted label for the sample.
predictedValue

array([1], dtype=int64)