In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Read the raw training data and drop the columns that are not used downstream.
df = (
    pd.read_csv('train.csv')
      .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'])
)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S
887,1,1,female,19.0,0,0,S
888,0,3,female,,1,2,S
889,1,1,male,26.0,0,0,C


In [67]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),  # features: everything except the target
    df['Survived'],                 # target
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
331,1,male,45.5,0,0,S
733,2,male,23.0,0,0,S
382,3,male,32.0,0,0,S
704,3,male,26.0,1,0,S
813,3,female,6.0,4,2,S
...,...,...,...,...,...,...
106,3,female,21.0,0,0,S
270,1,male,,0,0,S
860,3,male,41.0,2,0,S
435,1,female,14.0,1,2,S


In [21]:
# Count missing values per training column to see which ones need imputing.
X_train.isna().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Embarked      2
dtype: int64

In [86]:
# Sanity check: selecting with a list of labels (double brackets) yields a
# 2-D DataFrame rather than a 1-D Series.
type(X_train[['Sex']])

pandas.core.frame.DataFrame

# Fill Missing Values

In [70]:
# Impute missing values: Age with the training-set mean, Embarked with the most
# frequent training-set category. Both imputers are fit on the training split
# only and then applied unchanged to the test split, so no test-set statistics
# enter the preprocessing.
si_age = SimpleImputer(strategy='mean')  # 'mean' is the default; spelled out for readability
si_embarked = SimpleImputer(strategy='most_frequent')

# Columns are selected by label instead of by position, so a change in column
# order cannot silently feed the wrong column to an imputer. The double
# brackets keep the selection 2-D, which the imputers require.
X_train_age = si_age.fit_transform(X_train[['Age']])
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])

X_test_age = si_age.transform(X_test[['Age']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

X_train_age.shape

(712, 1)

# One-Hot Encoding of Embarked and Sex

In [71]:
# One-hot encode the two categorical features.
# The encoders are fit on the training split only. The test split is encoded
# with transform(), so it reuses the categories (and output column order)
# learned from the training data and the fitted encoders are not overwritten.
# handle_unknown='ignore' encodes a category unseen during fit as all zeros.
ohe_sex = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
# NOTE: this replaces the imputed Embarked array with its encoded form, so the
# imputation cell must be re-run before this cell is executed a second time.
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)

# transform (not fit_transform): never refit preprocessing on the test split.
X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_embarked = ohe_embarked.transform(X_test_embarked)

X_train_embarked

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

# Extract the remaining columns and concatenate

In [72]:
# Peek at the first two training rows to recall the original column layout.
X_train.iloc[:2]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
331,1,male,45.5,0,0,S
733,2,male,23.0,0,0,S


In [73]:
# Keep the columns that need no imputation or encoding, for both splits.
X_train_rem, X_test_rem = (
    split.drop(columns=['Sex', 'Age', 'Embarked'])
    for split in (X_train, X_test)
)

In [93]:
# Assemble the final feature matrices by stacking the pieces side by side.
# Column order: remaining columns, imputed Age, one-hot Embarked, one-hot Sex.
# (The `X_test_transfromed` spelling is kept because the prediction cell uses that name.)
X_train_transformed = np.hstack([X_train_rem, X_train_age, X_train_embarked, X_train_sex])
X_test_transfromed = np.hstack([X_test_rem, X_test_age, X_test_embarked, X_test_sex])

# Show one assembled row as a spot check.
X_train_transformed[1]

array([ 2.,  0.,  0., 23.,  0.,  0.,  1.,  0.,  1.])

# Apply Algorithm

In [75]:
# Fit a decision tree on the transformed training features.
# random_state is fixed (same seed as the train/test split) because the tree
# builder shuffles candidate features at each split; without a seed, a fresh
# run can produce a different tree and a different accuracy.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed, y_train)

In [76]:
# Predict labels for the held-out test features.
y_pred = clf.predict(X=X_test_transfromed)
y_pred

array([0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0])

In [77]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.776536312849162

In [78]:
import pickle

In [83]:
# Persist every fitted preprocessing step and the trained model.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, instead of being left open until garbage
# collection.
artifacts = {
    'si_age.pkl': si_age,
    'si_embarked.pkl': si_embarked,
    'ohe_embarked.pkl': ohe_embarked,
    'ohe_sex.pkl': ohe_sex,
    'clf.pkl': clf,
}
for filename, fitted_obj in artifacts.items():
    with open(filename, 'wb') as fh:
        pickle.dump(fitted_obj, fh)

In [84]:
# Inspect the fill value the Age imputer learned during fit (one entry per
# fitted column; here a single column).
si_age.statistics_

array([29.49884615])