In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
%matplotlib inline
sns.set()

from subprocess import check_output

In [33]:
titanic_data = pd.read_csv("titanic.csv")
titanic_data.head()


Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [3]:
titanic_data.Embarked = titanic_data.Embarked.fillna(titanic_data['Embarked'].mode()[0])

In [4]:
median_age = titanic_data.Age.median()
titanic_data.Age.fillna(median_age, inplace = True)

In [5]:
titanic_data.drop('Cabin', axis = 1,inplace = True)

In [6]:
titanic_data['Fare'] = titanic_data['Fare'].replace(0,titanic_data['Fare'].median())

In [7]:
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1

In [8]:
titanic_data['GenderClass'] = titanic_data.apply(lambda x: 'child' if x['Age'] < 15 else x['Sex'],axis=1)

In [9]:
titanic_data[titanic_data.Age<15].head(2)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,FamilySize,GenderClass
7,8,"Palsson, Master. Gosta Leonard",3,male,2.0,3,1,349909,21.075,S,0,5,child
9,10,"Nasser, Mrs. Nicholas (Adele Achem)",2,female,14.0,1,0,237736,30.0708,C,1,2,child


In [10]:
titanic_data[titanic_data.Age>15].head(2)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,FamilySize,GenderClass
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,S,0,2,male
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C,1,2,female


In [11]:
titanic_data = pd.get_dummies(titanic_data, columns=['GenderClass','Embarked'], drop_first=True)

In [12]:
titanic_data.head(2)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Survived,FamilySize,GenderClass_female,GenderClass_male,Embarked_Q,Embarked_S
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,0,2,0,1,0,1
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,1,2,1,0,0,0


In [29]:
titanic = titanic_data.drop(['Name','Ticket','Sex','SibSp','Parch','PassengerId'], axis = 1)
titanic.head()

Unnamed: 0,Pclass,Age,Fare,Survived,FamilySize,GenderClass_female,GenderClass_male,Embarked_Q,Embarked_S
0,3,22.0,7.25,0,2,0,1,0,1
1,1,38.0,71.2833,1,2,1,0,0,0
2,3,26.0,7.925,1,1,1,0,0,1
3,1,35.0,53.1,1,2,1,0,0,1
4,3,35.0,8.05,0,1,0,1,0,1


In [30]:
X = titanic.loc[:,titanic.columns != 'Survived']
y = titanic.Survived 
print(X.head())
print(y.head())
X.shape

   Pclass   Age     Fare  FamilySize  GenderClass_female  GenderClass_male  \
0       3  22.0   7.2500           2                   0                 1   
1       1  38.0  71.2833           2                   1                 0   
2       3  26.0   7.9250           1                   1                 0   
3       1  35.0  53.1000           2                   1                 0   
4       3  35.0   8.0500           1                   0                 1   

   Embarked_Q  Embarked_S  
0           0           1  
1           0           0  
2           0           1  
3           0           1  
4           0           1  
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


(891, 8)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [16]:
scaler = StandardScaler()

scaler.fit(X)

X = scaler.transform(X)
X[0]

array([ 0.82737724, -0.56573646, -0.50860555,  0.05915988, -0.6681531 ,
        0.81002088, -0.30756234,  0.61583843])

In [17]:
a = []

for i in range(1,20):
    model = tree.DecisionTreeClassifier(random_state = 0, max_depth=i)
    model.fit(X, y)
    y_pred_train = model.predict(X)
    a.append(accuracy_score(y,y_pred_train))

high = a.index(max(a))+1
high

19

In [18]:
model = tree.DecisionTreeClassifier(random_state = 0, max_depth=high)
model.fit(X, y)

DecisionTreeClassifier(max_depth=19, random_state=0)

In [19]:
y_pred_train = model.predict(X)

In [20]:
print('Accuracy score for test data is:', accuracy_score(y,y_pred_train))

Accuracy score for test data is: 0.9775533108866442


In [21]:
from joblib import dump, load

dump(model, 'decision.joblib')

['decision.joblib']

In [22]:
loaded_model = load('decision.joblib')

In [23]:
# 0	 1	3	22.0	7.2500	0	2	0	1	0	1
# array([-1.73010796,  0.82737724, -0.56573646, -0.50860555,  0.05915988,
#        -0.6681531 ,  0.81002088, -0.30756234,  0.61583843])

# 1	 2	1	38.0	71.2833	1	2	1	0	0	0
# 2	 3	3	26.0	7.9250	1	1	1	0	0	1

# 5	3	35.0	8.0500	0	1	0	1	0	1

In [24]:
# x_real = np.array([1,3,22.0,7.2500,2,0,1,0,1]).reshape(1,-1)
# x_real = np.array([2,1,38.0,71.2833,2,1,0,0,0]).reshape(1,-1)
# x_real = np.array([3,3,26.0,7.9250,1,1,0,0,1]).reshape(1,-1)
x_real = np.array([3,35.0,8.0500,1,0,1,0,1]).reshape(1,-1)

In [25]:
x_real

array([[ 3.  , 35.  ,  8.05,  1.  ,  0.  ,  1.  ,  0.  ,  1.  ]])

In [26]:
x_real_process = scaler.transform(x_real)
x_real_process

array([[ 0.82737724,  0.4333115 , -0.49245777, -0.56097483, -0.6681531 ,
         0.81002088, -0.30756234,  0.61583843]])

In [27]:
new_model = loaded_model.predict(x_real_process)
new_model

array([0])