## Decision Tree Model Training on Titanic Dataset

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("/content/titanic.csv")

In [3]:
data['Male'] = data["Sex"] == "male" 
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses,Parents/Children,Fare,Male
0,0,3,male,22.0,1,0,7.25,True
1,1,1,female,38.0,1,0,71.2833,False
2,1,3,female,26.0,0,0,7.925,False
3,1,1,female,35.0,1,0,53.1,False
4,0,3,male,35.0,0,0,8.05,True


In [4]:
X = data[['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
print(X)

[[3 True 22.0 1 0 7.25]
 [1 False 38.0 1 0 71.2833]
 [3 False 26.0 0 0 7.925]
 ...
 [3 False 7.0 1 2 23.45]
 [1 True 26.0 0 0 30.0]
 [3 True 32.0 0 0 7.75]]


In [5]:
y = data['Survived'].values
print(y)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0
 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0 0
 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0 1
 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1 0
 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0
 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1
 1 0 1 0 0 1 0 0 0 0 0 0 

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [7]:
model = DecisionTreeClassifier()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

In [9]:
X_train.shape

(665, 6)

In [10]:
y_train.shape

(665,)

In [11]:
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [12]:
y_pred = model.predict(X_test)

In [13]:
print("accuracy", accuracy_score(y_test,y_pred)) #actual (the first argument), predicted (the second argument)
print("precision", precision_score(y_test,y_pred))
print("recall", recall_score(y_test,y_pred))
print("f1 score", f1_score(y_test,y_pred))

accuracy 0.7882882882882883
precision 0.7471264367816092
recall 0.7222222222222222
f1 score 0.7344632768361581


In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
param_grid = {
    'max_depth': [5, 15, 25],
    'min_samples_leaf': [1, 3],
    'max_leaf_nodes': [10, 20, 35, 50]}

In [16]:
gs = GridSearchCV(model, param_grid, scoring='f1', cv=5)

In [17]:
gs.fit(X, y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [5, 15, 25],
                         'max_leaf_nodes': [10, 20, 35, 50],
                         'min_samples_leaf': [1, 3]},
             scoring='f1')

In [18]:
print("best parameters: ", gs.best_params_) #use the best_params_ attribute to see which model won.

best parameters:  {'max_depth': 25, 'max_leaf_nodes': 35, 'min_samples_leaf': 1}


In [19]:
print("best score", gs.best_score_)

best score 0.7753307769248867


## Pre-Prunning our decision tree

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [21]:
dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2, max_leaf_nodes=10)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

In [23]:
X_train.shape

(665, 6)

In [24]:
y_train.shape

(665,)

In [25]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=10, min_samples_leaf=2)

In [26]:
y_pred = model.predict(X_test)

In [27]:
print("accuracy", accuracy_score(y_test,y_pred)) #actual (the first argument), predicted (the second argument)
print("precision", precision_score(y_test,y_pred))
print("recall", recall_score(y_test,y_pred))
print("f1 score", f1_score(y_test,y_pred))

accuracy 0.7882882882882883
precision 0.7471264367816092
recall 0.7222222222222222
f1 score 0.7344632768361581


## Applying Logistic Regression on Titanic Dataset

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)

LogisticRegression()

In [None]:
y_pred = model.predict(X_test)

In [None]:
print("accuracy", accuracy_score(y_test,y_pred)) #actual (the first argument), predicted (the second argument)
print("precision", precision_score(y_test,y_pred))
print("recall", recall_score(y_test,y_pred))
print("f1 score", f1_score(y_test,y_pred))

accuracy 0.7522522522522522
precision 0.7058823529411765
recall 0.6666666666666666
f1 score 0.6857142857142857


## Making a png image of decision tree

In [None]:
from sklearn.tree import export_graphviz
import graphviz
from IPython.display import Image

feature_names = ['Pclass', 'Male']
X = data[feature_names].values
y = data['Survived'].values

dt = DecisionTreeClassifier()
dt.fit(X, y)

dot_file = export_graphviz(dt, feature_names=feature_names)
graph = graphviz.Source(dot_file)
graph.render(filename='tree', format='png', cleanup=True)

'tree.png'