<h1 style="text-align:center;margin:50px 0">Decision Trees</h1>

## 1. Simple Decision Tree model

In [12]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [13]:
df = pd.read_csv('../datasets/titanic.csv')

In [14]:
df['male'] = df['Sex'] == 'male'

In [15]:
X = df.drop(columns=["Survived", "Sex"], axis=1).values
y = df["Survived"].values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

In [17]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [22]:
score = model.score(X_test, y_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Score:", score)
print("Precision:", precision)
print("Recall:", recall)

Score: 0.7792792792792793
Precision: 0.7303370786516854
Recall: 0.7222222222222222


In [18]:
print(model.predict([[3, True, 22, 1, 0, 7.25]]))

[0]


## 2. Comparing DT with LR using K-fold Cross Validation

In [23]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score

In [24]:
df = pd.read_csv('../datasets/titanic.csv')

In [25]:
df['male'] = df['Sex'] == 'male'

In [26]:
X = df.drop(columns=["Survived", "Sex"], axis=1).values
y = df["Survived"].values

In [27]:
kf = KFold(n_splits=5, shuffle=True, random_state=10)

In [28]:
dt_accuracy_scores = []
dt_precision_scores = []
dt_recall_scores = []
lr_accuracy_scores = []
lr_precision_scores = []
lr_recall_scores = []

In [29]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    dt_accuracy_scores.append(dt.score(X_test, y_test))
    dt_y_pred = dt.predict(X_test)
    dt_precision_scores.append(precision_score(y_test, dt_y_pred))
    dt_recall_scores.append(recall_score(y_test, dt_y_pred))
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_accuracy_scores.append(lr.score(X_test, y_test))
    lr_y_pred = lr.predict(X_test)
    lr_precision_scores.append(precision_score(y_test, lr_y_pred))
    lr_recall_scores.append(recall_score(y_test, lr_y_pred))

In [30]:
print("Decision Tree")
print("  accuracy:", np.mean(dt_accuracy_scores))
print("  precision:", np.mean(dt_precision_scores))
print("  recall:", np.mean(dt_recall_scores))
print("Logistic Regression")
print("  accuracy:", np.mean(lr_accuracy_scores))
print("  precision:", np.mean(lr_precision_scores))
print("  recall:", np.mean(lr_recall_scores))

Decision Tree
  accuracy: 0.7812607122452866
  precision: 0.7189432422009148
  recall: 0.7119206773618538
Logistic Regression
  accuracy: 0.7970354853043865
  precision: 0.7618898922983288
  recall: 0.6900529617441382


## 3. Gini vs Entropy

In [31]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [32]:
df = pd.read_csv('../datasets/titanic.csv')

In [33]:
df['male'] = df['Sex'] == 'male'

In [34]:
X = df.drop(columns=["Survived", "Sex"], axis=1).values
y = df["Survived"].values

In [36]:
kf = KFold(n_splits=5, shuffle=True)

In [37]:
for criterion in ['gini', 'entropy']:
    print("Decision Tree - {}".format(criterion))
    accuracy = []
    precision = []
    recall = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        dt = DecisionTreeClassifier(criterion=criterion)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
    print("accuracy:", np.mean(accuracy))
    print("precision:", np.mean(precision))
    print("recall:", np.mean(recall), '\n')
    print()

Decision Tree - gini
accuracy: 0.7710912207198629
precision: 0.7106299748638458
recall: 0.6849557913351016 


Decision Tree - entropy
accuracy: 0.7823779597536977
precision: 0.7146568671792553
recall: 0.7307212308025092 




## 4. Visualizing Decision Trees

In [38]:
from sklearn.tree import export_graphviz
import graphviz
from IPython.display import Image

In [39]:
feature_names = ['Pclass', 'male']

In [40]:
X = df[feature_names].values
y = df['Survived'].values

In [41]:
dt = DecisionTreeClassifier()
dt.fit(X, y)

In [42]:
dot_file = export_graphviz(dt, feature_names=feature_names)
graph = graphviz.Source(dot_file)
graph.render(filename='tree', format='png', cleanup=True)

'tree.png'

<div style="width:100%;text-align:center"><img src="tree.png" style="height:450px;"></div>

## 5. Pruning our Decision Tree

In [57]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [58]:
df = pd.read_csv('../datasets/titanic.csv')

In [59]:
df['male'] = df['Sex'] == 'male'

In [60]:
X = df.drop(columns=["Survived", "Sex"], axis=1).values
y = df["Survived"].values

In [61]:
param_grid = {
    'max_depth': [5, 15, 25],
    'min_samples_leaf': [1, 3],
    'max_leaf_nodes': [10, 20, 35, 50]
}

In [62]:
dt = DecisionTreeClassifier()

In [63]:
gs = GridSearchCV(dt, param_grid, scoring='f1', cv=5)
gs.fit(X, y)

In [64]:
print("best params:", gs.best_params_)

best params: {'max_depth': 15, 'max_leaf_nodes': 35, 'min_samples_leaf': 1}


In [68]:
res = gs.predict([[3, True, 22, 1, 0, 7.25]])[0]
print("Not Survived") if res == 0 else print("Survived")

Not Survived
