## Did timings of both fitting and prediction, as well as cross-validation; orders of magnitude of difference between a single tree and a forest.
## Decent improvements in accuracy, reduction of types I and II errors too.  Not huge, but noticeable.

In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report,confusion_matrix
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 100)

In [2]:
# I'm using my previously cleaned and prepared Titanic set;
# going to drop a few features
df = pd.read_csv("titan_clean.csv")

df.drop(["Low_fare", "Med_fare", "75th_fare", "Child", "Elderly"], 1, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,C,Q,S,Class,Age,SibSp,ParCh,Fare,Female,Survived
0,0,0,1,3,22.0,1,0,7.25,0,0
1,1,0,0,1,38.0,1,0,71.2833,1,1
2,0,0,1,3,26.0,0,0,7.925,1,1
3,0,0,1,1,35.0,1,0,53.1,1,1
4,0,0,1,3,35.0,0,0,8.05,0,0


In [3]:
X = df.drop("Survived", 1)
Y = df["Survived"]

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30)

## Start of simple Decision Tree

In [5]:
dtree = DecisionTreeClassifier()
start_time = time.time()
dtree.fit(X_train,Y_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.002895832061767578 seconds ---


In [6]:
start_time = time.time()
predictions = dtree.predict(X_test)
print("--- %s seconds ---" % (time.time() - start_time))
print(classification_report(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))

--- 0.0012509822845458984 seconds ---
              precision    recall  f1-score   support

           0       0.76      0.71      0.73       117
           1       0.68      0.73      0.71        98

   micro avg       0.72      0.72      0.72       215
   macro avg       0.72      0.72      0.72       215
weighted avg       0.72      0.72      0.72       215

[[83 34]
 [26 72]]


In [7]:
start_time = time.time()
cross_val_score(dtree, X, Y, cv=10)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.049448251724243164 seconds ---


## Start of Random Forest

In [8]:
rfc = RandomForestClassifier(n_estimators=100)

start_time = time.time()
rfc.fit(X_train, Y_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.11816096305847168 seconds ---


In [9]:
start_time = time.time()
rfc_pred = rfc.predict(X_test)
print("--- %s seconds ---" % (time.time() - start_time))

print(confusion_matrix(Y_test, rfc_pred))
print(classification_report(Y_test, rfc_pred))

--- 0.013809919357299805 seconds ---
[[94 23]
 [25 73]]
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       117
           1       0.76      0.74      0.75        98

   micro avg       0.78      0.78      0.78       215
   macro avg       0.78      0.77      0.77       215
weighted avg       0.78      0.78      0.78       215



In [10]:
start_time = time.time()
cross_val_score(rfc, X, Y, cv=10)
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.3200898170471191 seconds ---
