# Attempting Prediction with Random Forests and Extra-Trees Ensemble

## Getting data ready

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# The classifiers
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Metrics and Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Custom Functions
from util.author import results2csv

In [None]:
# Load Processed Data
train = pd.read_csv('data/train_processed_1.csv', index_col='PassengerId')
test = pd.read_csv('data/test_processed_1.csv', index_col='PassengerId')

# Feature Engineering (train): one-hot encode the categorical columns,
# dropping the first level of each one.
train = pd.get_dummies(data=train, drop_first=True)

# Split Datasets
train_y = train.pop('Survived')
train_x = train

# Feature Engineering (test): encode with every level kept, then align to the
# training columns. Encoding the two frames independently with drop_first=True
# only yields matching columns when both files contain exactly the same
# category levels; reindexing guarantees the same columns in the same order.
# Dummy columns the test set lacks are filled with 0, and columns that the
# training set does not have are discarded.
test = pd.get_dummies(data=test).reindex(columns=train_x.columns, fill_value=0)
test_x = test # Nothing to split! Test-set has no target columns.

## RandomForests Ensemble
* Built on top of Decision Trees.

In [None]:
# Fit a 500-tree random forest on the complete training set.
rfclf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)
rfclf.fit(train_x, train_y)

# Measuring accuracy with shuffle-split cross-validation:
# 3 random splits, each holding out 30% of the rows (this is not K-fold).
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(
    rfclf,
    train_x,
    train_y,
    scoring='accuracy',
    cv=cv,
)
print('CV Scores :', cv_scores)
# CV Scores : [0.78358209 0.79104478 0.80970149] # Default params
# CV Scores : [0.79477612 0.79850746 0.79850746] # n_estimators=50
# CV Scores : [0.78358209 0.79477612 0.79850746] # n_estimators=100
# CV Scores : [0.78731343 0.79850746 0.80597015] # n_estimators=500

# Predict on the test set and write the result to a CSV file for submission.
test_y_pred = rfclf.predict(test_x)
fname = "data/predictions/random_forest.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data the model was fitted on
# (shows how well the forest fits the training set, not held-out accuracy).
confusion_matrix(train_y, rfclf.predict(train_x))

## Extra-Trees (Extremely Randomized Trees) Ensemble
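* Extra-Trees draw a random split threshold for each candidate feature instead of searching for the best one. This is how they differ from Random Forests (and ordinary Decision Trees), which look for the optimal threshold.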

In [None]:
# Fit an Extra-Trees ensemble (with bootstrap sampling enabled) on the
# complete training set.
xtclf = ExtraTreesClassifier(
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
xtclf.fit(train_x, train_y)

# Measuring accuracy with shuffle-split cross-validation:
# 3 random splits, each holding out 30% of the rows (this is not K-fold).
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(
    xtclf,
    train_x,
    train_y,
    scoring='accuracy',
    cv=cv,
)
print('CV Scores :', cv_scores)
# CV Scores : [0.75       0.78358209 0.79104478] # Default params
# CV Scores : [0.7761194  0.79104478 0.77985075] # When bootstrap=True


# Predict on the test set and write the result to a CSV file for submission.
test_y_pred = xtclf.predict(test_x)
fname = "data/predictions/xtra_trees.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data the model was fitted on
# (shows how well the ensemble fits the training set, not held-out accuracy).
confusion_matrix(train_y, xtclf.predict(train_x))