# Attempting Prediction with Random Forests, Extra-Trees, VotingClassifier Ensembles

## Getting data ready

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [None]:
# The classifiers
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Metrics and Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Custom Functions
from util.author import results2csv

In [None]:
# Load processed data
train = pd.read_csv('data/train_processed_1.csv', index_col='PassengerId')
test = pd.read_csv('data/test_processed_1.csv', index_col='PassengerId')

# Separate the target first so that only feature columns get encoded.
# The test set has no target column, so there is nothing to split off there.
train_y = train.pop('Survived')

# Feature engineering: one-hot encode train and test TOGETHER.
# Encoding the two frames separately lets them diverge whenever a category is
# present in only one of them: the column sets differ, and with drop_first=True
# a different "first" level may be dropped in each frame, so a column with the
# same name would not mean the same thing. Encoding the stacked frame once
# guarantees identical columns (and column order) for both parts.
n_train = len(train)
encoded = pd.get_dummies(data=pd.concat([train, test]), drop_first=True)

# Rows were stacked train-first, so a positional split recovers each part.
# .copy() keeps the two frames independent of `encoded`.
train = encoded.iloc[:n_train].copy()
test = encoded.iloc[n_train:].copy()

# Split datasets
train_x = train
test_x = test

## RandomForests Ensemble
* Built on top of Decision Trees.
* In general, the more trees in the forest, the more stable the prediction and thus the higher the accuracy.
* How does it work?
  - Grow more trees
  - Each tree is grown to maximal depth on a bootstrap sample of size n drawn from the training set. There is no pruning.
  - A number m << p is specified such that at each node,
      m variables are sampled at random, out of p.
      The best-split of these variables, is used to split the node into 2 sub-nodes.
  - Final classification is given by majority voting of the ensemble of trees in the forest.
  - There are only 2 "free parameters":
      1. number of trees, and
      2. number of variables in random subset at each node.
  - To classify a given input, the vote from each tree is collected and the class with the majority of votes is chosen by the RandomForest algorithm.
* Advantages
  - RandomForest can handle missing values and maintain accuracy for missing data.
  - Less prone to overfitting than a single decision tree, because the prediction is averaged over many trees
  - It can handle large dataset with higher dimensionality
* Disadvantages
  - Not as good for Regression as it is for Classification
  - You have very little control over what the model does. It's a black box for statistical modelling.
* Known application/use-cases for RF
  - Classifying medicines
  - Identify diseases
  - Anticipate stock-behaviour in stock market.
  - In ecommerce, used for product recommendations.
  - Image classification.
  - Microsoft has used this for body-part identification in Xbox Kinect.

In [None]:
# Baseline forest: 500 trees, all CPU cores, fixed seed so runs are repeatable.
rfclf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rfclf.fit(X=train_x, y=train_y)

# Estimate accuracy with shuffle-split cross-validation:
# 3 random splits, each holding out 30% of the training rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(
    estimator=rfclf,
    X=train_x,
    y=train_y,
    scoring='accuracy',
    cv=cv,
)
print('CV Scores :', cv_scores)
# Scores recorded from earlier runs:
#   default params   : [0.78358209 0.79104478 0.80970149]
#   n_estimators=50  : [0.79477612 0.79850746 0.79850746]
#   n_estimators=100 : [0.78358209 0.79477612 0.79850746]
#   n_estimators=500 : [0.78731343 0.79850746 0.80597015]

# Predict the test set and write the submission file.
test_y_pred = rfclf.predict(test_x)
fname = "data/predictions/random_forest.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data itself (shown as the cell output).
confusion_matrix(y_true=train_y, y_pred=rfclf.predict(train_x))

In [None]:
# Base forest for the search. oob_score=True makes every fitted forest keep an
# out-of-bag accuracy estimate, which is reported below for the best model.
rfclf = RandomForestClassifier(random_state=42, n_jobs=-1, oob_score=True)

# Hyper-parameter grid: 4 * 5 * 5 * 5 * 4 = 2000 combinations.
params = {
    'n_estimators': [10, 20, 50, 100],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    # Floats, i.e. fractions of the sample count; integer counts tried
    # earlier: [2, 3, 5, 7, 11, 13]
    'min_samples_split': [.2, .25, .3, .4, .5],
    'max_features': [2, 3, 4, 5, 6],
    'max_depth': [None, 7, 6, 5],
}

gsclf = GridSearchCV(rfclf,
                     n_jobs=-1,  # Use all cores of the machine
                     param_grid=params,
                     cv=3,
                     verbose=1,
                     scoring='accuracy')
gsclf.fit(train_x, train_y)

best_score = gsclf.best_score_
print('Best Score : ', best_score)
# The OOB score belongs to the refitted forest, not to the GridSearchCV
# wrapper, so it has to be read through best_estimator_.
print('OOB Score : ', gsclf.best_estimator_.oob_score_)
# Results recorded from earlier runs:
#
# Best Score :  0.8282828282828283
#     max_depth            6
#     min_samples_leaf     3
#     min_samples_split    2
#     n_estimators         20
#
# Best Score :  0.8092031425364759
#     max_depth            5
#     max_features         2
#     min_samples_leaf     3
#     min_samples_split    0.2
#     n_estimators         20

# Print the winning value of every parameter that was part of the grid.
best_params = gsclf.best_estimator_.get_params()
for k in sorted(params.keys()):
    print('\t{0} \t {1}'.format(k, best_params[k]))

# NOTE(review): passing the GridSearchCV object itself means the complete grid
# search is re-run inside each of the 3 splits (nested cross-validation), which
# is expensive. The scores recorded below are labelled "With best_params"; if
# only the selected model should be scored, pass gsclf.best_estimator_ instead.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(gsclf, train_x, train_y, cv=cv, scoring='accuracy')
print('CV Scores :', cv_scores)
# CV Scores : [0.78358209 0.79104478 0.80970149] # Default params
# CV Scores : [0.82089552 0.80970149 0.82835821] # With best_params
# CV Scores : [0.80970149 0.81343284 0.81716418] # With best_params

# Make predictions for the test set.
test_y_pred = gsclf.predict(test_x)

# Persist data to CSV file for submission
fname = "data/predictions/random_forest_tuned.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data itself (shown as the cell output).
confusion_matrix(train_y, gsclf.predict(train_x))

## Extra-Trees (Extremely Randomized Trees) Ensemble
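* ExtraTrees use a randomly drawn split threshold for each candidate feature instead of searching for the best threshold. This is how they differ from regular Decision Trees (and from the Random Forests built out of them).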

In [None]:
# Extra-Trees ensemble with bootstrap sampling enabled, all cores, fixed seed.
xtclf = ExtraTreesClassifier(bootstrap=True, n_jobs=-1, random_state=42)
xtclf.fit(X=train_x, y=train_y)

# Estimate accuracy with shuffle-split cross-validation:
# 3 random splits, each holding out 30% of the training rows.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
cv_scores = cross_val_score(
    estimator=xtclf,
    X=train_x,
    y=train_y,
    scoring='accuracy',
    cv=cv,
)
print('CV Scores :', cv_scores)
# Scores recorded from earlier runs:
#   default params : [0.75       0.78358209 0.79104478]
#   bootstrap=True : [0.7761194  0.79104478 0.77985075]

# Predict the test set and write the submission file.
test_y_pred = xtclf.predict(test_x)
fname = "data/predictions/xtra_trees.csv"
results2csv(test_x.index, test_y_pred, fname)

# Confusion matrix on the training data itself (shown as the cell output).
confusion_matrix(y_true=train_y, y_pred=xtclf.predict(train_x))

## VotingClassifier Ensemble
Ref.: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

Voting Types:
* Hard (default) : If ‘hard’, uses predicted class labels for majority rule voting.
* Soft : If ‘soft’, predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers.

In [None]:
# Base estimators for the voting ensembles. All estimator classes used here
# are imported in the notebook's top import cell.
lrclf = LogisticRegression(random_state=42, max_iter=300, C=0.3, solver='sag', n_jobs=3)
sgdclf = SGDClassifier(random_state=42, max_iter=1000, alpha=0.7)
# probability=True so the SVC can take part in soft (probability-based) voting.
svmclf = svm.SVC(kernel='rbf', gamma=0.7, C=0.75, random_state=42, probability=True)
dtclf = tree.DecisionTreeClassifier(max_leaf_nodes=47, min_samples_split=4, random_state=42)
# Forest configured with the best values recorded from the earlier grid search.
rfclf = RandomForestClassifier(random_state=42, n_jobs=-1, oob_score=True,
                               max_depth=6, min_samples_leaf=3,
                               min_samples_split=2, n_estimators=20)

# Hard voting: majority rule over the predicted class labels of all five models.
vc1 = VotingClassifier(estimators=[('lrclf', lrclf),
                                   ('sgdclf', sgdclf),
                                   ('svmclf', svmclf),
                                   ('dtclf', dtclf),
                                   ('rfclf', rfclf)],
                       voting='hard')
vc1.fit(train_x, train_y)

# Soft voting: argmax of the summed predicted probabilities.
# sgdclf is deliberately left out of this ensemble — presumably because, with
# its default loss, it does not provide class probabilities (TODO confirm).
vc2 = VotingClassifier(estimators=[('lrclf', lrclf),
                                   ('svmclf', svmclf),
                                   ('dtclf', dtclf),
                                   ('rfclf', rfclf)],
                       voting='soft')
vc2.fit(train_x, train_y)

test_y_pred_1 = vc1.predict(test_x)
test_y_pred_2 = vc2.predict(test_x)

# Confusion matrices on the training data itself.
print(confusion_matrix(train_y, vc1.predict(train_x)))
# Recorded output (hard voting):
# [[539  10]
#  [110 232]]
print(confusion_matrix(train_y, vc2.predict(train_x)))
# Recorded output (soft voting):
# [[529  20]
#  [ 67 275]]

# Persist data to CSV files for submission
results2csv(test_x.index, test_y_pred_1, "data/predictions/voting_classifier_hard.csv")
results2csv(test_x.index, test_y_pred_2, "data/predictions/voting_classifier_soft.csv")