# Ensemble Learning and Random Forests

## Daniel Wilcox: 19147414

This example problem can be found within chapter 7 of the "Hands-on Machine Learning with Scikit-Learn and TensorFlow" by Aurélien Géron. 

This project will be investigating the theory behind Ensemble Learning and Random Forests and how to implament them.

In [1]:
#General imports for operating system, unzip and URL's
import os
from six.moves import urllib
from scipy.io import loadmat
from sklearn.datasets import fetch_mldata

#Graphics
import matplotlib
import matplotlib.pyplot as plt

#Array Manipulation
import numpy as np

from sklearn.linear_model import SGDClassifier

#Shuffles data to test/train sets that represent the original data
from sklearn.model_selection import StratifiedKFold

#Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.multiclass import OneVsOneClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score


from sklearn.base import clone

#Creating custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

import random

#Model Tuning
from sklearn.model_selection import GridSearchCV

#Image shifting
from scipy.ndimage.interpolation import shift

In [2]:
#The Location to save the dataset
MNIST_PATH = "datasets/MNIST"
MNIST_URL = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
MNIST_MAT = "/mnist-original.mat"

In [3]:
def load_MNIST_data(mnist_path=MNIST_PATH, mnist_mat=MNIST_MAT):
        mnist_raw = loadmat(mnist_path+mnist_mat)
        mnist = {"data": mnist_raw["data"].T,
                 "target": mnist_raw["label"][0],
                 "Col_names": ["target", "data"],
                 "DESCR": "mldata.org dataset: mnist-original",
                }
        print("Data Successfully extracted from mnist.mat!")
        return mnist
        
    
def get_MNIST_data(mnist_path=MNIST_PATH, mnist_url=MNIST_URL, mnist_mat=MNIST_MAT):
    
    print("Checking if directory exists...")
    if not os.path.isdir(mnist_path):
        os.makedirs(mnist_path)
        print("Creating directory")
    
    else: 
        print("Directory exists")
        
        #------------------------------------------------------------------
        #uncomment if connected to internet
        #try:
            #print("\nAttempting to get MNIST data from mldata.org ...")
            #mnist = fetch_mldata('MNIST original')
            #print("\nSuccess!")
            #return mnist
    
        #except urllib.error.HTTPError as ex:
            #print("\nCan't reach mldata.org, attempting alternative...")
            #print("Checking if mnist.mat file exists...")  
            
        #------------------------------------------------------------------
        #followig if, else should fall under 'except' 
            
        if os.path.isfile(mnist_path+mnist_mat):
            print("mnist.mat file does exists...")
            print("extracting data from mnist.mat...")
            
            mnist = load_MNIST_data(mnist_path, mnist_mat)
            print("\nSuccess!")
            return mnist
        
        else:
            print("mnist.mat file doesn't exists...")
            print("downloading mnist.mat file...")
            url_response = urllib.request.urlopen(mnist_url)
            
            print("\nCreating .mat file")
            with open(mnist_path+mnist_mat, "wb") as f:
                contents = url_response.read()
                f.write(contents)
            mnist = load_MNIST_data(mnist_path, mnist_mat)
            print("\nSuccess!")
            return mnist
            

# Exercises:
8. Exercise: Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing).

In [4]:
mnist = get_MNIST_data(MNIST_PATH, MNIST_URL, MNIST_MAT)
mnist            


Checking if directory exists...
Directory exists
mnist.mat file does exists...
extracting data from mnist.mat...
Data Successfully extracted from mnist.mat!

Success!


{'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([0., 0., 0., ..., 9., 9., 9.]),
 'Col_names': ['target', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original'}

In [5]:
X, y = mnist["data"], mnist["target"]
print("Shape of \"Data\": {}\nShape of \"target\": {}\n".format(X.shape,y.shape))

Shape of "Data": (70000, 784)
Shape of "target": (70000,)



In [6]:
#MNIST is already split into train and test
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]


In [7]:
#Shuffle training set to guarentee cross-validation folds are similar.
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [8]:

from sklearn.model_selection import train_test_split

val_size = 10000

#Create validation
X_training, X_validation, y_training, y_validation = train_test_split(
    X_train, y_train, test_size=val_size, random_state=42)


Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM.

In [9]:
#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score

model_option = [
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    LinearSVC()
]


for clf in model_option:
    clf_name = clf.__class__.__name__
    
    clf.fit(X_training, y_training)
    y_pred = clf.predict(X_validation)
    acc = accuracy_score(y_validation, y_pred)
    
    print("{}: {}%".format(clf_name, acc*100))
    
  
    
    




RandomForestClassifier: 94.17%




ExtraTreesClassifier: 94.48%
LinearSVC: 86.86%




Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier.

In [10]:
from sklearn.ensemble import VotingClassifier

hardVote = True

for hardVote in [True, False]:
    if hardVote:
        voter = 'hard'
        est = [
            ('forest', model_option[0]), 
            ('tree', model_option[1]),
            ('linSvc', model_option[2])
        ]

    else:
        voter = 'soft'
        est = [
            ('forest', model_option[0]), 
            ('tree', model_option[1]),
        ]
        
    vote_clf = VotingClassifier(estimators=est, voting=voter)
    vote_clf.fit(X_train, y_train)
    
    clf_name = vote_clf.__class__.__name__
    y_pred1 = vote_clf.predict(X_validation)
    acc1 = accuracy_score(y_validation, y_pred1)
    
    y_pred2 = vote_clf.predict(X_test)
    acc2 = accuracy_score(y_test, y_pred2)
    
    print("{}({}): Val score:{}%".format(clf_name, voter, acc1*100))
    print("{}({}): Test score:{}%\n".format(clf_name, voter, acc2*100))
    



VotingClassifier(hard): Val score:99.94%
VotingClassifier(hard): Test score:95.25%

VotingClassifier(soft): Val score:100.0%
VotingClassifier(soft): Test score:95.92%



In [11]:
for clf in model_option:
    clf_name = clf.__class__.__name__
    
    clf.fit(X_train, y_train)
    
    y_pred1 = clf.predict(X_validation)
    acc1 = accuracy_score(y_validation, y_pred1)
    
    y_pred2 = clf.predict(X_test)
    acc2 = accuracy_score(y_test, y_pred2)
    
    print("{}: Val score:{}%".format(clf_name, acc1*100))
    print("{}: Test score:{}%\n".format(clf_name, acc2*100))



RandomForestClassifier: Val score:99.92%
RandomForestClassifier: Test score:94.67%

ExtraTreesClassifier: Val score:100.0%
ExtraTreesClassifier: Test score:95.17999999999999%

LinearSVC: Val score:86.85000000000001%
LinearSVC: Test score:85.9%



