In [1]:
import os
import time

import struct
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Prepare the training set and testing set

In [2]:
def load_data(path, kind=''):
    """Load MNIST data from `path`"""
    labels_path = os.path.join(path,
                               '%s-labels-idx1-ubyte'
                               % kind)
    images_path = os.path.join(path,
                               '%s-images-idx3-ubyte'
                               % kind)
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II',
                                 lbpath.read(8))
        labels = np.fromfile(lbpath,
                             dtype=np.uint8)

    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack('>IIII',
                                               imgpath.read(16))
        images = np.fromfile(imgpath,
                             dtype=np.uint8).reshape(len(labels), 784)

    return images, labels

path = './datasets/'

In [3]:
X_mnist_train,y_mnist_train = load_data(path, kind='train')
X_mnist_test,y_mnist_test = load_data(path, kind='t10k')

In [4]:
noise_random = np.random.rand(60000,784)
noise_labels = np.full((60000), 10, dtype=int)

noise_random_6 = np.random.rand(6000,784)
noise_labels_6 = np.full((6000), 10, dtype=int)

In [5]:
X_random_train =  np.concatenate((X_mnist_train, noise_random), axis = 0 )
y_random_train = np.concatenate((y_mnist_train, noise_labels), axis = 0 )

X_random_train_6 =  np.concatenate((X_mnist_train, noise_random_6), axis = 0 )
y_random_train_6 = np.concatenate((y_mnist_train, noise_labels_6), axis = 0 )


In [6]:
# Random shuffle the training and testing datasets
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle

X_sparse_mnist_train = coo_matrix(X_mnist_train)
X_mnist_train, X_sparse_mnist_train, y_mnist_train = shuffle(X_mnist_train, X_sparse_mnist_train, y_mnist_train, random_state=666)

X_sparse_random_train = coo_matrix(X_random_train)
X_random_train, X_sparse_random_train, y_random_train = shuffle(X_random_train, X_sparse_random_train, y_random_train, random_state=666)

X_sparse_random_train_6 = coo_matrix(X_random_train_6)
X_random_train_6, X_sparse_random_train_6, y_random_train_6 = shuffle(X_random_train_6, X_sparse_random_train_6, y_random_train_6, random_state=666)

In [7]:
X_mnist_train = X_mnist_train/255
X_mnist_test = X_mnist_test/255

X_random_train = X_random_train/255

X_random_train_6 = X_random_train_6/255


# Training

In [8]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [9]:
hs_rf_mnist_scores = []
for i in range(3):
    rt_clf_mnist = RandomForestClassifier(bootstrap=True, 
                                    max_depth=1000, 
                                    max_leaf_nodes = 16,
                                    max_features='auto',
                                    n_estimators=6000, 
                                    n_jobs=-1,
                                    oob_score=True, 
                                    random_state=666, 
                                    verbose=0)
    
    rt_clf_mnist.fit(X_mnist_train, y_mnist_train)
    
    y_pred_mnist = rt_clf_mnist.predict(X_mnist_test)
    hs_rf_mnist_scores.append(accuracy_score(y_mnist_test, y_pred_mnist))

In [17]:
hs_rf_random_scores = []
for j in range(3):
    rt_clf_random = RandomForestClassifier(bootstrap=True, 
                                    max_depth=1000, 
                                    max_leaf_nodes = 16,
                                    max_features='auto',
                                    n_estimators=6000, 
                                    n_jobs=-1,
                                    oob_score=True, 
                                    random_state=666, 
                                    verbose=0)
    
    rt_clf_random.fit(X_random_train, y_random_train)

    y_pred_random = rt_clf_random.predict(X_mnist_test)
    hs_rf_random_scores.append(accuracy_score(y_mnist_test, y_pred_random))


In [12]:
hs_rf_random_6_scores = []
for j in range(3):
    rt_clf_random_6 = RandomForestClassifier(bootstrap=True, 
                                    max_depth=1000, 
                                    max_leaf_nodes = 16,
                                    max_features='auto',
                                    n_estimators=6000, 
                                    n_jobs=-1,
                                    oob_score=True, 
                                    random_state=666, 
                                    verbose=0)
    
    rt_clf_random_6.fit(X_random_train_6, y_random_train_6)

    y_pred_random_6 = rt_clf_random_6.predict(X_mnist_test)
    hs_rf_random_6_scores.append(accuracy_score(y_mnist_test, y_pred_random_6))


In [13]:
hs_rf_mnist_scores

[0.8328, 0.8328, 0.8328]

In [18]:
hs_rf_random_scores

[0.8266, 0.8266, 0.8266]

In [15]:
hs_rf_random_6_scores

[0.8256, 0.8256, 0.8256]