In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
import collections
import random
from scipy.io import savemat
import os 
from os import path
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
import time, datetime

In [2]:
mnist = fetch_openml('mnist_784',version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
mnist

{'DESCR': "The MNIST database of handwritten digits with 784 features. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  \n\nIt is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image by computing the center of mass of the pixels, and translating the image so as to position this point at the center of the 28x28 field.  \n\nWith some classification methods (particularly template-based methods, 

In [4]:
import numpy as np
X,y=mnist["data"],mnist["target"]
y=y.astype(int)
print(X.shape,y.shape)
print(min(y),max(y))
print(type(X),type(y))
print(np.min(X), np.max(X))

(70000, 784) (70000,)
0 9
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
0.0 255.0


In [5]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(60000, 784) (10000, 784) (60000,) (10000,)


In [6]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=100, max_depth=None)

In [7]:
start = datetime.datetime.now()
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
end = datetime.datetime.now()
diff = (end - start)

### Time taken by Random Forest model

In [8]:
diff

datetime.timedelta(seconds=44, microseconds=659552)

In [9]:
len(y_pred_rf)

10000

In [10]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_pred_rf)
accuracy_class = matrix.diagonal()/matrix.sum(axis=1)

### Confusion Matrix for Random Forest 

In [11]:
matrix

array([[ 969,    0,    1,    0,    0,    3,    3,    1,    3,    0],
       [   0, 1120,    3,    3,    1,    2,    3,    0,    2,    1],
       [   5,    0,  999,    6,    4,    0,    4,    8,    6,    0],
       [   0,    0,    8,  970,    0,   10,    0,    9,    9,    4],
       [   1,    0,    0,    0,  958,    0,    6,    1,    4,   12],
       [   3,    0,    0,   11,    3,  862,    5,    1,    5,    2],
       [   7,    3,    0,    0,    4,    3,  939,    0,    2,    0],
       [   1,    4,   21,    2,    1,    0,    0,  989,    3,    7],
       [   4,    0,    7,   11,    6,    7,    3,    3,  925,    8],
       [   5,    6,    2,   10,   11,    3,    2,    4,    6,  960]])

In [12]:
accuracy_class

array([0.98877551, 0.98678414, 0.96802326, 0.96039604, 0.97556008,
       0.96636771, 0.98016701, 0.96206226, 0.94969199, 0.95143707])

### Accuracy of Random Forest aross all the classes

In [13]:
j=0
for i in accuracy_class:
    print("Accuracy for the number ",j," is ",round(i*100,2),"%")
    j+=1

Accuracy for the number  0  is  98.88 %
Accuracy for the number  1  is  98.68 %
Accuracy for the number  2  is  96.8 %
Accuracy for the number  3  is  96.04 %
Accuracy for the number  4  is  97.56 %
Accuracy for the number  5  is  96.64 %
Accuracy for the number  6  is  98.02 %
Accuracy for the number  7  is  96.21 %
Accuracy for the number  8  is  94.97 %
Accuracy for the number  9  is  95.14 %


### Accuracy of Random Forest overall

In [14]:
accuracy_class.sum()/len(accuracy_class)*100

96.89265070673983

## PCA for dimensionality reduction

## variance_ratio = 0.95

In [15]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(60000, 784) (10000, 784) (60000,) (10000,)


In [16]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, svd_solver='full')
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [17]:
pca.explained_variance_ratio_ 

array([0.09704664, 0.07095924, 0.06169089, 0.05389419, 0.04868797,
       0.04312231, 0.0327193 , 0.02883895, 0.02762029, 0.02357001,
       0.0210919 , 0.02022991, 0.01715818, 0.01692111, 0.01578641,
       0.01482953, 0.01324561, 0.01276897, 0.01187263, 0.01152684,
       0.01066166, 0.01006713, 0.00953573, 0.00912544, 0.00883405,
       0.00839319, 0.00812579, 0.00786366, 0.00744733, 0.00690859,
       0.00658094, 0.00648148, 0.00602615, 0.00586582, 0.00570021,
       0.00543628, 0.00505786, 0.00487859, 0.00481429, 0.00472266,
       0.00456747, 0.00444836, 0.00418501, 0.00398215, 0.00384975,
       0.00375103, 0.00362009, 0.00351591, 0.00340058, 0.00321874,
       0.00319017, 0.00312805, 0.00295983, 0.00288955, 0.0028413 ,
       0.00271436, 0.00269521, 0.00258473, 0.00253771, 0.00244781,
       0.00240506, 0.00239263, 0.00230408, 0.00221532, 0.00213721,
       0.00207225, 0.00203043, 0.00196783, 0.00192853, 0.00188632,
       0.00186977, 0.00181083, 0.00177562, 0.00174898, 0.00165

In [18]:
start = datetime.datetime.now()
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
end = datetime.datetime.now()
diff = (end - start)

### Time taken by Random Forest model with PCA (variance_ratio=0.95)

In [19]:
diff

datetime.timedelta(seconds=105, microseconds=458442)

In [20]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_pred_rf)
accuracy_class = matrix.diagonal()/matrix.sum(axis=1)

### Confusion Matrix for Random Forest with PCA (variance_ratio=0.95) 

In [21]:
matrix

array([[ 963,    0,    3,    0,    0,    4,    6,    1,    3,    0],
       [   0, 1118,    6,    4,    0,    2,    3,    0,    2,    0],
       [   9,    0,  963,   14,    6,    2,    3,   11,   23,    1],
       [   4,    0,    9,  953,    0,   15,    3,    9,   12,    5],
       [   2,    1,    6,    0,  937,    3,    6,    3,    3,   21],
       [   5,    1,    4,   21,    6,  836,    8,    1,    6,    4],
       [   9,    3,    2,    1,    2,    2,  939,    0,    0,    0],
       [   1,    6,   19,    1,    6,    2,    1,  966,    4,   22],
       [   8,    0,   11,   19,    9,   17,    5,    6,  891,    8],
       [   7,    6,    1,   14,   23,    6,    0,   15,    4,  933]])

In [22]:
accuracy_class

array([0.98265306, 0.98502203, 0.93313953, 0.94356436, 0.95417515,
       0.93721973, 0.98016701, 0.93968872, 0.91478439, 0.9246779 ])

### Accuracy of Random Forest aross all the classes with PCA (variance_ratio=0.95) 

In [23]:
j=0
for i in accuracy_class:
    print("Accuracy for the number ",j," is ",round(i*100,2),"%")
    j+=1

Accuracy for the number  0  is  98.27 %
Accuracy for the number  1  is  98.5 %
Accuracy for the number  2  is  93.31 %
Accuracy for the number  3  is  94.36 %
Accuracy for the number  4  is  95.42 %
Accuracy for the number  5  is  93.72 %
Accuracy for the number  6  is  98.02 %
Accuracy for the number  7  is  93.97 %
Accuracy for the number  8  is  91.48 %
Accuracy for the number  9  is  92.47 %


### Accuracy of Random Forest overall with PCA (variance_ratio=0.95) 

In [24]:
accuracy_class.sum()/len(accuracy_class)*100

94.95091886394178

## variance_ratio = 0.90

In [25]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(60000, 784) (10000, 784) (60000,) (10000,)


In [26]:
pca = PCA(n_components=0.90, svd_solver='full')
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [27]:
pca.explained_variance_ratio_ 

array([0.09704664, 0.07095924, 0.06169089, 0.05389419, 0.04868797,
       0.04312231, 0.0327193 , 0.02883895, 0.02762029, 0.02357001,
       0.0210919 , 0.02022991, 0.01715818, 0.01692111, 0.01578641,
       0.01482953, 0.01324561, 0.01276897, 0.01187263, 0.01152684,
       0.01066166, 0.01006713, 0.00953573, 0.00912544, 0.00883405,
       0.00839319, 0.00812579, 0.00786366, 0.00744733, 0.00690859,
       0.00658094, 0.00648148, 0.00602615, 0.00586582, 0.00570021,
       0.00543628, 0.00505786, 0.00487859, 0.00481429, 0.00472266,
       0.00456747, 0.00444836, 0.00418501, 0.00398215, 0.00384975,
       0.00375103, 0.00362009, 0.00351591, 0.00340058, 0.00321874,
       0.00319017, 0.00312805, 0.00295983, 0.00288955, 0.0028413 ,
       0.00271436, 0.00269521, 0.00258473, 0.00253771, 0.00244781,
       0.00240506, 0.00239263, 0.00230408, 0.00221532, 0.00213721,
       0.00207225, 0.00203043, 0.00196783, 0.00192853, 0.00188632,
       0.00186977, 0.00181083, 0.00177562, 0.00174898, 0.00165

In [28]:
start = datetime.datetime.now()
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
end = datetime.datetime.now()
diff = (end - start)

### Time taken by Random Forest model with PCA (variance_ratio=0.90)

In [29]:
diff

datetime.timedelta(seconds=79, microseconds=822364)

In [30]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_pred_rf)
accuracy_class = matrix.diagonal()/matrix.sum(axis=1)

### Confusion Matrix for Random Forest with PCA (variance_ratio=0.90) 

In [31]:
matrix

array([[ 963,    0,    3,    0,    0,    3,    7,    2,    2,    0],
       [   0, 1121,    2,    5,    0,    1,    3,    0,    3,    0],
       [  12,    0,  975,   13,    4,    0,    2,    9,   16,    1],
       [   2,    0,    7,  956,    1,   11,    2,    9,   17,    5],
       [   1,    0,    5,    1,  941,    2,    9,    2,    4,   17],
       [   5,    1,    3,   18,    5,  842,    7,    1,    3,    7],
       [   7,    2,    2,    0,    3,    7,  934,    0,    2,    1],
       [   1,    6,   16,    2,    8,    0,    1,  973,    2,   19],
       [   4,    0,    9,   20,    9,   16,    3,    8,  897,    8],
       [   7,    6,    3,   13,   21,    6,    0,    8,    6,  939]])

In [32]:
accuracy_class

array([0.98265306, 0.9876652 , 0.94476744, 0.94653465, 0.95824847,
       0.94394619, 0.97494781, 0.94649805, 0.92094456, 0.93062438])

### Accuracy of Random Forest aross all the classes with PCA (variance_ratio=0.90) 

In [33]:
j=0
for i in accuracy_class:
    print("Accuracy for the number ",j," is ",round(i*100,2),"%")
    j+=1

Accuracy for the number  0  is  98.27 %
Accuracy for the number  1  is  98.77 %
Accuracy for the number  2  is  94.48 %
Accuracy for the number  3  is  94.65 %
Accuracy for the number  4  is  95.82 %
Accuracy for the number  5  is  94.39 %
Accuracy for the number  6  is  97.49 %
Accuracy for the number  7  is  94.65 %
Accuracy for the number  8  is  92.09 %
Accuracy for the number  9  is  93.06 %


### Accuracy of Random Forest overall with PCA (variance_ratio=0.90) 

In [34]:
accuracy_class.sum()/len(accuracy_class)*100

95.36829817138376

## variance_ratio = 0.85

In [35]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(60000, 784) (10000, 784) (60000,) (10000,)


In [36]:
pca = PCA(n_components=0.85, svd_solver='full')
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [37]:
pca.explained_variance_ratio_ 

array([0.09704664, 0.07095924, 0.06169089, 0.05389419, 0.04868797,
       0.04312231, 0.0327193 , 0.02883895, 0.02762029, 0.02357001,
       0.0210919 , 0.02022991, 0.01715818, 0.01692111, 0.01578641,
       0.01482953, 0.01324561, 0.01276897, 0.01187263, 0.01152684,
       0.01066166, 0.01006713, 0.00953573, 0.00912544, 0.00883405,
       0.00839319, 0.00812579, 0.00786366, 0.00744733, 0.00690859,
       0.00658094, 0.00648148, 0.00602615, 0.00586582, 0.00570021,
       0.00543628, 0.00505786, 0.00487859, 0.00481429, 0.00472266,
       0.00456747, 0.00444836, 0.00418501, 0.00398215, 0.00384975,
       0.00375103, 0.00362009, 0.00351591, 0.00340058, 0.00321874,
       0.00319017, 0.00312805, 0.00295983, 0.00288955, 0.0028413 ,
       0.00271436, 0.00269521, 0.00258473, 0.00253771])

In [38]:
start = datetime.datetime.now()
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
end = datetime.datetime.now()
diff = (end - start)

### Time taken by Random Forest model with PCA (variance_ratio=0.85)

In [39]:
diff

datetime.timedelta(seconds=60, microseconds=827781)

In [40]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_pred_rf)
accuracy_class = matrix.diagonal()/matrix.sum(axis=1)

### Confusion Matrix for Random Forest with PCA (variance_ratio=0.85) 

In [41]:
matrix

array([[ 967,    0,    2,    0,    0,    3,    6,    1,    1,    0],
       [   0, 1120,    2,    5,    0,    0,    4,    0,    3,    1],
       [   9,    0,  975,   10,    6,    0,    2,    9,   19,    2],
       [   2,    0,    6,  956,    1,   14,    2,    7,   17,    5],
       [   1,    0,    7,    0,  946,    1,    8,    1,    3,   15],
       [   3,    1,    1,   17,    4,  843,   12,    1,    6,    4],
       [  10,    3,    1,    0,    2,    2,  938,    0,    2,    0],
       [   1,    4,   19,    1,    6,    2,    0,  972,    3,   20],
       [   6,    0,    8,   18,   12,   17,    4,    4,  898,    7],
       [   3,    6,    4,   12,   20,    6,    0,    8,    9,  941]])

In [42]:
accuracy_class

array([0.98673469, 0.98678414, 0.94476744, 0.94653465, 0.96334012,
       0.94506726, 0.97912317, 0.94552529, 0.92197125, 0.93260654])

### Accuracy of Random Forest aross all the classes with PCA (variance_ratio=0.85) 

In [43]:
j=0
for i in accuracy_class:
    print("Accuracy for the number ",j," is ",round(i*100,2),"%")
    j+=1

Accuracy for the number  0  is  98.67 %
Accuracy for the number  1  is  98.68 %
Accuracy for the number  2  is  94.48 %
Accuracy for the number  3  is  94.65 %
Accuracy for the number  4  is  96.33 %
Accuracy for the number  5  is  94.51 %
Accuracy for the number  6  is  97.91 %
Accuracy for the number  7  is  94.55 %
Accuracy for the number  8  is  92.2 %
Accuracy for the number  9  is  93.26 %


### Accuracy of Random Forest overall with PCA (variance_ratio=0.85) 

In [45]:
accuracy_class.sum()/len(accuracy_class)*100

95.52454575749132

### Discussion: 
### 1) The time taken for the normal random forest model is 44 secs and the accuracy is 96.89 %. 
### 2) The time taken after dimension reduction using PCA (variance_ratio=0.95) is 105 secs and the accuracy is 94.95%. 
### 3) The time taken after dimension reduction using PCA (variance_ratio=0.90) is 79 secs and the accuracy is 95.36%. 
### 4) The time taken after dimension reduction using PCA (variance_ratio=0.85) is 60 secs and the accuracy is 95.52%. 

### Thus, the time taken by normal model is the least, followed by PCA with var_ratio=0.85 < var_ratio=90 < var_ratio=0.95