In [None]:
import pandas as pd
from keras.datasets import mnist
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load MNIST; load_data() returns a (train, test) pair, each of which is
# unpacked below into its two arrays.
mnist_train, mnist_test = mnist.load_data()
x_train, y_train = mnist_train
x_test, y_test = mnist_test

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
# Flatten every image into a single feature row. The row count comes from the
# array itself and -1 lets NumPy infer the row width, so nothing is tied to a
# hard-coded 60000 samples of 28*28 values.
X_train = x_train.reshape(len(x_train), -1)

In [None]:
# Exploratory PCA with 44 components; its explained variance is inspected in
# the following cell. random_state pins the outcome for runs in which the
# default svd_solver='auto' resolves to a randomized solver. 45 is the seed
# the other estimators in this notebook use.
pca = PCA(n_components=44, random_state=45)
pca.fit(X_train)

PCA(n_components=44)

In [None]:
# Share of the total variance kept by the fitted components, computed with the
# array's own vectorised reduction instead of Python's element-wise sum().
pca.explained_variance_ratio_.sum()

0.8032799165085327

In [None]:
# Rebuild the 44-component PCA with the full SVD solver selected explicitly
# and fit it on X_train. fit() returns the estimator, so the bare name on the
# last line displays the fitted object.
pca = PCA(svd_solver='full', n_components=44).fit(X_train)
pca

PCA(n_components=44, svd_solver='full')

In [None]:
# Project the training data onto the principal components. The PCA was already
# fitted on this same X_train in the previous cell, so transform() is enough;
# fit_transform() would repeat the whole decomposition only to arrive at the
# same projection (up to floating-point rounding).
X_transform = pca.transform(X_train)

In [None]:
# Hold out 30% of the PCA-projected samples for evaluation, with a fixed seed.
# NOTE(review): this rebinds X_train, y_train and y_test from the cells above,
# so the cell is not re-runnable on its own (y_train no longer matches
# X_transform in length after the first run).
# NOTE(review): the PCA was fitted on all rows before this split, so the
# held-out rows have already influenced the projection.
X_train, X_test, y_train, y_test = train_test_split(
    X_transform,
    y_train,
    random_state=45,
    test_size=0.3,
)

**Random Forest**

In [None]:
# Base learner: a 10-tree random forest limited to depth 20, with at least
# 10 samples per leaf and a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=10,
    max_depth=20,
    min_samples_leaf=10,
    criterion='gini',
    random_state=45,
)

In [None]:
# Wrap the forest in a one-vs-rest meta-classifier.
ovrc_rfc = OneVsRestClassifier(estimator=rfc)

In [None]:
# Train the one-vs-rest forest on the training split; the fitted estimator
# returned by fit() is what the cell displays.
ovrc_rfc.fit(X=X_train, y=y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(max_depth=20,
                                                     min_samples_leaf=10,
                                                     n_estimators=10,
                                                     random_state=45))

In [None]:
# Predict labels for the held-out split with the one-vs-rest forest.
y_pred = ovrc_rfc.predict(X=X_test)

In [None]:
# Confusion matrix for the random forest: rows are true labels, columns are
# predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1721,    0,    6,    5,    3,   15,   16,    1,    4,    4],
       [   1, 1926,   18,    4,    1,    5,    4,    5,    8,    2],
       [  12,    5, 1648,   25,   18,    7,   10,   15,   41,    7],
       [   5,    6,   42, 1623,    5,   60,    7,   20,   46,   18],
       [   6,   13,   15,    3, 1568,    5,   22,   11,   12,   75],
       [  21,    4,   18,   50,   24, 1401,   38,    4,   28,   13],
       [  18,    5,   11,    3,    7,   29, 1673,    1,    9,    1],
       [  11,   17,   27,    0,   26,    5,    0, 1767,    9,   45],
       [  12,   23,   31,   70,   19,   46,   10,    5, 1584,   26],
       [  12,    6,    6,   34,   74,    6,    3,   50,   15, 1604]])

**Logistic Regression**

In [None]:
# Base learner: logistic regression using the L-BFGS solver and a fixed seed.
lr = LogisticRegression(
    random_state=45,
    solver='lbfgs',
)

In [None]:
# Wrap the logistic regression in a one-vs-rest meta-classifier and train it
# on the training split in one expression (fit() returns the estimator).
ovrc_lr = (
    OneVsRestClassifier(estimator=lr)
    .fit(X=X_train, y=y_train)
)

In [None]:
# Predict labels for the held-out split with the one-vs-rest logistic
# regression (this replaces the previous model's y_pred).
y_pred = ovrc_lr.predict(X=X_test)

In [None]:
# Confusion matrix for the logistic regression: rows are true labels, columns
# are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1718,    1,    4,    4,    4,   19,    9,    4,   10,    2],
       [   0, 1909,    7,    7,    0,   13,    2,    3,   31,    2],
       [  14,   27, 1533,   26,   44,   11,   34,   34,   53,   12],
       [   6,   17,   62, 1554,    4,   85,   11,   15,   47,   31],
       [   6,   12,   16,    2, 1556,    7,   17,    6,   26,   82],
       [  20,   10,   17,   81,   29, 1293,   49,   11,   51,   40],
       [  15,    5,   15,    2,   14,   33, 1650,    3,   19,    1],
       [   8,   12,   24,    8,   26,    2,    3, 1751,   10,   63],
       [  15,   59,   26,   59,    9,   74,   17,    7, 1530,   30],
       [  17,   14,   11,   44,   86,   23,    1,   65,   17, 1532]])

**Decision Tree**

In [None]:
# Base learner: a single decision tree limited to depth 20, with at least
# 10 samples per leaf and a fixed seed.
dtc = DecisionTreeClassifier(
    max_depth=20,
    min_samples_leaf=10,
    criterion='gini',
    random_state=45,
)

In [None]:
# Wrap the decision tree in a one-vs-rest meta-classifier and train it on the
# training split in one expression (fit() returns the estimator).
ovrc_dtc = (
    OneVsRestClassifier(estimator=dtc)
    .fit(X=X_train, y=y_train)
)

In [None]:
# Predict labels for the held-out split with the one-vs-rest decision tree
# (this replaces the previous model's y_pred).
y_pred = ovrc_dtc.predict(X=X_test)

In [None]:
# Confusion matrix for the decision tree: rows are true labels, columns are
# predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1605,    0,   33,   19,    9,   52,   20,    7,   17,   13],
       [   1, 1899,   24,    2,    4,    8,   12,    9,    9,    6],
       [  14,   15, 1511,   49,   33,   28,   31,   19,   56,   32],
       [   6,    9,   98, 1501,    7,   79,   15,   17,   71,   29],
       [   5,    8,   40,    8, 1403,   36,   37,   25,   31,  137],
       [  14,    7,   83,   81,   35, 1236,   41,   12,   62,   30],
       [  23,    9,   36,   13,   12,   51, 1582,    1,   18,   12],
       [  13,   12,   39,   11,   35,   13,    3, 1701,   12,   68],
       [  22,   20,  116,   73,   18,   74,   19,   19, 1413,   52],
       [   9,   11,   48,   39,  106,   36,   15,   78,   44, 1424]])

In [None]:
# Read the exam samples from CSV, using the FileName column as the row index.
df = pd.read_csv(
    filepath_or_buffer='pred_for_exam.csv',
    index_col="FileName",
)

In [None]:
# Preview the first ten exam rows.
df.head(n=10)

Unnamed: 0_level_0,Label,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f774,f775,f776,f777,f778,f779,f780,f781,f782,f783
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
file1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file5,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file7,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file8,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file9,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
file10,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Feature matrix for the exam samples: every column except the Label column.
X_data = df.drop(columns="Label")

In [None]:
# Project the exam samples with the PCA fitted earlier. The PCA was fitted on
# a plain NumPy array, so the DataFrame is converted to an array first; passing
# the DataFrame directly makes scikit-learn warn about feature names it did
# not see at fit time. The projected values are unchanged.
# assumes the CSV feature columns are in the same order as the flattened
# pixel columns used for fitting — TODO confirm
X_reduced = pca.transform(X_data.to_numpy())



In [None]:
# Classify the exam sample in row 4 with the one-vs-rest random forest, then
# print column 4 of its class-probability output. The one-row slice keeps the
# input two-dimensional, just like wrapping the row in a list.
# NOTE(review): the probability column (4) is hard-coded, not derived from
# the prediction printed above it.
print(ovrc_rfc.predict(X_reduced[4:5]))
print(ovrc_rfc.predict_proba(X_reduced[4:5])[:, 4])

[4]
[0.64334151]


In [None]:
# Classify the exam sample in row 9 with the one-vs-rest logistic regression,
# then print column 9 of its class-probability output. The one-row slice keeps
# the input two-dimensional, just like wrapping the row in a list.
# NOTE(review): the probability column (9) is hard-coded, not derived from
# the prediction printed above it.
print(ovrc_lr.predict(X_reduced[9:10]))
print(ovrc_lr.predict_proba(X_reduced[9:10])[:, 9])

[9]
[0.80209183]


In [None]:
# Classify the exam sample in row 0 with the one-vs-rest decision tree, then
# print column 7 of its class-probability output. The one-row slice keeps the
# input two-dimensional, just like wrapping the row in a list.
# NOTE(review): the probability column (7) is hard-coded, not derived from
# the prediction printed above it.
print(ovrc_dtc.predict(X_reduced[0:1]))
print(ovrc_dtc.predict_proba(X_reduced[0:1])[:, 7])

[7]
[0.83207517]
