In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from statistics import *
%matplotlib inline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from keras.datasets import mnist

Using TensorFlow backend.


1. Select the first 3000 rows from the MNIST dataset.

In [2]:
# Load MNIST and keep only the first 3000 training images with their labels.
# X_pred / y_pred receive the second pair returned by load_data(); y_pred is
# reassigned later in the notebook with classifier predictions.
(X_train, y_train), (X_pred, y_pred) = mnist.load_data()
X_train, y_train = X_train[:3000], y_train[:3000]

In [5]:
# Flatten every 28x28 image into one row of 784 pixel values.
dim = 28 * 28
X_train = X_train.reshape((X_train.shape[0], dim))

2. Randomly split the derived sample into training and test sets in a ratio of 70 to 30 using train_test_split(). Use the parameter **random_state = 15**.

In [6]:
# 70/30 hold-out split with a fixed seed.
# X_train / y_train are overwritten by their training part, so run this cell
# only once per kernel session (a second run would split the 70% part again).
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=15,
)

In [7]:
# Number of rows left in the training part after the split.
X_train.shape[0]

2100

3. Using the One-vs-All multiclass classification method OneVsRestClassifier(), train the random forest classifier RandomForestClassifier() with parameters **criterion='gini', min_samples_leaf=10, max_depth=20, n_estimators=10, random_state=15**. Perform evaluation using the test data.

In [8]:
# Hyper-parameters prescribed by the task statement.
forest_params = {
    'criterion': 'gini',
    'min_samples_leaf': 10,
    'max_depth': 20,
    'n_estimators': 10,
    'random_state': 15,
}
# Note: despite its name, `tree` holds a random forest.
tree = RandomForestClassifier(**forest_params)
# Wrap the forest in a one-vs-rest scheme and fit it on the raw pixel rows.
clf1 = OneVsRestClassifier(tree).fit(X_train, y_train)

In [9]:
# Predict labels for the held-out test rows with the one-vs-rest random forest.
y_pred = clf1.predict(X_test)

Enter the number of correctly classified objects of all classes for the test data:

In [10]:
# Confusion matrix for the forest: rows are true digits, columns are predicted digits.
CM = confusion_matrix(y_true=y_test, y_pred=y_pred)
CM

array([[77,  0,  2,  0,  1,  2,  2,  0,  2,  0],
       [ 0, 99,  0,  1,  0,  1,  0,  0,  0,  0],
       [ 1,  0, 72,  0,  2,  0,  1,  3,  2,  1],
       [ 0,  1,  6, 79,  0,  3,  2,  7,  4,  2],
       [ 0,  2,  3,  0, 82,  0,  1,  2,  0,  0],
       [ 1,  0,  0,  4,  3, 62,  2,  0,  3,  2],
       [ 2,  1,  1,  0,  0,  2, 79,  0,  0,  0],
       [ 2,  0,  1,  0,  6,  0,  0, 87,  0,  1],
       [ 0,  2,  4,  5,  0,  6,  1,  0, 63,  2],
       [ 1,  0,  0,  2,  8,  0,  0,  5,  2, 77]])

In [11]:
# Correctly classified objects are the diagonal of the confusion matrix;
# np.trace sums that diagonal directly instead of looping over indices.
np.trace(CM)

777

5. Download the data for prediction. Obtain a prediction for the digit image with **id 10** using the trained algorithm.  

Enter the probability of assigning the image to the assigned class. Enter the answer rounded to the nearest thousandth.

In [12]:
# Read the images to be labelled and keep only the feature columns as a NumPy array.
# The 'id' column is discarded here, so later cells address images by row position.
prediction_csv = 'DataForPrediction_FinalTask.csv'
df = pd.read_csv(prediction_csv, sep=',').drop(columns=['id']).values
df

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
# Label every prediction image with the one-vs-rest random forest.
y_pred = clf1.predict(df)
# The image is picked by row position, because the 'id' column was dropped from df.
# NOTE(review): assumes row position 10 is the image whose id is 10 — confirm against the CSV.
assigned = y_pred[10]

In [15]:
# predict_proba columns are ordered like clf1.classes_, so translate the
# predicted label into its column instead of using the label itself as an index.
assigned_col = list(clf1.classes_).index(assigned)
# Only row 10 is needed, so score that single row rather than the whole file.
round(clf1.predict_proba(df[10:11])[0][assigned_col], 3)

0.763

6. Find scores for the training data corresponding to the **25** principal components.

In [16]:
# Fit a 25-component PCA (full SVD solver) on the training rows.
pca = PCA(svd_solver='full', n_components=25)
pca.fit(X_train)
modelPCA = pca  # fit() returns the estimator itself, so both names refer to the fitted model
# Replace the training rows by their principal-component scores.
# X_train is overwritten, so this cell must not be re-run without re-creating X_train first.
X_train = modelPCA.transform(X_train)

7. Using the One-vs-All multiclass classification method OneVsRestClassifier(), train the decision tree DecisionTreeClassifier() with parameters **criterion='gini', min_samples_leaf=10, max_depth=20, random_state=15**. Use the scores as training data.

In [17]:
# Hyper-parameters prescribed by the task statement.
tree_params = {
    'criterion': 'gini',       # split criterion
    'min_samples_leaf': 10,    # minimum number of objects in a leaf
    'max_depth': 20,
    'random_state': 15,
}
dec_tree = DecisionTreeClassifier(**tree_params)

# Fit a one-vs-rest wrapper around the tree; X_train is whatever the preceding
# cells left in it (the PCA scores when the notebook is run top to bottom).
clf3 = OneVsRestClassifier(dec_tree).fit(X_train, y_train)

8. Apply PCA transformation obtained earlier to the test data. Perform evaluation.

In [21]:
# Project the test rows with the PCA model fitted earlier.
# X_test is overwritten by its scores, so run this cell once per kernel session.
X_test = modelPCA.transform(X_test)

In [22]:
# Predict test labels with the one-vs-rest decision tree (clf3).
y_pred = clf3.predict(X_test)

Enter the number of correctly classified objects of all classes for the test data:

In [23]:
# Confusion matrix for the decision tree: rows are true digits, columns are predicted digits.
CM = confusion_matrix(y_true=y_test, y_pred=y_pred)
CM

array([[66,  0,  1,  0,  1, 10,  1,  3,  3,  1],
       [ 0, 95,  2,  0,  1,  0,  1,  0,  0,  2],
       [ 4,  0, 62,  3,  3,  2,  0,  3,  1,  4],
       [ 3,  1,  4, 65,  2,  9,  1,  0,  4, 15],
       [ 0,  1,  4,  0, 67,  1,  1,  1,  3, 12],
       [ 3,  2,  0,  9,  1, 42,  3,  1,  4, 12],
       [ 2,  1,  2,  0,  1,  1, 73,  1,  3,  1],
       [ 1,  1,  2,  0,  4,  0,  1, 79,  2,  7],
       [ 1,  1,  4,  4,  0,  6,  1,  1, 49, 16],
       [ 3,  2,  1,  3, 10,  3,  0,  5,  3, 65]])

In [24]:
# Correctly classified objects are the diagonal of the confusion matrix;
# np.trace sums that diagonal directly instead of looping over indices.
np.trace(CM)

663

9. Apply PCA transformation obtained earlier to the data for prediction. Obtain a prediction for the digit image with **id 10** using the trained algorithm.

In [25]:
# Apply the already-fitted PCA to the prediction images.
# df is overwritten by the scores, so run this cell once per kernel session.
df = modelPCA.transform(df)

Enter the probability of assigning the image to the assigned class by the decision tree.

In [26]:
# Label every prediction image with the one-vs-rest decision tree (clf3).
y_pred = clf3.predict(df)
# The image is picked by row position in df.
# NOTE(review): assumes row position 10 is the image whose id is 10 — confirm against the CSV.
assigned = y_pred[10]

In [27]:
# predict_proba columns are ordered like clf3.classes_, so translate the
# predicted label into its column instead of using the label itself as an index.
assigned_col = list(clf3.classes_).index(assigned)
# Only row 10 is needed, so score that single row rather than the whole file.
# NOTE(review): the saved output of this cell shows a warning raised at sklearn's
# row normalisation (`Y /= np.sum(Y, axis=1)...`), presumably because some other
# rows have a zero total score; scoring one row keeps those rows out of the computation.
round(clf3.predict_proba(df[10:11])[0][assigned_col], 3)

  Y /= np.sum(Y, axis=1)[:, np.newaxis]


0.556