In [1]:
# pip install python-mnist will install the required package
from mnist import MNIST
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
np.random.seed(60) # reproducability
mndata = MNIST('Datasets/MNIST')

# read training images and corresponding labels
tr_images, tr_labels = mndata.load_training()
# read test images and corresponding labels
tt_images, tt_labels = mndata.load_testing()

# convert lists into numpy format and apply normalization
tr_images = np.array(tr_images) / 255. # shape (60000, 784)
tr_labels = np.array(tr_labels)         # shape (60000,)
tt_images = np.array(tt_images) / 255. # shape (10000, 784)
tt_labels = np.array(tt_labels)         # shape (10000,)

columns_images = ['p{}'.format(i+1) for i in range(784)]
tr_df_images = pd.DataFrame(data=tr_images, columns=columns_images)
tr_df_labels = pd.DataFrame(data=tr_labels, columns=['label'])
tt_df_images = pd.DataFrame(data=tt_images, columns=columns_images)
tt_df_labels = pd.DataFrame(data=tt_labels, columns=['label'])

In [3]:
X_train, X_test, y_train, y_test = train_test_split(tr_df_images, tr_df_labels, test_size=0.2, random_state=0)

In [4]:
dtc = DecisionTreeClassifier(random_state=0, max_depth = 40, min_samples_split = 2, min_samples_leaf=2, max_leaf_nodes=1000, min_impurity_decrease=0.00003)

In [5]:
# Train Decision Tree Classifer
dtc = dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8724166666666666


In [6]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)
pred=rf.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

  rf.fit(X_train,y_train)


0.97125


In [8]:
N=100

In [9]:
splitted_data = []
tr_df_images_copy = tr_df_images.copy()
tr_df_labels_copy = tr_df_labels.copy()
splitted_data_size = tr_df_images.shape[0] // N

for i in range(N):
    start_index = splitted_data_size*i
    end_index = splitted_data_size*(i+1)-1
    
    if i == (N-1):
        images_splitted = tr_df_images_copy.iloc[start_index:,].reset_index(drop=True)
        labels_splitted = tr_df_labels_copy.iloc[start_index:,].reset_index(drop=True)
    else:
        images_splitted = tr_df_images_copy.iloc[start_index:end_index,].reset_index(drop=True)
        labels_splitted = tr_df_labels_copy.iloc[start_index:end_index,].reset_index(drop=True)

    splitted_data.append({
        "images": images_splitted,
        "labels": labels_splitted
    })


In [10]:
splitted_data[99]["images"]

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p775,p776,p777,p778,p779,p780,p781,p782,p783,p784
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
trained_trees = []
for i in range(N):
    X_train_subtree = splitted_data[i]["images"]
    Y_train_subtree = splitted_data[i]["labels"]
    dtc = DecisionTreeClassifier(random_state=0)
    dtc = dtc.fit(X_train_subtree,Y_train_subtree)
    y_pred = dtc.predict(X_test)
    print("Accuracy of",i+1,"th Tree:",metrics.accuracy_score(y_test, y_pred))
    trained_trees.append(dtc)

Accuracy of 1 th Tree: 0.5763333333333334
Accuracy of 2 th Tree: 0.59875
Accuracy of 3 th Tree: 0.59275
Accuracy of 4 th Tree: 0.5920833333333333
Accuracy of 5 th Tree: 0.5914166666666667
Accuracy of 6 th Tree: 0.6295833333333334
Accuracy of 7 th Tree: 0.5890833333333333
Accuracy of 8 th Tree: 0.58325
Accuracy of 9 th Tree: 0.6051666666666666
Accuracy of 10 th Tree: 0.5769166666666666
Accuracy of 11 th Tree: 0.60025
Accuracy of 12 th Tree: 0.5885833333333333
Accuracy of 13 th Tree: 0.6024166666666667
Accuracy of 14 th Tree: 0.5825833333333333
Accuracy of 15 th Tree: 0.6015
Accuracy of 16 th Tree: 0.6059166666666667
Accuracy of 17 th Tree: 0.6151666666666666
Accuracy of 18 th Tree: 0.6120833333333333
Accuracy of 19 th Tree: 0.6241666666666666
Accuracy of 20 th Tree: 0.5940833333333333
Accuracy of 21 th Tree: 0.59525
Accuracy of 22 th Tree: 0.5949166666666666
Accuracy of 23 th Tree: 0.6266666666666667
Accuracy of 24 th Tree: 0.58125
Accuracy of 25 th Tree: 0.6001666666666666
Accuracy of 

In [12]:
total_predictions = trained_trees[0].predict(X_test)
for i in range(1, N):
    total_predictions = np.vstack([total_predictions, trained_trees[i].predict(X_test)])
print(total_predictions)

[[4 6 3 ... 5 1 4]
 [3 6 6 ... 5 1 6]
 [3 6 8 ... 5 1 6]
 ...
 [3 6 6 ... 5 5 6]
 [3 6 7 ... 3 3 6]
 [1 6 8 ... 5 1 6]]


In [13]:
total_predictions = np.transpose(total_predictions)
total_predictions

array([[4, 3, 3, ..., 3, 3, 1],
       [6, 6, 6, ..., 6, 6, 6],
       [3, 6, 8, ..., 6, 7, 8],
       ...,
       [5, 5, 5, ..., 5, 3, 5],
       [1, 1, 1, ..., 5, 3, 1],
       [4, 6, 6, ..., 6, 6, 6]], dtype=uint8)

In [14]:
predicted_values = []
for row in total_predictions:
    majority_vote = np.bincount(row).argmax()
    predicted_values.append(majority_vote)

In [15]:
predicted_values = np.asarray(predicted_values)

In [16]:
print("Accuracy of Random Forest Tree:",metrics.accuracy_score(y_test, predicted_values))

Accuracy of Random Forest Tree: 0.89425


In [18]:
# Genetic algorithm
def evolve(trees):
    return

def crossover(tree1, tree2):
    return

def mutate(tree):
    return

def tournament(trees):
    return best
