In [None]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from skimage import io
from skimage.transform import resize
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load both MNIST CSV exports, then separate predictors from the target.
train_data = pd.read_csv("mnist_train.csv")
test_data = pd.read_csv("mnist_test.csv")

# Columns 0..783 are used as the features and column 784 as the label.
train_x, train_y = train_data.iloc[:, :784], train_data.iloc[:, 784]
test_x, test_y = test_data.iloc[:, :784], test_data.iloc[:, 784]

In [None]:
# Baseline 1: Gaussian naive Bayes fitted directly on the unprocessed feature columns.
untouched_nb_gau = GaussianNB().fit(train_x, train_y)

# Evaluate on the held-out split: predictions, accuracy and confusion matrix.
untouched_gau_test_result = untouched_nb_gau.predict(test_x)
untouched_gau_acc = accuracy_score(test_y, untouched_gau_test_result)
untouched_gau_cf = confusion_matrix(test_y, untouched_gau_test_result)

In [None]:
# Baseline 2: Bernoulli naive Bayes (library defaults) on the unprocessed feature columns.
untouched_nb_ber = BernoulliNB().fit(train_x, train_y)

# Evaluate on the held-out split: predictions, accuracy and confusion matrix.
untouched_ber_test_result = untouched_nb_ber.predict(test_x)
untouched_ber_acc = accuracy_score(test_y, untouched_ber_test_result)
untouched_ber_cf = confusion_matrix(test_y, untouched_ber_test_result)

In [None]:
# crop function
def image_crop(arr):
    """Return the central 20x20 window of a flattened 28x28 image.

    Parameters
    ----------
    arr : array-like
        At least 784 values in row-major order, i.e. value ``k`` sits at
        row ``k // 28`` and column ``k % 28``. The input is read purely by
        position, so the labels of a pandas Series do not matter. Values
        after the first 784 are ignored.

    Returns
    -------
    numpy.ndarray
        A new float array of shape ``(20, 20)`` holding rows 4..23 and
        columns 4..23 of the 28x28 image (a 4-pixel margin removed on
        every side).

    Raises
    ------
    ValueError
        If ``arr`` holds fewer than 784 values.
    """
    side, margin = 28, 4

    # Convert once to a flat float vector; this avoids label-based lookups
    # on a Series and lets the crop be a plain 2-D slice.
    flat = np.asarray(arr, dtype=float).ravel()
    if flat.size < side * side:
        raise ValueError(
            "image_crop expects at least %d values, got %d" % (side * side, flat.size)
        )

    grid = flat[:side * side].reshape(side, side)
    # Symmetric margin on both axes; copy so the result never aliases the input.
    return grid[margin:side - margin, margin:side - margin].copy()

In [None]:
# cropped_digit is a 20 by 20 matrix
# stretch function

def image_scale(images):
    """Trim all-zero borders and return the bounding box of the non-zero pixels.

    Despite the name, this function does not resample anything: it only
    cuts the image down to the smallest rectangle containing every
    non-zero value. Stretching that rectangle back to a fixed size is left
    to the caller.

    Parameters
    ----------
    images : array-like
        A single 2-D image (the notebook passes the 20x20 output of
        ``image_crop``). Any 2-D shape is accepted.

    Returns
    -------
    numpy.ndarray
        The sub-array spanning the first to the last non-blank row and
        column, both inclusive. When the image has no non-zero value at
        all, the whole image is returned unchanged so the caller still
        receives a non-empty array.

    Raises
    ------
    ValueError
        If ``images`` is not 2-dimensional.
    """
    images = np.asarray(images)
    if images.ndim != 2:
        raise ValueError("image_scale expects a 2-D array, got %d-D" % images.ndim)

    filled = images != 0
    rows = np.flatnonzero(filled.any(axis=1))  # indices of rows with any ink
    cols = np.flatnonzero(filled.any(axis=0))  # indices of columns with any ink

    # A completely blank image has no bounding box; hand it back as-is
    # instead of producing an empty slice.
    if rows.size == 0:
        return images

    # Slice ends are exclusive, hence the +1 to keep the last inked row/column.
    return images[rows[0]:rows[-1] + 1, cols[0]:cols[-1] + 1]


In [None]:
# data preprocessing
def pre_process(data):
    """Crop, bounding-box and stretch every image row of ``data``.

    For each row the first 784 columns are passed through ``image_crop``,
    then ``image_scale``, and the result is resized back to 20x20 with
    ``skimage.transform.resize`` and flattened to 400 values.

    Parameters
    ----------
    data : pandas.DataFrame
        One image per row; only columns 0..783 (by position) are read.

    Returns
    -------
    pandas.DataFrame
        One row per input row and 400 columns labelled 0..399. The row
        index is a fresh 0..n-1 range. An empty input yields an empty
        frame with the same 400 columns.
    """
    # Pull the pixel block out once; per-row ``.iloc`` access is far slower.
    pixel_rows = data.iloc[:, 0:784].to_numpy()

    processed = []
    for row in pixel_rows:
        cropped = image_crop(row)
        boxed = image_scale(cropped)
        stretched = resize(boxed, (20, 20))
        processed.append(stretched.reshape(400))

    if not processed:
        return pd.DataFrame(np.empty((0, 400)))

    # Build the frame in one go: growing a DataFrame row by row is quadratic,
    # and ``DataFrame.append`` no longer exists in pandas 2.x.
    return pd.DataFrame(np.vstack(processed))

In [None]:
# The stretched features are read from CSV files rather than recomputed here
# (presumably cached output of pre_process -- confirm how they were written).
# To rebuild them in-notebook instead, use:
#     stretched_train_x = pre_process(train_x)
#     stretched_test_x = pre_process(test_x)
# NOTE(review): if the cache was saved with an index column, that column is
# loaded as an extra feature here -- verify against the code that wrote it.
stretched_train_x = pd.read_csv("preprocessed_train")
stretched_test_x = pd.read_csv("preprocessed_test")

# Pre-processing does not touch the labels, so they are reused as-is.
stretched_train_y, stretched_test_y = train_y, test_y

In [None]:
# Gaussian naive Bayes on the stretched bounding-box features.
stretched_nb_gau = GaussianNB().fit(stretched_train_x, stretched_train_y)

# Evaluate on the stretched test split: predictions, accuracy and confusion matrix.
stretched_gau_test_result = stretched_nb_gau.predict(stretched_test_x)
stretched_gau_acc = accuracy_score(stretched_test_y, stretched_gau_test_result)
stretched_gau_cf = confusion_matrix(stretched_test_y, stretched_gau_test_result)

In [None]:
# Bernoulli naive Bayes (library defaults) on the stretched bounding-box features.
stretched_nb_ber = BernoulliNB().fit(stretched_train_x, stretched_train_y)

# Evaluate on the stretched test split: predictions, accuracy and confusion matrix.
stretched_ber_test_result = stretched_nb_ber.predict(stretched_test_x)
stretched_ber_acc = accuracy_score(stretched_test_y, stretched_ber_test_result)
stretched_ber_cf = confusion_matrix(stretched_test_y, stretched_ber_test_result)

In [None]:
# Naive Bayes accuracy summary: one column per model, one row per feature set.
d = {'Bernoulli': [untouched_ber_acc, stretched_ber_acc], 'Gaussian': [untouched_gau_acc, stretched_gau_acc]}
# Row labels go straight into the constructor: assigning to `df.iloc[:].index`
# only relabels a temporary object and leaves the displayed frame at 0, 1.
df_2a = pd.DataFrame(data=d, index=['Untouched images', 'Stretched bounding box'])
df_2a

In [None]:
def get_rf_acc(train_x, train_y, test_x, test_y, depth, trees, random_state=None):
    """Fit a random forest and return its accuracy on the test split.

    Parameters
    ----------
    train_x, train_y
        Training features and labels passed to ``RandomForestClassifier.fit``.
    test_x, test_y
        Held-out features and labels used for scoring.
    depth
        Forwarded as ``max_depth``.
    trees
        Forwarded as ``n_estimators``.
    random_state : optional
        Forwarded to the classifier. The default ``None`` leaves the forest
        unseeded, so repeated calls may give slightly different accuracies;
        pass an integer for repeatable numbers.

    Returns
    -------
    float
        Result of ``accuracy_score(test_y, predictions)``.
    """
    rf = RandomForestClassifier(
        max_depth=depth, n_estimators=trees, random_state=random_state
    )
    rf.fit(train_x, train_y)
    # Only the accuracy is reported, so no confusion matrix is computed here.
    test_result = rf.predict(test_x)
    return accuracy_score(test_y, test_result)

In [None]:
acc_un_4_10 = get_rf_acc (train_x, train_y, test_x, test_y, 4, 10)
acc_un_4_20 = get_rf_acc (train_x, train_y, test_x, test_y, 4, 20)
acc_un_4_30 = get_rf_acc (train_x, train_y, test_x, test_y, 4, 30)
acc_un_8_20 = get_rf_acc (train_x, train_y, test_x, test_y, 8, 10)
acc_un_8_20 = get_rf_acc (train_x, train_y, test_x, test_y, 8, 20)
acc_un_8_20 = get_rf_acc (train_x, train_y, test_x, test_y, 8, 30)
acc_un_16_20 = get_rf_acc (train_x, train_y, test_x, test_y, 16, 10)
acc_un_16_20 = get_rf_acc (train_x, train_y, test_x, test_y, 16, 20)
acc_un_16_20 = get_rf_acc (train_x, train_y, test_x, test_y, 16, 30)

In [None]:
acc_4_10 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 4, 10)
acc_4_20 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 4, 20)
acc_4_30 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 4, 30)
acc_8_20 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 8, 10)
acc_8_20 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 8, 20)
acc_8_20 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 8, 30)
acc_16_20 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 16, 10)
acc_16_20 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 16, 20)
acc_16_20 = get_rf_acc (stretched_train_x, stretched_train_y, stretched_test_x, stretched_test_y, 16, 30)

In [None]:
dic_2b_untouch = {'depth = 4': [acc_un_4_10, acc_un_4_20, acc_un_4_30], 
          'depth = 8': [acc_un_8_20, acc_un_8_20, acc_un_8_20], 
          'depth = 16': [acc_un_16_20, acc_un_16_20, acc_un_16_20]}
df_2b_untouch = pd.DataFrame(data=dic_2b_untouch)
df_2b_untouch.iloc[:].index = ['trees = 10', 'trees = 20', 'trees = 30']
df_2b_untouch = df_2b_untouch[['depth = 4', 'depth = 8', 'depth = 16']] #set column order
df_2b_untouch

In [None]:
dic_2b_stretched = {'depth = 4': [acc_4_10, acc_4_20, acc_4_30], 
          'depth = 8': [acc_8_20, acc_8_20, acc_8_20], 
          'depth = 16': [acc_16_20, acc_16_20, acc_16_20]}
df_2b_stretched = pd.DataFrame(data=dic_2b_stretched)
df_2b_stretched.iloc[:].index = ['trees = 10', 'trees = 20', 'trees = 30']
df_2b_stretched = df_2b_stretched[['depth = 4', 'depth = 8', 'depth = 16']] #set column order
df_2b_stretched