In [1]:
import pandas as pd
import idx2numpy
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pickle
# Select seaborn's "whitegrid" plot style once, right after the imports.
sns.set_style("whitegrid")

In [6]:
def read_data(path):
    """Load the four MNIST-format IDX files stored under ``<path>/input``.

    Parameters
    ----------
    path : str
        Folder that contains the ``input`` directory. A trailing path
        separator is accepted but not required.

    Returns
    -------
    tuple
        ``(arr_data, arr_label, test_data, test_label)``. The two image
        arrays are flattened to 2-D with one row per image; the two label
        arrays are returned exactly as ``idx2numpy`` decodes them.
    """
    import os  # local import: the notebook's import cell does not bring in os

    def _load(name, flatten=False):
        # Decode a single IDX file from the ``input`` folder.
        arr = idx2numpy.convert_from_file(os.path.join(path, 'input', name))
        if flatten:
            # Take the sample count and the pixel count from the decoded array
            # instead of hard-coding 60000/10000 x 784, so subsets or other
            # image sizes load without a reshape error.
            arr = arr.reshape(arr.shape[0], int(np.prod(arr.shape[1:])))
        return arr

    arr_data = _load('train-images-idx3-ubyte', flatten=True)
    arr_label = _load('train-labels-idx1-ubyte')
    test_data = _load('t10k-images-idx3-ubyte', flatten=True)
    test_label = _load('t10k-labels-idx1-ubyte')
    return arr_data,arr_label,test_data,test_label

def split_data(arr_data,arr_label,test_size=0.1,random_state=42):
    """Split samples and labels into a training part and a held-out part.

    Thin wrapper around ``train_test_split``. The defaults reproduce the
    split used throughout this notebook (10% held out, seed 42); both can now
    be overridden by keyword.

    Parameters
    ----------
    arr_data, arr_label
        Samples and their labels, passed straight to ``train_test_split``.
    test_size : float or int, optional
        Forwarded to ``train_test_split`` (default ``0.1``).
    random_state : int, optional
        Forwarded to ``train_test_split`` (default ``42``).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        arr_data, arr_label, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def model_cross_validation(arr_data, arr_label,model,cv=10):
    """Cross-validate ``model`` and plot the score obtained on every fold.

    Prints the per-fold scores, draws them as a line plot and prints their
    mean.

    Parameters
    ----------
    arr_data, arr_label
        Samples and labels handed to ``cross_val_score``.
    model
        Estimator to evaluate.
    cv : optional
        Forwarded to ``cross_val_score`` as its ``cv`` argument. Defaults to
        ``10``, the value that used to be hard-coded here.

    Returns
    -------
    The array of per-fold scores returned by ``cross_val_score``.
    """
    result = cross_val_score(model, arr_data, arr_label, cv=cv)
    print(result)
    # x-axis is simply the fold index (0 .. number of folds - 1).
    plt.plot(range(len(result)), result)
    plt.ylabel('accuracy')
    plt.xlabel('result_number')
    plt.title('Cross Validation Test Result')
    plt.show()
    print(np.mean(result))
    return result

def model_test(arr_data, arr_label,test_data,test_label,model,filename):
    """Fit ``model``, pickle it to ``filename`` and report test-set quality.

    Steps: fit on the training data, save the fitted model with ``pickle``,
    predict the test data, draw a row-normalised 10x10 confusion-matrix
    heatmap on the current figure and print the classification report.

    Parameters
    ----------
    arr_data, arr_label
        Training samples and labels passed to ``model.fit``.
    test_data, test_label
        Test samples and their true labels.
    model
        Estimator exposing ``fit`` and ``predict``.
    filename : str
        Destination of the pickled, fitted model.

    Returns
    -------
    The fitted ``model``.
    """
    model.fit(arr_data, arr_label)
    # Context manager so the file is flushed and closed even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    classname = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
    prediction = model.predict(test_data)

    con_array = confusion_matrix(test_label, prediction, labels=range(10))
    # Normalise every row by its OWN total. keepdims=True keeps the totals as
    # a column vector; a flat vector would broadcast across columns and divide
    # cell [i, j] by the total of row j, which is only right when every class
    # has the same number of test samples.
    row_totals = con_array.sum(axis=1, keepdims=True)
    # A class with no true samples keeps an all-zero row instead of 0/0 = NaN.
    row_totals[row_totals == 0] = 1
    con_array = con_array / row_totals

    df_cm = pd.DataFrame(con_array, range(10), range(10))
    sns.set(font_scale=1.4)
    sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})
    df_classification = classification_report(test_label, prediction, target_names=classname)
    print(df_classification)
    return model

def random_forest_tune_depth(arr_data,arr_label,param_range):
    """Sweep ``max_depth`` of a 50-tree random forest and plot hold-out accuracy.

    The data is divided once with ``split_data``. For each depth in
    ``param_range`` a ``RandomForestClassifier(n_estimators=50,
    random_state=0)`` is fitted on the training part and scored on the
    held-out part.

    Parameters
    ----------
    arr_data, arr_label
        Samples and labels handed to ``split_data``.
    param_range
        Candidate ``max_depth`` values; also used as the x-axis of the plot.

    Returns
    -------
    list
        One hold-out accuracy per candidate, in iteration order.
    """
    fit_X, held_X, fit_y, held_y = split_data(arr_data, arr_label)

    def _holdout_accuracy(depth):
        # Echo the candidate first so progress is visible while fitting.
        print(depth)
        forest = RandomForestClassifier(n_estimators=50, max_depth=depth, random_state=0)
        forest.fit(fit_X, fit_y)
        return forest.score(held_X, held_y)

    accuracies = [_holdout_accuracy(depth) for depth in param_range]
    print(accuracies)

    plt.plot(param_range, accuracies)
    plt.ylabel('accuracy')
    plt.xlabel('max_depth')
    plt.show()
    return accuracies

def random_forest_tune_estimators(arr_data,arr_label,param_range):
    """Sweep ``n_estimators`` of a depth-30 random forest and plot hold-out accuracy.

    The data is divided once with ``split_data``. For each tree count in
    ``param_range`` a ``RandomForestClassifier(max_depth=30, random_state=0)``
    is fitted on the training part and scored on the held-out part.

    Parameters
    ----------
    arr_data, arr_label
        Samples and labels handed to ``split_data``.
    param_range
        Candidate ``n_estimators`` values; also used as the x-axis of the plot.

    Returns
    -------
    list
        One hold-out accuracy per candidate, in iteration order.
    """
    fit_X, held_X, fit_y, held_y = split_data(arr_data, arr_label)

    def _holdout_accuracy(n_trees):
        # Echo the candidate first so progress is visible while fitting.
        print(n_trees)
        forest = RandomForestClassifier(n_estimators=n_trees, max_depth=30, random_state=0)
        forest.fit(fit_X, fit_y)
        return forest.score(held_X, held_y)

    accuracies = [_holdout_accuracy(n_trees) for n_trees in param_range]
    print(accuracies)

    plt.plot(param_range, accuracies)
    plt.ylabel('accuracy')
    plt.xlabel('n_estimators')
    plt.show()
    return accuracies

def decision_tree_tune_depth(arr_data,arr_label,param_range):
    """Sweep ``max_depth`` of a decision tree and plot hold-out accuracy.

    The data is divided once with ``split_data``. For each depth in
    ``param_range`` a ``DecisionTreeClassifier(random_state=0)`` is fitted on
    the training part and scored on the held-out part.

    Parameters
    ----------
    arr_data, arr_label
        Samples and labels handed to ``split_data``.
    param_range
        Candidate ``max_depth`` values; also used as the x-axis of the plot.

    Returns
    -------
    list
        One hold-out accuracy per candidate, in iteration order.
    """
    fit_X, held_X, fit_y, held_y = split_data(arr_data, arr_label)

    def _holdout_accuracy(depth):
        # Echo the candidate first so progress is visible while fitting.
        print(depth)
        tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
        tree.fit(fit_X, fit_y)
        return tree.score(held_X, held_y)

    accuracies = [_holdout_accuracy(depth) for depth in param_range]
    print(accuracies)

    plt.plot(param_range, accuracies)
    plt.ylabel('accuracy')
    plt.xlabel('max_depth')
    plt.show()
    return accuracies

def kneighbor_tune_k(arr_data,arr_label,param_range):
    """Sweep ``n_neighbors`` of a k-nearest-neighbours classifier and plot hold-out accuracy.

    The data is divided once with ``split_data``. For each k in
    ``param_range`` a ``KNeighborsClassifier`` is fitted on the training part
    and scored on the held-out part.

    Parameters
    ----------
    arr_data, arr_label
        Samples and labels handed to ``split_data``.
    param_range
        Candidate ``n_neighbors`` values; also used as the x-axis of the plot.

    Returns
    -------
    list
        One hold-out accuracy per candidate, in iteration order.
    """
    fit_X, held_X, fit_y, held_y = split_data(arr_data, arr_label)

    def _holdout_accuracy(k):
        # Echo the candidate first so progress is visible while fitting.
        print(k)
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(fit_X, fit_y)
        return knn.score(held_X, held_y)

    accuracies = [_holdout_accuracy(k) for k in param_range]
    print(accuracies)

    plt.plot(param_range, accuracies)
    plt.ylabel('accuracy')
    plt.xlabel('K')
    plt.show()
    return accuracies

def load_model(filename,test_data,test_label):
    """Load a pickled model and print and return its score on the given data.

    Parameters
    ----------
    filename : str
        Path of a pickle file containing a fitted estimator.
    test_data, test_label
        Samples and true labels passed to the estimator's ``score`` method.

    Returns
    -------
    The value returned by ``loaded_model.score(test_data, test_label)``.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load model files
    # that you created yourself or that come from a trusted source.
    # Context manager so the file handle is closed as soon as loading is done.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    result = loaded_model.score(test_data,test_label)
    print("Accuracy: "+str(result))
    return result

### Read Data

In [7]:
# Folder that contains the "input/" directory with the four IDX files.
# Change this to your local path (keep the trailing slash).
path = "E:/Sydney Semester 2/Machine Learning/"
# Training images/labels and test images/labels; images come back as one
# flattened row per image.
arr_data,arr_label,test_data,test_label  =read_data(path)

### Random Forest Algorithm

In [None]:
# Tune the random forest hyperparameters on a hold-out split of the training set.
# Sweep 1: max_depth = 5, 10, ..., 55 (n_estimators is fixed at 50 in the helper).
score_li =random_forest_tune_depth(arr_data,arr_label,range(5, 60, 5))
# Sweep 2: n_estimators = 10, 20, ..., 90 (max_depth is fixed at 30 in the helper).
# NOTE: score_li is reassigned here, so only the second sweep's scores are kept.
score_li =random_forest_tune_estimators(arr_data,arr_label,range(10,100,10))

In [None]:
# Final random forest (70 trees, max_depth 30): fit on the full training set,
# pickle it to "randomforestmodel.sav" and report its quality on the test set.
randomforest = RandomForestClassifier(n_estimators=70, max_depth=30, random_state=0)
ranmodel = model_test(arr_data, arr_label,test_data,test_label,randomforest,"randomforestmodel.sav")

In [None]:
# Cross-validate a fresh, unfitted random forest with the same settings
# (70 trees, max_depth 30) on the training set.
randomforest = RandomForestClassifier(n_estimators=70, max_depth=30, random_state=0)
ranresult =model_cross_validation(arr_data, arr_label,randomforest)

In [None]:
# Reload the random forest pickled by the model_test cell and score it on the
# test set. Prefix the filename with your local path if the file lives elsewhere.
filename = "randomforestmodel.sav"
load_model(filename,test_data,test_label)

### Decision Tree Algorithm

In [None]:
# Tune the decision tree on a hold-out split of the training set:
# max_depth = 5, 7, ..., 29.
score_li =decision_tree_tune_depth(arr_data,arr_label,range(5,30,2))

In [None]:
# Final decision tree (max_depth 13): fit on the full training set, pickle it
# to "decisiontreemodel.sav" and report its quality on the test set.
clf = DecisionTreeClassifier(max_depth=13, random_state=0)
demodel = model_test(arr_data, arr_label,test_data,test_label,clf,"decisiontreemodel.sav")

In [None]:
# Cross-validate a fresh, unfitted decision tree with the same settings
# (max_depth 13) on the training set.
clf = DecisionTreeClassifier(max_depth=13, random_state=0)
deresult =model_cross_validation(arr_data, arr_label,clf)

In [10]:
# Reload the decision tree pickled by the model_test cell and score it on the
# test set. Prefix the filename with your local path if the file lives elsewhere.
filename = "decisiontreemodel.sav"
load_model(filename,test_data,test_label)

Accuracy: 0.8111


0.8111

### K-Nearest Neighbour Algorithm

In [None]:
# Tune k for the k-nearest-neighbours classifier: k = 2, 3, ..., 10.
# Only the first 20,000 training samples are used, with pixel values divided by 255.
score_li = kneighbor_tune_k(arr_data[:20000]/255,arr_label[:20000],range(2,11))

In [None]:
# Final k-NN model (k = 4): fit on the first 20,000 training samples, pickle it
# to "knearestmodel.sav" and evaluate on the first 3,000 test samples.
# NOTE(review): the pixels are NOT divided by 255 here, unlike the tuning and
# cross-validation cells - confirm that this difference is intended.
neigh = KNeighborsClassifier(n_neighbors=4)
kneimodel = model_test(arr_data[:20000], arr_label[:20000],test_data[:3000],test_label[:3000],neigh,"knearestmodel.sav")

In [None]:
# Cross-validate a fresh, unfitted k-NN model (k = 4) on the first 20,000
# training samples, with pixel values divided by 255.
neigh = KNeighborsClassifier(n_neighbors=4)
kneiresult =model_cross_validation(arr_data[:20000]/255,arr_label[:20000],neigh)

In [8]:
# Reload the k-NN model pickled by the model_test cell and score it on the
# first 3,000 test samples (the same unscaled subset it was evaluated on there).
# Prefix the filename with your local path if the file lives elsewhere.
filename = "knearestmodel.sav"
load_model(filename,test_data[:3000],test_label[:3000])

0.8276666666666667