In [1]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd

# used for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression

# import train_test_split
from sklearn.model_selection import train_test_split

# naive bayes and metrics import 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Base URL the dataset files are downloaded from, and the local folder they are stored in.
DOWNLOAD_ROOT = "http://www.macs.hw.ac.uk/%7Eek19/data/"
CURRENT_PATH = os.path.join(os.getcwd(), "datasets")
# File names of the feature matrix and of the combined label file.
X_FILE = "x_train_gr_smpl.csv"
Y_FILE ="y_train_smpl.csv"

def fetch_data(download_root=DOWNLOAD_ROOT, current_path=CURRENT_PATH, file_names=None):
    """Download the dataset files that are not yet present in ``current_path``.

    Parameters
    ----------
    download_root : str
        URL prefix; each file name is appended to it verbatim.
    current_path : str
        Local directory the files are stored in (created if necessary).
    file_names : iterable of str, optional
        Names of the files to fetch. Defaults to ``(X_FILE, Y_FILE)``.

    Returns
    -------
    None

    Notes
    -----
    Only the missing files are downloaded; files already on disk are left
    untouched. Each file is first written to ``<name>.part`` and renamed once
    the transfer has finished, so an interrupted download is never mistaken
    for a complete file on the next call.
    """
    # `import urllib` alone does not load the `request` submodule, so make the
    # dependency explicit instead of relying on another library importing it.
    import urllib.request

    if file_names is None:
        file_names = (X_FILE, Y_FILE)

    missing = [
        name for name in file_names
        if not os.path.isfile(os.path.join(current_path, name))
    ]
    if not missing:
        return

    os.makedirs(current_path, exist_ok=True)

    for name in missing:
        destination = os.path.join(current_path, name)
        partial = destination + ".part"
        try:
            urllib.request.urlretrieve(download_root + name, partial)
            os.replace(partial, destination)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)
    

In [3]:
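# Uncomment on the first run to download X_FILE and Y_FILE into CURRENT_PATH
# (nothing is downloaded when both files already exist).
# NOTE: the per-class label files (y_train_smpl_<n>.csv) read further down are
# not covered by this call and must already be present in CURRENT_PATH.
#fetch_data()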

In [4]:
def load_features_data(current_path=CURRENT_PATH):
    """Read the feature CSV (``X_FILE``) from ``current_path`` into a DataFrame.

    Every column label is prefixed with ``"pixel_"`` and every row label with
    ``"sign_"``; the renamed frame is returned.
    """
    csv_path = os.path.join(current_path, X_FILE)
    raw_features = pd.read_csv(csv_path)

    def _pixel_label(column):
        # Column labels come from the CSV header and are concatenated as-is.
        return "pixel_" + column

    def _sign_label(row):
        return "sign_" + str(row)

    return raw_features.rename(columns=_pixel_label, index=_sign_label)

def load_labels_data(labels_n=None, current_path=CURRENT_PATH):
    """Read a label CSV from ``current_path`` into a DataFrame.

    Parameters
    ----------
    labels_n : int or None
        ``None`` loads the combined label file ``Y_FILE`` and names its
        column(s) ``"class"``. Any other value loads
        ``"y_train_smpl_<labels_n>.csv"`` and names its column(s)
        ``"class_<labels_n>"``.
    current_path : str
        Directory containing the CSV files. Defaults to ``CURRENT_PATH``,
        mirroring ``load_features_data``.

    Returns
    -------
    pandas.DataFrame
        The labels, with row labels prefixed by ``"sign_"``.
    """
    if labels_n is None:
        file_name = Y_FILE
        column_name = "class"
    else:
        file_name = "y_train_smpl_" + str(labels_n) + ".csv"
        column_name = "class_" + str(labels_n)

    raw_labels = pd.read_csv(os.path.join(current_path, file_name))
    # Every column receives the same name; presumably each label file holds a
    # single column -- verify against the data.
    return raw_labels.rename(
        columns=lambda _: column_name,
        index=lambda row: "sign_" + str(row),
    )

In [5]:
# Load the ten per-class label files (y_train_smpl_0.csv ... y_train_smpl_9.csv),
# keeping them in class order so that labels[n] belongs to class n.
labels = [load_labels_data(class_id) for class_id in range(10)]

features = load_features_data()
features.head()  # preview the first five rows of the feature frame

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_2294,pixel_2295,pixel_2296,pixel_2297,pixel_2298,pixel_2299,pixel_2300,pixel_2301,pixel_2302,pixel_2303
sign_0,78.0,77.0,76.0,82.0,87.0,92.0,104.0,119.0,117.0,120.0,...,87.0,79.0,72.0,76.0,83.0,95.0,99.0,98.0,95.0,94.0
sign_1,73.0,75.0,79.0,78.0,76.0,75.0,89.0,107.0,133.0,125.0,...,96.0,93.0,85.0,77.0,69.0,73.0,83.0,100.0,101.0,101.0
sign_2,72.0,75.0,79.0,77.0,81.0,89.0,105.0,109.0,86.0,90.0,...,98.0,95.0,88.0,80.0,73.0,71.0,74.0,80.0,89.0,95.0
sign_3,67.0,70.0,74.0,80.0,93.0,107.0,110.0,96.0,69.0,100.0,...,112.0,92.0,87.0,82.0,77.0,72.0,70.0,72.0,81.0,88.0
sign_4,74.0,74.0,73.0,72.0,77.0,87.0,104.0,109.0,84.0,83.0,...,100.0,98.0,99.0,100.0,99.0,89.0,78.0,66.0,68.0,72.0


In [6]:
# Task 3: Deeper analysis of the data: the data is split into 10 classes, 
#         search for important attributes for each class

In [7]:
# Combined label file (Y_FILE); used as the classification target further down.
all_labels = load_labels_data(labels_n=None)

In [8]:
# Append the ten per-class label columns to the pixel features, so that one
# corr() call on the combined frame covers every class at once instead of
# requiring a separate correlation run per class.
# A single concat over the whole list avoids re-copying the growing frame on
# every loop iteration.
features_and_classes = pd.concat(
    [features] + [labels[i] for i in range(10)],
    axis=1,
    sort=False,
)

In [9]:
# Preview the first five rows of the combined pixel + class frame.
features_and_classes.head(n=5)

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9
sign_0,78.0,77.0,76.0,82.0,87.0,92.0,104.0,119.0,117.0,120.0,...,0,1,1,1,1,1,1,1,1,1
sign_1,73.0,75.0,79.0,78.0,76.0,75.0,89.0,107.0,133.0,125.0,...,0,1,1,1,1,1,1,1,1,1
sign_2,72.0,75.0,79.0,77.0,81.0,89.0,105.0,109.0,86.0,90.0,...,0,1,1,1,1,1,1,1,1,1
sign_3,67.0,70.0,74.0,80.0,93.0,107.0,110.0,96.0,69.0,100.0,...,0,1,1,1,1,1,1,1,1,1
sign_4,74.0,74.0,73.0,72.0,77.0,87.0,104.0,109.0,84.0,83.0,...,0,1,1,1,1,1,1,1,1,1


In [10]:
# Pairwise Pearson correlation between every pair of columns (pixels and class
# indicators alike). This is the slow step of the notebook.
# NOTE(review): only the class_* columns of this matrix are read below, so
# correlating the pixels against the class columns alone may be enough -- confirm
# that nothing else uses the pixel/pixel part before narrowing it.
features_corr = features_and_classes.corr(method="pearson")

In [11]:
# For each class, take the absolute correlation of every column with that
# class indicator and remove the class_* rows themselves, leaving only pixels.
class_columns = ["class_" + str(class_id) for class_id in range(10)]
labels_feature_selection = [
    features_corr[class_column].abs().drop(class_columns)
    for class_column in class_columns
]
    

In [21]:
# Task 4: Improvement in classification, based on feature selection

# function to get the top n pixel for each class label  
def get_top_pixels(nlargest):
    """Collect the ``nlargest`` highest-scoring pixels of every class.

    For each Series in ``labels_feature_selection`` the ``nlargest`` entries
    with the largest values are printed and merged into one duplicate-free
    list.

    Parameters
    ----------
    nlargest : int
        Number of pixels to take per class.

    Returns
    -------
    list
        Pixel labels in first-seen order (class by class, best first). The
        order is deterministic, so the selected feature columns come out the
        same on every kernel restart; a plain ``list(set(...))`` would depend
        on string hashing.
    """
    top_pixels = []
    seen = set()
    for class_index, class_scores in enumerate(labels_feature_selection):
        class_top_pixels = class_scores.nlargest(nlargest).index.values
        print("class_" + str(class_index) + ", most reliable pixels: ")
        print(class_top_pixels)
        for pixel in class_top_pixels:
            # A pixel selected by several classes is kept only once.
            if pixel not in seen:
                seen.add(pixel)
                top_pixels.append(pixel)
    return top_pixels

In [13]:
def display_prediction(expected, predicted):
    """Print accuracy, classification report and confusion matrix.

    ``expected`` holds the true labels and ``predicted`` the model output.
    Each section is framed by a separator line.
    """
    separator = "====================================================="

    print(separator)
    print("Accuracy:", accuracy_score(expected, predicted))
    # The two remaining sections share the same layout: separator, then the metric.
    for metric in (classification_report, confusion_matrix):
        print(separator)
        print(metric(expected, predicted))
    print(separator)

In [14]:
def run_naive_bayes(X, Y, test_size=0.3, random_state=10):
    """Train a Gaussian naive Bayes model on a hold-out split and report it.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Target labels, one per row of ``X``.
    test_size : float, optional
        Fraction of the samples held out for testing (default 0.3, i.e. a
        70/30 train/test split).
    random_state : int, optional
        Seed for the shuffle performed by the split (default 10), so that
        repeated runs evaluate on the same rows.

    Returns
    -------
    None
        The metrics are printed through ``display_prediction``.
    """
    # NOTE(review): the split is not stratified; the class supports shown in the
    # reports look imbalanced, so passing stratify=Y may be worth considering.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Fit on the training part only.
    model = GaussianNB()
    model.fit(X_train, Y_train)

    # Evaluate on the held-out part.
    prediction = model.predict(X_test)

    display_prediction(Y_test, prediction)

In [22]:
# Union of the 5 best pixels of each of the 10 classes (at most 50 columns).
top_5_features = features.loc[:, get_top_pixels(5)].copy()

# The label frame's values are flattened to a 1-D target array.
run_naive_bayes(top_5_features, np.ravel(all_labels.values))

class_0, most reliable pixels: 
['pixel_2261' 'pixel_2262' 'pixel_2263' 'pixel_2213' 'pixel_2260']
class_1, most reliable pixels: 
['pixel_1073' 'pixel_1121' 'pixel_1120' 'pixel_1074' 'pixel_1072']
class_2, most reliable pixels: 
['pixel_1316' 'pixel_1030' 'pixel_1317' 'pixel_1268' 'pixel_1269']
class_3, most reliable pixels: 
['pixel_1218' 'pixel_1265' 'pixel_1313' 'pixel_1266' 'pixel_1219']
class_4, most reliable pixels: 
['pixel_983' 'pixel_1031' 'pixel_1030' 'pixel_982' 'pixel_935']
class_5, most reliable pixels: 
['pixel_1743' 'pixel_1507' 'pixel_1744' 'pixel_1508' 'pixel_1742']
class_6, most reliable pixels: 
['pixel_793' 'pixel_794' 'pixel_841' 'pixel_842' 'pixel_792']
class_7, most reliable pixels: 
['pixel_1761' 'pixel_1695' 'pixel_1743' 'pixel_1694' 'pixel_1714']
class_8, most reliable pixels: 
['pixel_1714' 'pixel_1715' 'pixel_1666' 'pixel_1761' 'pixel_1743']
class_9, most reliable pixels: 
['pixel_1134' 'pixel_1215' 'pixel_1263' 'pixel_1086' 'pixel_1216']
Accuracy: 0.376676

In [23]:
# Union of the 10 best pixels of each of the 10 classes (at most 100 columns).
top_10_features = features.loc[:, get_top_pixels(10)].copy()

# The label frame's values are flattened to a 1-D target array.
run_naive_bayes(top_10_features, np.ravel(all_labels.values))

class_0, most reliable pixels: 
['pixel_2261' 'pixel_2262' 'pixel_2263' 'pixel_2213' 'pixel_2260'
 'pixel_2212' 'pixel_2214' 'pixel_2264' 'pixel_2215' 'pixel_2165']
class_1, most reliable pixels: 
['pixel_1073' 'pixel_1121' 'pixel_1120' 'pixel_1074' 'pixel_1072'
 'pixel_1168' 'pixel_1025' 'pixel_1167' 'pixel_1026' 'pixel_1119']
class_2, most reliable pixels: 
['pixel_1316' 'pixel_1030' 'pixel_1317' 'pixel_1268' 'pixel_1269'
 'pixel_982' 'pixel_1364' 'pixel_1315' 'pixel_1365' 'pixel_1031']
class_3, most reliable pixels: 
['pixel_1218' 'pixel_1265' 'pixel_1313' 'pixel_1266' 'pixel_1219'
 'pixel_1361' 'pixel_1217' 'pixel_1312' 'pixel_1314' 'pixel_1171']
class_4, most reliable pixels: 
['pixel_983' 'pixel_1031' 'pixel_1030' 'pixel_982' 'pixel_935'
 'pixel_2139' 'pixel_2138' 'pixel_2140' 'pixel_2095' 'pixel_2096']
class_5, most reliable pixels: 
['pixel_1743' 'pixel_1507' 'pixel_1744' 'pixel_1508' 'pixel_1742'
 'pixel_1694' 'pixel_1695' 'pixel_1460' 'pixel_1714' 'pixel_1461']
class_6, most 

In [18]:
# Union of the 20 best pixels of each of the 10 classes (at most 200 columns).
top_20_features = features.loc[:, get_top_pixels(20)].copy()

# The label frame's values are flattened to a 1-D target array.
run_naive_bayes(top_20_features, np.ravel(all_labels.values))

Accuracy: 0.34502923976608185
              precision    recall  f1-score   support

           0       0.18      0.72      0.28        72
           1       0.66      0.31      0.43       682
           2       0.56      0.40      0.47       657
           3       0.34      0.38      0.36       439
           4       0.42      0.10      0.16       575
           5       0.69      0.17      0.27        65
           6       0.12      0.85      0.20       100
           7       0.32      0.34      0.33        79
           8       0.70      0.42      0.52       160
           9       0.24      0.77      0.36        78

    accuracy                           0.35      2907
   macro avg       0.42      0.45      0.34      2907
weighted avg       0.49      0.35      0.36      2907

[[ 52   3   0  12   3   0   0   0   0   2]
 [185 214  68 103  18   1  71   0   5  17]
 [ 31  50 265 128  47   0  53   5   0  78]
 [ 11   1  10 167   7   1 214   0  10  18]
 [ 13  52 131  73  55   0 218   4   0  