In [1]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import seaborn as sns

# used for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Remote directory that hosts the CSV files, and the local directory they are cached in.
DOWNLOAD_ROOT = "http://www.macs.hw.ac.uk/%7Eek19/data/"
CURRENT_PATH = os.path.join(os.getcwd(), "datasets")
X_FILE = "x_train_gr_smpl.csv"
Y_FILE = "y_train_smpl.csv"
Y0_FILE = "y_train_smpl_0.csv"

def fetch_data(download_root=DOWNLOAD_ROOT, current_path=CURRENT_PATH):
    """Download the feature and label CSV files that are not yet on disk.

    Each of ``X_FILE`` and ``Y_FILE`` is checked individually, so a file that
    is already present is never downloaded again and a file that is missing
    is always fetched.

    Parameters
    ----------
    download_root : str
        URL prefix the file names are appended to (must end with ``/``).
    current_path : str
        Local directory the files are stored in; created if necessary.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        If a missing file cannot be retrieved.
    """
    # ``import urllib`` alone does not load the ``request`` submodule; import
    # it here so the call below does not rely on another library having done so.
    import urllib.request

    missing = [
        name for name in (X_FILE, Y_FILE)
        if not os.path.isfile(os.path.join(current_path, name))
    ]
    if not missing:
        return

    os.makedirs(current_path, exist_ok=True)

    for name in missing:
        urllib.request.urlretrieve(download_root + name, os.path.join(current_path, name))
    

In [3]:
# Populate the local datasets directory using the default URL and path;
# returns immediately when the expected CSV files are already present.
fetch_data()

In [90]:
def load_features_data(current_path=CURRENT_PATH):
    """Read the feature CSV (``X_FILE``) from ``current_path``.

    Column labels are prefixed with ``pixel_`` and row labels with ``sign_``.

    Parameters
    ----------
    current_path : str
        Directory that contains ``X_FILE``.

    Returns
    -------
    pandas.DataFrame
        The relabelled feature table.
    """
    csv_path = os.path.join(current_path, X_FILE)
    raw_features = pd.read_csv(csv_path)
    return raw_features.rename(
        columns=lambda column: "pixel_" + column,
        index=lambda row: "sign_" + str(row),
    )

def load_labels_data(labels_n=None, current_path=None):
    """Read a label CSV and relabel its column and rows.

    Parameters
    ----------
    labels_n : int or None
        ``None`` reads ``Y_FILE`` and names the column ``class``. Any other
        value reads ``y_train_smpl_<labels_n>.csv`` and names the column
        ``class_<labels_n>``.
    current_path : str or None
        Directory that contains the CSV file. ``None`` (the default) uses
        ``CURRENT_PATH``, looked up when the function is called.

    Returns
    -------
    pandas.DataFrame
        The labels, with row labels prefixed by ``sign_``.
    """
    if current_path is None:
        current_path = CURRENT_PATH

    if labels_n is None:
        file_name = Y_FILE
        column_name = "class"
    else:
        file_name = "y_train_smpl_" + str(labels_n) + ".csv"
        column_name = "class_" + str(labels_n)

    raw_labels = pd.read_csv(os.path.join(current_path, file_name))
    # Every column receives the same name, exactly as before.
    return raw_labels.rename(
        columns=lambda _: column_name,
        index=lambda row: "sign_" + str(row),
    )

In [133]:
# Load the label files for class indices 0-9; labels[k] is the frame for index k.
labels = [load_labels_data(class_id) for class_id in range(10)]

features = load_features_data()
features.head()  # preview the first five rows of the feature frame

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_2294,pixel_2295,pixel_2296,pixel_2297,pixel_2298,pixel_2299,pixel_2300,pixel_2301,pixel_2302,pixel_2303
sign_0,78.0,77.0,76.0,82.0,87.0,92.0,104.0,119.0,117.0,120.0,...,87.0,79.0,72.0,76.0,83.0,95.0,99.0,98.0,95.0,94.0
sign_1,73.0,75.0,79.0,78.0,76.0,75.0,89.0,107.0,133.0,125.0,...,96.0,93.0,85.0,77.0,69.0,73.0,83.0,100.0,101.0,101.0
sign_2,72.0,75.0,79.0,77.0,81.0,89.0,105.0,109.0,86.0,90.0,...,98.0,95.0,88.0,80.0,73.0,71.0,74.0,80.0,89.0,95.0
sign_3,67.0,70.0,74.0,80.0,93.0,107.0,110.0,96.0,69.0,100.0,...,112.0,92.0,87.0,82.0,77.0,72.0,70.0,72.0,81.0,88.0
sign_4,74.0,74.0,73.0,72.0,77.0,87.0,104.0,109.0,84.0,83.0,...,100.0,98.0,99.0,100.0,99.0,89.0,78.0,66.0,68.0,72.0


In [None]:
# Task 3: Deeper analysis of the data. The data is split into 10 classes;
#         search for the important attributes of each class.

In [138]:
# Append all ten per-class label columns to the feature frame.
# One concat call joins every frame at once, instead of re-copying the
# growing frame on each loop iteration.
features_and_classes = pd.concat([features] + labels[:10], axis=1, sort=False)

In [139]:
# Preview the first five rows of the combined feature/label frame.
features_and_classes.head()

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9
sign_0,78.0,77.0,76.0,82.0,87.0,92.0,104.0,119.0,117.0,120.0,...,0,1,1,1,1,1,1,1,1,1
sign_1,73.0,75.0,79.0,78.0,76.0,75.0,89.0,107.0,133.0,125.0,...,0,1,1,1,1,1,1,1,1,1
sign_2,72.0,75.0,79.0,77.0,81.0,89.0,105.0,109.0,86.0,90.0,...,0,1,1,1,1,1,1,1,1,1
sign_3,67.0,70.0,74.0,80.0,93.0,107.0,110.0,96.0,69.0,100.0,...,0,1,1,1,1,1,1,1,1,1
sign_4,74.0,74.0,73.0,72.0,77.0,87.0,104.0,109.0,84.0,83.0,...,0,1,1,1,1,1,1,1,1,1


In [140]:
# Pairwise Pearson's r between all columns (pixel columns and class columns alike)
features_corr = features_and_classes.corr(method="pearson")

In [221]:
# For each class column, take its absolute correlation with every other column
# and drop the class_* rows from the correlations.
class_columns = ["class_" + str(k) for k in range(10)]
labels_feature_selection = [
    features_corr[class_name].abs().drop(class_columns)
    for class_name in class_columns
]
    

In [202]:
# Task 4: Improvement in classification, based on feature selection

# Function to get the top n pixels for each class label
def getTopPixel(nlargest, unique=False):
    """Collect the highest-scoring index labels for every class.

    For each Series in the module-level ``labels_feature_selection`` list,
    the ``nlargest`` labels with the largest values are taken; the per-class
    results are concatenated in list order.

    Parameters
    ----------
    nlargest : int
        Number of labels to take per class.
    unique : bool, default False
        When True, a label that ranks in more than one class is kept only at
        its first occurrence. Leaving it False preserves repeats, which
        produce duplicate column labels when the result is used to index a
        DataFrame.

    Returns
    -------
    list
        The selected labels.
    """
    top_pixels = []
    # Iterate over the list itself rather than a fixed count of ten classes.
    for class_scores in labels_feature_selection:
        top_pixels.extend(class_scores.nlargest(nlargest).index.tolist())

    if not unique:
        return top_pixels

    # Order-preserving de-duplication.
    seen = set()
    distinct_pixels = []
    for pixel in top_pixels:
        if pixel not in seen:
            seen.add(pixel)
            distinct_pixels.append(pixel)
    return distinct_pixels

In [203]:
# Top-n pixel names per class for n = 5, 10 and 20
top_5_pixels, top_10_pixels, top_20_pixels = [
    getTopPixel(n_per_class) for n_per_class in (5, 10, 20)
]

In [219]:
# Independent copies of the feature frame restricted to each pixel selection
top_5_features, top_10_features, top_20_features = [
    features[selected_pixels].copy()
    for selected_pixels in (top_5_pixels, top_10_pixels, top_20_pixels)
]

In [220]:
# Preview the first rows of the 5-per-class selection; uncomment one of the
# lines below instead to inspect the 10- or 20-per-class selection.
top_5_features.head()
#top_10_features.head()
#top_20_features.head()

Unnamed: 0,pixel_2261,pixel_2262,pixel_2263,pixel_2213,pixel_2260,pixel_1073,pixel_1121,pixel_1120,pixel_1074,pixel_1072,...,pixel_1714,pixel_1715,pixel_1666,pixel_1761,pixel_1743,pixel_1134,pixel_1215,pixel_1263,pixel_1086,pixel_1216
sign_0,72.0,71.0,68.0,73.0,73.0,174.0,197.0,207.0,163.0,184.0,...,99.0,95.0,102.0,108.0,118.0,127.0,220.0,226.0,123.0,231.0
sign_1,85.0,76.0,70.0,89.0,91.0,192.0,203.0,183.0,195.0,169.0,...,118.0,111.0,99.0,133.0,98.0,193.0,124.0,123.0,185.0,199.0
sign_2,81.0,71.0,66.0,92.0,90.0,191.0,207.0,211.0,179.0,196.0,...,122.0,114.0,101.0,136.0,112.0,167.0,196.0,208.0,161.0,225.0
sign_3,92.0,81.0,72.0,100.0,86.0,204.0,218.0,227.0,207.0,213.0,...,124.0,116.0,113.0,134.0,105.0,179.0,206.0,191.0,177.0,230.0
sign_4,126.0,127.0,128.0,138.0,123.0,162.0,170.0,178.0,150.0,163.0,...,87.0,81.0,104.0,102.0,92.0,186.0,199.0,209.0,139.0,202.0
