# Import Statements

In [58]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
import seaborn as sns
import random as rnd
import csv
import random
import math
import operator

# Preprocess Dataset

In [25]:
train_data = pd.read_csv('train.csv')
train_data = train_data.rename(columns={"1":"labels"})
test_df = pd.read_csv('test.csv')
test_label_df = pd.read_csv('test_labels.csv')
test_label_df = test_label_df.rename(columns={"9":"labels"})

In [26]:
def findtrainsubset(data,drop_size):
    if isinstance(drop_size,float):
        drop_size=round(drop_size * len(data))
    indices = data.index.tolist()
    drop_indices = random.sample(population=indices,k=drop_size)
    train_subdata = data.drop(drop_indices)
    return train_subdata

In [27]:
random.seed(0)
train_subdata = findtrainsubset(train_data,drop_size=0.8)

# Scikit Learn

In [62]:
train_subdataset = train_subdata.values
X_train = train_subdataset[:,1:]
Y_train = train_subdataset[:,0]
X_test = test_df.values
Y_test = test_label_df.values

knn = neighbors.KNeighborsClassifier(4)
print('KNN score: %f' % knn.fit(X_train, Y_train).score(X_test, Y_test))

KNN score: 0.932933


In [41]:
def get_performance_measure(data,predicted):
    Precisions =[] 
    Recalls = []
    F1_score = []
    correct = 0
    #print(correct)
    confusion_matrix = [[0]*10]*10
    for x in range(len(data)):
        for i in range(0,10):
            if data[x,:][0] == i:
                for j in range(0,10):
                    if predicted[x] == j:
                        confusion_matrix[i][j] += 1
    for i in range(0,10):
        sum = TP = 0
        for j in range(0,10):
            if i == j:
                TP += confusion_matrix[i][i]
            sum += confusion_matrix[i][j]
        recall = (TP/sum)
        Recalls.append(recall)
    
    Recalls = np.array(Recalls) 
    
    for j in range(0,10):
        sum = TP = 0
        for i in range(0,10):
            if i == j:
                TP += confusion_matrix[i][i]
            sum += confusion_matrix[j][i]
        precision = (TP/sum)
        Precisions.append(precision)
        
    Precisions = np.array(Precisions) 
    
    for i in range(0,10):
        if (Precisions[i] == 0) or (Recalls[i] == 0):
            F1_score.append(0.)
        else:
            F1_score.append(2/((1/Precisions[i])+(1/Recalls[i]))) 
            
    #print(len(F1_score))
    for x in range(len(data)):
        if data[x,:][0] == predicted[x]:
            correct +=1
    Accuracy = (correct/(float)(len(data))) * 100.0

    return Precisions,Recalls,F1_score,confusion_matrix,Accuracy

def getresponse(neighbours):
    classvotes = {}
    Max = 0
    for x in range(len(neighbours)):
        if neighbours[x][0] in classvotes:
            classvotes[neighbours[x][0]]+=1
        else:
            classvotes[neighbours[x][0]]=1
        
        if classvotes[neighbours[x][0]] > Max:
            Max = classvotes[neighbours[x][0]]
            label = neighbours[x][0]
        
    #sortedvotes = sorted(classvotes.items(),key=operator.itemgetter(1),reverse=True)
    #return sortedvotes[0][0]
    return label

# Baseline Algorithm

## Random Guessing

In [29]:
def random_algorithm(train, test):
    output_values = [row[0] for row in train]
    unique = list(set(output_values))
    predicted = list()
    for row in test:
        index = random.randrange(len(unique))
        predicted.append(unique[index])
    return predicted

## Majority Voting

In [30]:
def majority_voting_algorithm(train, test):
    output_values = [row[0] for row in train]
    prediction = max(set(output_values), key=output_values.count)
    predicted = [prediction for i in range(len(test))]
    return predicted

In [31]:
predicted = random_algorithm(train_dataset,test_dataset)
Precisions,Recalls,F1_score,Accuracy= get_performance_measure(test_label_dataset,predicted)
for i in range(0,10):
    print("Class %d:" %i)
    print("Precision= %f" %Precisions[i])
    print("Recall= %f" %Recalls[i])
    print("F1_score= %f\n" %F1_score[i])
print("Accuracy %f" %Accuracy)
#print("Precision=%f" % precision, "Recall=%f" %recall, "F1-score=%f" %F1_score,"accuracy=%f" %accuracy)

Class 0:
Precision= 0.109109
Recall= 0.109109
F1_score= 0.109109

Class 1:
Precision= 0.102102
Recall= 0.102102
F1_score= 0.102102

Class 2:
Precision= 0.089089
Recall= 0.089089
F1_score= 0.089089

Class 3:
Precision= 0.098098
Recall= 0.098098
F1_score= 0.098098

Class 4:
Precision= 0.090090
Recall= 0.090090
F1_score= 0.090090

Class 5:
Precision= 0.112112
Recall= 0.112112
F1_score= 0.112112

Class 6:
Precision= 0.082082
Recall= 0.082082
F1_score= 0.082082

Class 7:
Precision= 0.107107
Recall= 0.107107
F1_score= 0.107107

Class 8:
Precision= 0.098098
Recall= 0.098098
F1_score= 0.098098

Class 9:
Precision= 0.112112
Recall= 0.112112
F1_score= 0.112112

Accuracy 10.010010


In [32]:
predicted = majority_voting_algorithm(train_dataset,test_dataset)
Precisions,Recalls,F1_score,Accuracy= get_performance_measure(test_label_dataset,predicted)
for i in range(0,10):
    print("Class %d:" %i)
    print("Precision= %f" %Precisions[i])
    print("Recall= %f" %Recalls[i])
    print("F1_score= %f\n" %F1_score[i])
print("Accuracy %f" %Accuracy)

Class 0:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 1:
Precision= 1.000000
Recall= 1.000000
F1_score= 1.000000

Class 2:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 3:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 4:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 5:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 6:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 7:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 8:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Class 9:
Precision= 0.000000
Recall= 0.000000
F1_score= 0.000000

Accuracy 10.110110


# Training and validation set split Function

In [33]:
def train_valid_split(data,valid_size):
    if isinstance(valid_size,float):
        valid_size=round(valid_size*len(data))
    indices = data.index.tolist()
    valid_indices = random.sample(population=indices,k=valid_size)
    valid_df = data.loc[valid_indices]
    train_df = data.drop(valid_indices)
    return train_df,valid_df

In [34]:
random.seed(0)
train_df,valid_df = train_valid_split(train_subdata,valid_size=0.1)
print("x=%d" %len(valid_df),"y=%d" %len(train_df))

x=400 y=3600


# Generate Prediction

In [35]:
def euclideandistance(train_data,Instance,length):
    distance=0
    for x in range(length):
        distance+=pow((train_data[x] - Instance[x]),2)
    return distance

def manhattandistance(train_data,Instance,length):
    distance=0
    for x in range(length):
        distance+= abs(train_data[x] - Instance[x])
    return distance

def getneighbours(train_data,Instance,Isparametertunning,k,proximity_measure):
    distances = []
    length = len(Instance)-1
    for x in range(len(train_data)):
        if Isparametertunning == True:
            dist = proximity_measure(train_data[x,1:],Instance[1:],length)
        else:
            dist = proximity_measure(train_data[x,1:],Instance,length)
            
        distances.append((train_data[x,:],dist))
        
    distances.sort(key=operator.itemgetter(1))
    neighbours = []
    for x in range(k):
        neighbours.append(distances[x][0])
    return neighbours

In [22]:
grid_search = {"K": [], "Accuracy": []}
train_dataset = train_df.values
valid_dataset = valid_df.values
for k in range(3,8,2):
    prediction = []
    for x in range(len(valid_dataset)):
        neighbours = getneighbours(train_dataset,valid_dataset[x,:],True,k,proximity_measure = euclideandistance)
        result = getresponse(neighbours)
        prediction.append(result)
        #print('> predicted: '+ repr(result) + ', actual:' + repr(valid_data[x,:][0]))
    _, _, _, _, accuracy = get_performance_measure(valid_dataset,prediction)
    grid_search["K"].append(k)
    grid_search["Accuracy"].append(accuracy)

grid_search = pd.DataFrame(grid_search)
grid_search.sort_values("Accuracy", ascending=False).head()

Unnamed: 0,K,Accuracy
0,3,94.5
2,7,94.25
1,5,94.0


# Classification on Testing Dataset

In [43]:
opt_k = 3
prediction = []
test_dataset = test_df.values
test_label_dataset = test_label_df.values

for x in range(len(test_dataset)):
    neighbours = getneighbours(train_dataset,test_dataset[x,:], False, opt_k,proximity_measure = euclideandistance)
    result = getresponse(neighbours)
    prediction.append(result)
    
Precisions, Recalls, F1_score, confusion_matrix, Accuracy = get_performance_measure(test_label_dataset,prediction)

for i in range(0,10):
    print("Class %d:" %i)
    print("Precision= %f" %Precisions[i])
    print("Recall= %f" %Recalls[i])
    print("F1_score= %f\n" %F1_score[i])
    
print("Accuracy %f" %Accuracy)

Class 0:
Precision= 0.098098
Recall= 0.098098
F1_score= 0.098098

Class 1:
Precision= 0.106106
Recall= 0.106106
F1_score= 0.106106

Class 2:
Precision= 0.107107
Recall= 0.107107
F1_score= 0.107107

Class 3:
Precision= 0.115115
Recall= 0.115115
F1_score= 0.115115

Class 4:
Precision= 0.087087
Recall= 0.087087
F1_score= 0.087087

Class 5:
Precision= 0.079079
Recall= 0.079079
F1_score= 0.079079

Class 6:
Precision= 0.101101
Recall= 0.101101
F1_score= 0.101101

Class 7:
Precision= 0.097097
Recall= 0.097097
F1_score= 0.097097

Class 8:
Precision= 0.092092
Recall= 0.092092
F1_score= 0.092092

Class 9:
Precision= 0.117117
Recall= 0.117117
F1_score= 0.117117

Accuracy 93.493493


In [53]:
print(confusion_matrix,"\n")

[[98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117], [98, 106, 107, 115, 87, 79, 101, 97, 92, 117]] 



In [54]:
grid_search = {"K": [], "Accuracy": []}
train_dataset = train_df.values
valid_dataset = valid_df.values
for k in range(3,8,2):
    prediction = []
    for x in range(len(valid_dataset)):
        neighbours = getneighbours(train_dataset,valid_dataset[x,:],True,k,proximity_measure = manhattandistance)
        result = getresponse(neighbours)
        prediction.append(result)
        
    _, _, _, _, accuracy = get_performance_measure(valid_dataset,prediction)
    grid_search["K"].append(k)
    grid_search["Accuracy"].append(accuracy)

grid_search = pd.DataFrame(grid_search)
grid_search.sort_values("Accuracy", ascending=False).head()

Unnamed: 0,K,Accuracy
0,3,94.0
1,5,94.0
2,7,92.75


# Classification on Test Dataset

In [56]:
opt_k = 3
prediction = []
test_dataset = test_df.values
test_label_dataset = test_label_df.values

for x in range(len(test_dataset)):
    neighbours = getneighbours(train_dataset,test_dataset[x,:], False, opt_k,proximity_measure = manhattandistance)
    result = getresponse(neighbours)
    prediction.append(result)
    
Precisions, Recalls, F1_score, confusion_matrix, Accuracy = get_performance_measure(test_label_dataset,prediction)

for i in range(0,10):
    print("Class %d:" %i)
    print("Precision= %f" %Precisions[i])
    print("Recall= %f" %Recalls[i])
    print("F1_score= %f\n" %F1_score[i])
    
print("Accuracy %f" %Accuracy)

Class 0:
Precision= 0.097097
Recall= 0.097097
F1_score= 0.097097

Class 1:
Precision= 0.116116
Recall= 0.116116
F1_score= 0.116116

Class 2:
Precision= 0.105105
Recall= 0.105105
F1_score= 0.105105

Class 3:
Precision= 0.117117
Recall= 0.117117
F1_score= 0.117117

Class 4:
Precision= 0.088088
Recall= 0.088088
F1_score= 0.088088

Class 5:
Precision= 0.078078
Recall= 0.078078
F1_score= 0.078078

Class 6:
Precision= 0.101101
Recall= 0.101101
F1_score= 0.101101

Class 7:
Precision= 0.098098
Recall= 0.098098
F1_score= 0.098098

Class 8:
Precision= 0.086086
Recall= 0.086086
F1_score= 0.086086

Class 9:
Precision= 0.113113
Recall= 0.113113
F1_score= 0.113113

Accuracy 92.192192


In [57]:
print(confusion_matrix,"\n")

[[97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113], [97, 116, 105, 117, 88, 78, 101, 98, 86, 113]] 

