In [1]:
import sys                       # for testing use only
import os                        # for testing use only
from datetime import datetime    # for testing use only
import random                    # for testing use only
import hashlib                   # for testing use only
import pandas as pd
import numpy as np
import math 
import statistics

# --------------------------------------
# Display the value of every top-level expression in a cell (not only the
# last one), so a single cell can show several outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# --------------------------------------

In [5]:
def get_assignment_params(student_id):
    """Derive the per-student assignment configuration from a student id.

    Four integer states are produced by repeatedly MD5-hashing: the first
    hashes ``str(student_id)``, each following one hashes the decimal text of
    the previous state. Every state seeds the global ``random`` generator
    before one choice is drawn, so the same id always yields the same result.

    Parameters
    ----------
    student_id : any value; only its ``str()`` representation is hashed.

    Returns
    -------
    dict with the keys ``num_train1``, ``num_train2``, ``y_col``,
    ``binary_cols`` (the binary columns that were not picked as ``y_col``),
    ``score_cols``, ``o_cols``, ``scale_type``, ``dist_method`` and
    ``eval_metric``.

    Notes
    -----
    Side effect: the module-level ``random`` generator is re-seeded four times.
    """
    num_modulo = 2 ** 31 - 1

    # Hash chain: state_1 = md5(id), state_n = md5(state_{n-1}), each reduced
    # modulo `num_modulo`.
    states = []
    hash_input = student_id
    for _ in range(4):
        digest = hashlib.md5(str(hash_input).encode('utf-8')).hexdigest()
        hash_input = int(digest, 16) % num_modulo
        states.append(hash_input)
    y_col_state, scale_state, dist_state, metric_state = states

    scale_types = ['standardization', 'minmax[0,1]', 'minmax[-1,1]']
    dist_methods = ['manhattan', 'euclidean', 'chebyshev']
    eval_metrics = ['accuracy', 'error_rate', 'precision', 'recall']

    score_cols = ['math_score', 'reading_score', 'writing_score']
    o_cols = ['social_grp', 'parent_edu']
    # A plain list rather than a NumPy array: some CPython 3.11 releases apply
    # a truthiness test inside random.choice, which raises for multi-element
    # arrays. The element picked for a given seed is the same either way, and
    # the returned names are ordinary `str` objects.
    binary_cols = ['gender_num', 'lunch_type', 'has_preparations']

    assignment_params = {}

    random.seed(y_col_state)
    y_col = random.choice(binary_cols)

    assignment_params['num_train1'] = 800
    assignment_params['num_train2'] = 80

    assignment_params['y_col'] = y_col
    assignment_params['binary_cols'] = [col for col in binary_cols if col != y_col]
    assignment_params['score_cols'] = score_cols
    assignment_params['o_cols'] = o_cols

    random.seed(scale_state)
    assignment_params['scale_type'] = random.choice(scale_types)

    random.seed(dist_state)
    assignment_params['dist_method'] = random.choice(dist_methods)

    random.seed(metric_state)
    assignment_params['eval_metric'] = random.choice(eval_metrics)

    return assignment_params
# ---------------------------
# Student identity used to derive the personal assignment parameters.
# NOTE(review): a tuple assignment of two literals cannot raise, so the guard
# below never fires; presumably it is left over from a version that called
# student-info functions here - confirm before removing.
try:    
    student_name, student_id = 'Desislava Marvakov', 317575173
except Exception as e:
    print ('You probably did not implement student-info functions, \nerror Message:',str(e))
    raise
# Both checks must hold. With `or`, a wrong type for one of the two values
# passed silently, contradicting the assertion message.
assert type(student_name) is str and type(student_id) is int, "name is not a string or id is not an integer"
# ---------------------------
assignment_params = get_assignment_params(student_id)
# ---------------------------
# Summary of the tasks, including the variants selected for this student id.
print ('Assignment 2 - KNN (- 10 points for the assignments):')
print ('-----------------------')
print ('What do you need to implement?')
print ('1.   imports - run only (to load python modules)')
print ('2.   methods: my_name, my_id - Your personal information (- 0.5 points for the test)')
print ('3.   personal implementation taks - run only')
print ('4.   methods: load_dataset - load the dataset (- 1 points for the tests)')
print ("5.   methods: scale_fit_transform, scale_transform_for_test \n\t- the scaling type YOU NEED to implement is: '" + assignment_params['scale_type'] + "' (- 2.5 points for the tests)")
print ("6. methods: calc_distance \n\t- the distance method YOU NEED to implement is: '" + assignment_params['dist_method'] + "' (- 2.5 points for the tests)")
print ("7. methods: predict - KNN predict main flow" + " (- 2 points for the tests)")
print ("8.   methods: eval_performance \n\t- the evaluation metric YOU NEED to implement is: '" + assignment_params['eval_metric'] + "' (- 1.5 points for the tests)")

Assignment 2 - KNN (- 10 points for the assignments):
-----------------------
What do you need to implement?
1.   imports - run only (to load python modules)
2.   methods: my_name, my_id - Your personal information (- 0.5 points for the test)
3.   personal implementation taks - run only
4.   methods: load_dataset - load the dataset (- 1 points for the tests)
5.   methods: scale_fit_transform, scale_transform_for_test 
	- the scaling type YOU NEED to implement is: 'minmax[0,1]' (- 2.5 points for the tests)
6. methods: calc_distance 
	- the distance method YOU NEED to implement is: 'manhattan' (- 2.5 points for the tests)
7. methods: predict - KNN predict main flow (- 2 points for the tests)
8.   methods: eval_performance 
	- the evaluation metric YOU NEED to implement is: 'error_rate' (- 1.5 points for the tests)


In [6]:
def load_dataset(file_name, category_col_name):
    """Read a CSV file and split it into features and label.

    Parameters
    ----------
    file_name : path (or file-like object) passed straight to ``pd.read_csv``.
    category_col_name : name of the column that holds the label.

    Returns
    -------
    (X, y) : ``X`` is the loaded DataFrame without the label column,
    ``y`` is the label column as a Series.
    """
    dataset = pd.read_csv(file_name)
    features = dataset.drop(columns=[category_col_name])
    labels = dataset[category_col_name]
    return features, labels

In [7]:
# Load the shuffled dataset with this student's target column, then show the
# types, a preview and the shapes of what load_dataset returned.
file_name = os.path.join('data', 'Students-Performance_shuffled_1000.csv')
assignment_params = get_assignment_params(317575173)
X, y = load_dataset(file_name, assignment_params['y_col'])
print(type(X))
print(type(y))
X.head()
y
X.shape
y.shape

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


Unnamed: 0,social_grp,gender_num,parent_edu,lunch_type,math_score,reading_score,writing_score
0,3,1,4,1,60,68,68
1,4,0,2,0,70,65,60
2,3,1,3,1,46,54,52
3,1,1,2,0,56,74,74
4,1,0,2,1,62,65,62


0      0
1      0
2      0
3      1
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: has_preparations, Length: 1000, dtype: int64

(1000, 7)

(1000,)

## Implementing Min-Max Scaling

In [9]:
def scale_fit_transform(X_train):
    """Fit min-max [0, 1] scaling on the training features and apply it.

    Parameters
    ----------
    X_train : DataFrame of numeric feature columns. It is not modified.

    Returns
    -------
    (scaling_info, X_train_scaled)
        ``scaling_info`` is a DataFrame indexed by column name with the
        columns ``'min'`` and ``'max'``. ``X_train_scaled`` is a copy of
        ``X_train`` in which each of those columns is replaced by
        ``(x - min) / (max - min)``.

    Notes
    -----
    A constant column (``max == min``) is scaled to 0.0 instead of producing
    NaN from a division by zero.
    """
    min_val = X_train.min(axis=0)  # Series: column name -> minimum
    max_val = X_train.max(axis=0)  # Series: column name -> maximum
    scaling_info = pd.concat([min_val.rename('min'), max_val.rename('max')], axis=1)

    X_train_scaled = X_train.copy()
    for col in scaling_info.index:
        col_min = min_val[col]
        col_range = max_val[col] - col_min
        if col_range == 0:
            # Constant column: divide by 1 so every value maps to 0.0.
            col_range = 1
        # One whole-column operation instead of a per-cell Python loop.
        X_train_scaled[col] = (X_train[col] - col_min) / col_range
    return scaling_info, X_train_scaled

In [10]:
# Split the dataset at row `num_train1` into a training part and a test part,
# fit the scaler on the training part and show the fitted min/max table and
# the scaled training features.
file_name = os.path.join('data', 'Students-Performance_shuffled_1000.csv')
y_col = 'has_preparations'
X, y = load_dataset(file_name, y_col)
X_train = X.iloc[:assignment_params['num_train1']]
y_train = y.iloc[:assignment_params['num_train1']]
X_test = X.iloc[assignment_params['num_train1']:]
y_test = y.iloc[assignment_params['num_train1']:]
trained_scaling_info, X_train_scaled = scale_fit_transform(X_train)
trained_scaling_info
X_train_scaled

Unnamed: 0,min,max
social_grp,1,5
gender_num,0,1
parent_edu,0,5
lunch_type,0,1
math_score,16,100
reading_score,23,100
writing_score,24,100


Unnamed: 0,social_grp,gender_num,parent_edu,lunch_type,math_score,reading_score,writing_score
0,0.50,1.0,0.8,1.0,0.523810,0.584416,0.578947
1,0.75,0.0,0.4,0.0,0.642857,0.545455,0.473684
2,0.50,1.0,0.6,1.0,0.357143,0.402597,0.368421
3,0.00,1.0,0.4,0.0,0.476190,0.662338,0.657895
4,0.00,0.0,0.4,1.0,0.547619,0.545455,0.500000
...,...,...,...,...,...,...,...
795,0.75,1.0,0.2,0.0,0.452381,0.506494,0.605263
796,1.00,1.0,0.6,0.0,0.654762,0.597403,0.552632
797,0.00,0.0,0.2,1.0,0.738095,0.545455,0.552632
798,1.00,1.0,0.8,1.0,0.714286,0.805195,0.710526


In [11]:
def scale_transform_for_test(trained_scaling_info,  X_test):
    """Apply an already-fitted min-max scaling to another feature frame.

    Parameters
    ----------
    trained_scaling_info : DataFrame indexed by column name whose first column
        is the fitted minimum and whose second column is the fitted maximum
        (read by position, as returned by ``scale_fit_transform``).
    X_test : DataFrame containing every column listed in
        ``trained_scaling_info``. It is not modified.

    Returns
    -------
    A copy of ``X_test`` in which every column listed in
    ``trained_scaling_info`` is replaced by ``(x - min) / (max - min)``.
    Columns not listed there are returned unchanged. Values outside the fitted
    range map outside [0, 1].

    Raises
    ------
    KeyError
        If a column listed in ``trained_scaling_info`` is missing from ``X_test``.

    Notes
    -----
    A column whose fitted range is zero is divided by 1 instead, which avoids
    NaN/inf from a division by zero.
    """
    X_test_scaled = X_test.copy()
    fitted_min = trained_scaling_info.iloc[:, 0]
    fitted_max = trained_scaling_info.iloc[:, 1]
    for col, col_min, col_max in zip(trained_scaling_info.index, fitted_min, fitted_max):
        col_range = col_max - col_min
        if col_range == 0:
            col_range = 1
        # One whole-column operation instead of a per-cell Python loop.
        X_test_scaled[col] = (X_test[col] - col_min) / col_range
    return X_test_scaled

In [12]:
# Same split as for fitting; the scaling learned on the training part is then
# applied to the held-out part and the scaled test features are shown.
file_name = os.path.join('data', 'Students-Performance_shuffled_1000.csv')
y_col = 'has_preparations'
X, y = load_dataset(file_name, y_col)
X_train = X.iloc[:assignment_params['num_train1']]
y_train = y.iloc[:assignment_params['num_train1']]
X_test = X.iloc[assignment_params['num_train1']:]
y_test = y.iloc[assignment_params['num_train1']:]
trained_scaling_info, X_train_scaled = scale_fit_transform(X_train)
X_test_scaled = scale_transform_for_test(trained_scaling_info, X_test)
X_test_scaled

Unnamed: 0,social_grp,gender_num,parent_edu,lunch_type,math_score,reading_score,writing_score
800,0.25,0.0,0.6,1.0,0.642857,0.558442,0.618421
801,1.00,1.0,0.0,1.0,0.500000,0.558442,0.486842
802,0.50,0.0,0.0,0.0,0.273810,0.259740,0.197368
803,0.50,1.0,0.2,0.0,0.607143,0.727273,0.789474
804,0.50,0.0,0.6,1.0,0.750000,0.688312,0.657895
...,...,...,...,...,...,...,...
995,0.75,0.0,0.0,1.0,0.821429,0.779221,0.671053
996,0.00,0.0,0.4,0.0,0.500000,0.480519,0.434211
997,0.50,1.0,0.6,1.0,0.369048,0.467532,0.368421
998,1.00,0.0,0.2,1.0,0.809524,0.649351,0.578947


## Calculate Manhattan Distance

In [13]:
def calc_distance(X_test,X_train):
    """Manhattan (L1) distance between every test row and every training row.

    Columns are matched by position, not by name.

    Parameters
    ----------
    X_test : DataFrame of numeric features, one row per query point.
    X_train : DataFrame of numeric features with the same number of columns.

    Returns
    -------
    DataFrame of floats with ``X_test.index`` as rows and ``X_train.index`` as
    columns; cell ``(i, j)`` is ``sum(|X_test[i] - X_train[j]|)``.

    Raises
    ------
    ValueError
        If the two frames have a different number of columns.

    Notes
    -----
    The computation materialises an array of
    ``len(X_test) * len(X_train) * n_features`` floats.
    """
    test_values = np.asarray(X_test, dtype=float)
    train_values = np.asarray(X_train, dtype=float)
    if test_values.shape[1] != train_values.shape[1]:
        # Checked explicitly because broadcasting would silently stretch a
        # single-column frame across the other frame's columns.
        raise ValueError(
            "X_test and X_train must have the same number of columns, got "
            + str(test_values.shape[1]) + " and " + str(train_values.shape[1])
        )
    # (n_test, 1, d) - (1, n_train, d) -> (n_test, n_train, d)
    abs_diff = np.abs(test_values[:, np.newaxis, :] - train_values[np.newaxis, :, :])
    return pd.DataFrame(abs_diff.sum(axis=2), index=X_test.index, columns=X_train.index)

In [14]:
# Use the already-scaled dataset: the first `num_train2` rows are the training
# set, the remaining rows are the queries. Preview the distance table.
file_name = os.path.join('data', 'Students-Performance_scaled_100.csv')
y_col = 'category'

X, y = load_dataset(file_name, y_col)
X_train = X.iloc[:assignment_params['num_train2']]
y_train = y.iloc[:assignment_params['num_train2']]
X_test = X.iloc[assignment_params['num_train2']:]
y_test = y.iloc[assignment_params['num_train2']:]
dist_dataframe = calc_distance(X_test, X_train)
dist_dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
80,0.469759,0.301399,0.257425,0.498152,1.272627,0.88731,0.577501,0.414442,0.677025,0.263813,...,0.5091,0.562126,0.7743,1.256165,0.117117,0.489816,0.695503,0.771118,0.662743,0.387537
81,1.184687,1.016327,0.858513,1.148359,1.773452,0.779424,1.078326,1.12937,1.164768,0.584652,...,1.013804,1.062952,1.275125,1.149948,0.597811,1.204744,1.196329,0.995194,0.764622,0.651853
82,0.77551,0.986278,0.467822,0.739182,0.915294,0.440578,0.445303,1.00073,0.315647,0.72515,...,1.217607,0.860777,0.772564,1.301588,0.605327,0.795567,0.774854,0.183444,0.102916,0.599751
83,0.787953,0.646062,0.640634,0.799528,1.786417,1.222921,1.091291,0.731632,1.177732,0.318968,...,0.794302,1.075916,1.28809,1.480458,0.454024,0.865706,1.209293,1.106729,0.998354,0.543804
84,0.441539,0.322767,0.468616,0.709343,0.98822,0.842314,0.293094,0.386222,0.632029,0.54822,...,0.554096,0.277719,0.489893,1.044974,0.344173,0.675067,0.411096,0.726122,0.617747,0.598727


## KNN predict - main flow

In [15]:
def predict(X_test, X_train, y_train, k):
    """Classify each test row by a majority vote among its k nearest neighbours.

    Distances come from ``calc_distance``. For every test row the ``k``
    training rows with the smallest distance are selected (equal distances
    keep training order) and the most frequent label among them is predicted.
    When several labels share the highest count, the one whose first
    occurrence is closest to the test row wins.

    Parameters
    ----------
    X_test : DataFrame of query points.
    X_train : DataFrame of training points.
    y_train : Series of training labels, indexed like ``X_train``.
    k : number of neighbours, ``1 <= k <= len(X_train)``.

    Returns
    -------
    Series of predicted labels indexed by ``X_test.index``.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1 .. len(X_train)``.
    """
    distances = calc_distance(X_test, X_train)
    n_train = distances.shape[1]
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and the number of training rows ("
            + str(n_train) + "), got " + str(k)
        )

    # Plain float array so sorting does not depend on the frame's dtype.
    dist_values = np.asarray(distances, dtype=float)
    # Labels in the same order as the distance columns.
    train_labels = y_train.loc[distances.columns].tolist()

    predictions = []
    for row in dist_values:
        # Stable sort: equally distant neighbours keep their training order.
        nearest_positions = np.argsort(row, kind='stable')[:k]
        # Vote over the neighbours' LABELS (not over their distances).
        votes = {}
        for position in nearest_positions:
            label = train_labels[position]
            votes[label] = votes.get(label, 0) + 1
        # max() returns the first key with the highest count; keys were
        # inserted nearest-first, which gives the documented tie-break.
        predictions.append(max(votes, key=votes.get))

    y_predicted = pd.Series(data=predictions, index=X_test.index)
    return y_predicted

In [16]:
# Predict the labels of the held-out rows of the scaled dataset with k = 1
# and show the predictions.
file_name = os.path.join('data', 'Students-Performance_scaled_100.csv')
y_col = 'category'

X, y = load_dataset(file_name, y_col)
X_train = X.iloc[:assignment_params['num_train2']]
y_train = y.iloc[:assignment_params['num_train2']]
X_test = X.iloc[assignment_params['num_train2']:]
y_test = y.iloc[assignment_params['num_train2']:]
y_predicted = predict(X_test, X_train, y_train, 1)
y_predicted

80    0
81    0
82    0
83    1
84    0
85    1
86    1
87    0
88    0
89    1
90    0
91    1
92    0
93    0
94    0
95    0
96    0
97    0
98    0
99    0
dtype: int64

## Error Rate Evaluation

In [17]:
def eval_performance(y_predicted,y_test):
    """Error rate of the predictions: the share of positions that disagree.

    Values are compared by position, not by index label.

    Parameters
    ----------
    y_predicted : Series of predicted labels.
    y_test : Series of true labels, same length as ``y_predicted``.

    Returns
    -------
    float, ``1 - (number of matching positions / number of predictions)``.

    Raises
    ------
    ValueError
        If the two series differ in length or are empty.
    """
    predicted = y_predicted.tolist()
    actual = y_test.tolist()
    if len(predicted) != len(actual):
        # zip() would silently drop the surplus values and skew the rate.
        raise ValueError(
            "y_predicted and y_test must have the same length, got "
            + str(len(predicted)) + " and " + str(len(actual))
        )
    if not predicted:
        raise ValueError("cannot compute an error rate for empty input")

    correct_pred = sum(1 for pred_val, test_val in zip(predicted, actual) if pred_val == test_val)
    accuracy = correct_pred / len(predicted)
    error_rate = 1 - accuracy
    return error_rate

In [18]:
# Hand-made example: ten predictions against ten true labels that share one
# index. `file_name` and `y_col` are assigned here but not read in this cell.
file_name = os.path.join('data', 'Students-Performance_scaled_100.csv')
y_col = 'category'
indx = [3, 1, 6, 5, 7, 44, 32, 14, 10, 11]
y_pred = pd.Series([1, 1, 1, 0, 1, 0, 1, 0, 1, 0], index=indx)
y_test = pd.Series([0, 0, 1, 1, 1, 0, 0, 0, 1, 0], index=indx)
eval_res = eval_performance(y_pred, y_test)
eval_res

0.4