In [1]:
# campus recruitment data
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

# load data
dataframe = pd.read_csv('Placement_Data_Full_Class.csv')
dataframe.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [2]:
# drop attributes not necessary
dataframe.drop(['sl_no', 'hsc_s', 'hsc_b', 'ssc_b', 'ssc_p', 'hsc_p', 'specialisation', 'etest_p', 'workex'], axis=1, inplace=True)
display(dataframe)

Unnamed: 0,gender,degree_p,degree_t,mba_p,status,salary
0,M,58.00,Sci&Tech,58.80,Placed,270000.0
1,M,77.48,Sci&Tech,66.28,Placed,200000.0
2,M,64.00,Comm&Mgmt,57.80,Placed,250000.0
3,M,52.00,Sci&Tech,59.43,Not Placed,
4,M,73.30,Comm&Mgmt,55.50,Placed,425000.0
...,...,...,...,...,...,...
210,M,77.60,Comm&Mgmt,74.49,Placed,400000.0
211,M,72.00,Sci&Tech,53.62,Placed,275000.0
212,M,73.00,Comm&Mgmt,69.72,Placed,295000.0
213,F,58.00,Comm&Mgmt,60.23,Placed,204000.0


In [3]:
# label encoding
dataframe.replace('M', 0.0, inplace=True)
dataframe.replace('F', 1.0, inplace=True)
dataframe.replace('Sci&Tech', 0.0, inplace=True)
dataframe.replace('Comm&Mgmt', 1.0, inplace=True)
dataframe.replace('Others', 2.0, inplace=True)
dataframe.salary.fillna(value=0,inplace=True)
dataframe.replace('Placed', 0.0, inplace=True)
dataframe.replace('Not Placed', 1.0, inplace=True)
display(dataframe)

Unnamed: 0,gender,degree_p,degree_t,mba_p,status,salary
0,0.0,58.00,0.0,58.80,0.0,270000.0
1,0.0,77.48,0.0,66.28,0.0,200000.0
2,0.0,64.00,1.0,57.80,0.0,250000.0
3,0.0,52.00,0.0,59.43,1.0,0.0
4,0.0,73.30,1.0,55.50,0.0,425000.0
...,...,...,...,...,...,...
210,0.0,77.60,1.0,74.49,0.0,400000.0
211,0.0,72.00,0.0,53.62,0.0,275000.0
212,0.0,73.00,1.0,69.72,0.0,295000.0
213,1.0,58.00,1.0,60.23,0.0,204000.0


In [4]:
features = dataframe.values[:, 1:]
labels = dataframe.values[:, -1].astype(int)
print('array of labels: shape ' + str(np.shape(labels)))
print('array of feature matrix: shape ' + str(np.shape(features)))

array of labels: shape (215,)
array of feature matrix: shape (215, 5)


In [5]:
val_features, train_features = np.split(features, [int(0.2 * features.shape[0])])
val_labels, train_labels = np.split(labels, [int(0.2 * labels.shape[0])])

print(np.shape(train_features))
print(np.shape(val_features))
print(np.shape(train_labels))
print(np.shape(val_labels))

(172, 5)
(43, 5)
(172,)
(43,)


In [6]:
def KNN(train_features, train_labels, test_features, k=2):
    vali_pred = []
    for i in tqdm(range(test_features.shape[0])):
        x = test_features[i, :]  
        distances = np.sum(np.abs(x - train_features), axis=1)
        topk_idx = np.argpartition(distances, k)[:k]
        topk_labels = list(train_labels[topk_idx])
        pred = max(topk_labels, key=topk_labels.count)
        vali_pred.append(pred)
    return np.array(vali_pred)

val_preds = KNN(train_features, train_labels, val_features)
print(val_preds)

100%|████████████████████████████████████████| 43/43 [00:00<00:00, 13246.79it/s]

[270000 200000 250000      0 420000      0      0 252000 230000      0
 260000 250000      0 218000      0 200000 300000      0      0 236000
 265000 400000 360000 300000 360000      0 240000 265000 350000      0
 250000      0 280000 260000      0 300000      0 336000 240000 420000
 285000      0      0]





In [7]:
def evaluation(true, pred):
    acc = np.sum(pred == true) / len(true)

    micro_TP = 0
    micro_FP = 0
    micro_FN = 0
    micro_TN = 0
    macro_F1 = []

    for c in range(6):
        c_label_pos = (true == c)
        c_label_neg = ~c_label_pos
        c_pred_pos = (pred == c)
        c_pred_neg = ~c_pred_pos

        TP = np.sum(c_label_pos * c_pred_pos)
        FP = np.sum(c_label_neg * c_pred_pos)
        FN = np.sum(c_label_pos * c_pred_neg)
        TN = np.sum(c_label_neg * c_pred_neg)

        if TP == 0:
            precision = 0
            recall = 0
            F1 = 0
        else:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            F1 = 2 * precision * recall / (precision + recall)
        macro_F1.append(F1)

        micro_TP += TP
        micro_FP += FP
        micro_TN += TN
        micro_FN += FN

    macro_F1 = np.mean(macro_F1)
    micro_precision = micro_TP / (micro_TP + micro_FP)
    micro_recall = micro_TP / (micro_TP + micro_FN)
    micro_F1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)

    print('accuracy = %.6f' % acc)
    print('macro F1 = %.6f' % macro_F1)
    print('micro F1 = %.6f' % micro_F1)
    
evaluation(val_labels, val_preds)

accuracy = 0.837209
macro F1 = 0.166667
micro F1 = 1.000000
