# KNN code

In [14]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics

from sklearn.metrics import accuracy_score

In [18]:
# Load the engineered train / test feature tables
train = pd.read_csv('../data/generated/Knn_train_eng.csv')
test = pd.read_csv('../data/generated/Knn_test_eng.csv')

# Split the target off the training frame: pop() returns the 'label'
# column and removes it from `train` in one step
train_y = train.pop('label')

# Sanity check: feature matrix shape, then label vector shape
print(train.shape)
print(train_y.shape)

(24840, 67)
(24840,)


In [19]:
def NormalizeData(train, CVorTest, PCA_comp = None, ScaleCat = False):
    '''
    Standardize features with a StandardScaler and optionally apply PCA.

    The scaler (and the PCA, when requested) is fitted on `train` only and
    then applied to both `train` and `CVorTest`.  The input dataframes are
    NOT modified: all work is done on copies.

    train:
        dataframe used to fit the scaler / PCA; it is also transformed
    CVorTest:
        dataframe that is only transformed by the fitted scaler / PCA;
        it must contain every column that gets scaled
    PCA_comp:
        value passed to PCA() as number of components, if None PCA is not applied
    ScaleCat:
        if True every column is scaled; if False the columns whose name
        starts with 'Cat_' are left untouched

    Returns a tuple (train_array, CVorTest_array) of numpy arrays.
    '''

    # Copy first so the caller's frames keep their raw values and the
    # function can be called again on the same inputs.
    train = train.copy()
    CVorTest = CVorTest.copy()

    if ScaleCat:
        scale_columns = list(train.columns)
    else:
        scale_columns = list(train.columns[~train.columns.str.startswith('Cat_')])

    #perform feature scaling
    print('Nb columns to scale :', len(scale_columns))
    if scale_columns:
        # Nothing to fit when no column is selected, hence the guard.
        sc = StandardScaler()
        # Cast to float and replace the columns wholesale: writing float
        # results into integer columns through .loc relies on silent
        # upcasting, which recent pandas versions deprecate.
        train[scale_columns] = sc.fit_transform(train[scale_columns].astype(float))
        CVorTest[scale_columns] = sc.transform(CVorTest[scale_columns].astype(float))

    if PCA_comp is None:
        return train.values, CVorTest.values

    # Project both sets onto the components learned from the training set
    pca = PCA(PCA_comp)
    train = pca.fit_transform(train)
    CVorTest = pca.transform(CVorTest)

    return train, CVorTest

In [20]:
# Standardize the features (no PCA, 'Cat_' columns left unscaled).
# `train` and `test` are rebound to the returned numpy arrays.
train, test = NormalizeData(train, test, PCA_comp=None, ScaleCat=False)

Nb columns to scale : 21


In [23]:
# Perform KNN: fit on the full training set, then predict the test set
n_samples = len(train)
neigh = KNeighborsClassifier(
    n_neighbors=4,
    weights='distance',
    algorithm='auto',
)
pred_y = neigh.fit(train, train_y).predict(test)

# Cross-validation: 10 random 80/20 shuffle splits, scored with macro F1
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
knn_model = cross_val_score(
    estimator=neigh,
    X=train,
    y=train_y,
    scoring='f1_macro',
    cv=cv,
)
# knn_model holds one score per split; display their mean
knn_model.mean()

0.930995887301086

In [None]:
# Hyper-parameter tuning
# Grid search with cross-validation over the KNN settings, using the `cv`
# splitter and the macro-F1 metric
from sklearn.model_selection import GridSearchCV
# NOTE(review): [1, 50] evaluates only k=1 and k=50 -- confirm that a range
# of k values was not intended here.
parameters = [{
    'n_neighbors': [1, 50],
    'weights': ['distance', 'uniform'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}]
grid_search = GridSearchCV(neigh, parameters, scoring='f1_macro', cv=cv).fit(train, train_y)
# The scoring is 'f1_macro', so `best_accuracy` actually holds a macro-F1 score
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print(best_accuracy, best_parameters)

In [122]:
## Write the test-set predictions to the submission file;
## the row index is exported as the 'Id' column
pred_df = pd.DataFrame({'label': pred_y})
pred_df.to_csv("Knn__submission.csv", index_label='Id', index=True)
pred_y.shape

(10647,)