In [2]:
import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection

In [3]:
df_train_mean_encoding = pd.read_csv('../Feature_Encoding/data/train_mean_encoding.csv')
df_test_mean_encoding = pd.read_csv('../Feature_Encoding/data/test_mean_encoding.csv')
df_train_binary_encoding = pd.read_csv('../Feature_Encoding/data/train_binary_encoding.csv')
df_test_binary_encoding = pd.read_csv('../Feature_Encoding/data/test_binary_encoding.csv')
train = pd.read_csv('../Feature_Engineering/data/other-cleaned_train.csv')

In [4]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [5]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [6]:
#Realiza busqueda completa combinando los parametros
def knn(x_train, y_train, x_validation, y_validation):
    knn_classifier = KNeighborsClassifier()
    params_knn = {'weights':['uniform','distance'],'leaf_size':[10,15,30,45],'n_neighbors':[5,10,25,50],'p':[1,2,3,4]}    
    knn_gs = GridSearchCV(knn_classifier, params_knn, cv=2,verbose=3)
    knn_gs.fit(x_train, y_train)
    knn_best = knn_gs.best_estimator_
    print(knn_gs.best_params_)
    print('knn: {}'.format(knn_best.score(x_validation, y_validation)))
    return knn_best

In [7]:
y = train.Target
x_train_mean_encoding, x_validation_mean_encoding, y_train_mean_encoding, y_validation_mean_encoding = train_test_split(df_train_mean_encoding, y, test_size=0.3, stratify=y)
x_train_binary_encoding, x_validation_binary_encoding, y_train_binary_encoding, y_validation_binary_encoding = train_test_split(df_train_binary_encoding, y, test_size=0.3, stratify=y)

In [8]:
#Para normalizar las columnas
normalized_mean = normalize(df_train_mean_encoding,axis=0)
normalized_bin = normalize(df_train_binary_encoding,axis=0)

y = train.Target
normalized_x_train_mean_encoding, normalized_x_validation_mean_encoding, normalized_y_train_mean_encoding, normalized_y_validation_mean_encoding = train_test_split(normalized_mean, y, test_size=0.3, stratify=y)
normalized_x_train_binary_encoding, normalized_x_validation_binary_encoding, normalized_y_train_binary_encoding, normalized_y_validation_binary_encoding = train_test_split(normalized_bin, y, test_size=0.3, stratify=y)

## Mean encoding

In [9]:
knn_mean_encoding = knn(x_train_mean_encoding, y_train_mean_encoding, x_validation_mean_encoding, y_validation_mean_encoding)
test_model(knn_mean_encoding,x_validation_mean_encoding,y_validation_mean_encoding)
cross_val(knn_mean_encoding, x_train_mean_encoding, y_train_mean_encoding)

Fitting 2 folds for each of 128 candidates, totalling 256 fits
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.728, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.723, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.741, total=   0.1s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.739, total=   0.1s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.721, total=   0.2s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.714, total=   0.2s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.734, total=   0.1s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.732, total=   0.1s
[CV] leaf_size=10, n_neighbors=5, p=3, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=3, weights=uniform, score=0.722, total=   0.4s
[CV] leaf_size=10, n_neighbors=5, p=3, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=3, weights=uniform, score=0.71

[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed:  1.7min finished


knn: 0.7857847976307996
Accuracy: 78.58%, Logloss: 0.46
0.7767805343661769


In [10]:
normalized_knn_mean_encoding = knn(normalized_x_train_mean_encoding, normalized_y_train_mean_encoding, normalized_x_validation_mean_encoding, normalized_y_validation_mean_encoding)
test_model(normalized_knn_mean_encoding,normalized_x_validation_mean_encoding,normalized_y_validation_mean_encoding)
cross_val(normalized_knn_mean_encoding, normalized_x_train_mean_encoding, normalized_y_train_mean_encoding)

Fitting 2 folds for each of 128 candidates, totalling 256 fits
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.872, total=   2.4s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.866, total=   2.2s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.6s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.887, total=   2.3s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.884, total=   2.1s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.861, total=   1.8s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.854, total=   1.7s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.873, total=   1.6s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.872, total=   1.5s
[CV] leaf_size=10, n_neighbors=5, p=3, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=3, weights=uniform, score=0.8

[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed: 25.5min finished


knn: 0.906614017769003
Accuracy: 90.66%, Logloss: 0.83
0.8982909820247713


## Binary encoding

In [11]:
knn_bin_encoding = knn(x_train_binary_encoding, y_train_binary_encoding, x_validation_binary_encoding, y_validation_binary_encoding)
test_model(knn_bin_encoding,x_validation_binary_encoding,y_validation_binary_encoding)
cross_val(knn_bin_encoding, x_train_binary_encoding, y_train_binary_encoding)

Fitting 2 folds for each of 128 candidates, totalling 256 fits
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.725, total=   0.4s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.728, total=   0.4s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.745, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.750, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.711, total=   0.4s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.716, total=   0.4s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.732, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.738, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=3, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=3, weights=uniform, score=0.7

[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed:  3.7min finished


knn: 0.7893385982230997
Accuracy: 78.93%, Logloss: 0.44
0.7742420694677463


In [12]:
normalized_knn_bin_encoding = knn(normalized_x_train_binary_encoding, normalized_y_train_binary_encoding, normalized_x_validation_binary_encoding, normalized_y_validation_binary_encoding)
test_model(normalized_knn_bin_encoding,normalized_x_validation_binary_encoding,normalized_y_validation_binary_encoding)
cross_val(normalized_knn_bin_encoding, normalized_x_train_binary_encoding, normalized_y_train_binary_encoding)

Fitting 2 folds for each of 128 candidates, totalling 256 fits
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.831, total=  12.0s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.0s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.826, total=  13.0s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.0s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.841, total=  11.8s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.838, total=  12.8s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.831, total=  13.3s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.825, total=  14.0s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.840, total=  15.0s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.835, total=  13.9s
[CV] leaf_size=10, n_neighbors=5, p=3, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=3, weights=uniform, score=0.8

[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed: 172.4min finished


knn: 0.8604146100691017
Accuracy: 86.04%, Logloss: 0.55
0.8586898411995992


In [15]:
y_pred_knn_mean = knn_mean_encoding.predict_proba(df_test_mean_encoding)[:,1]
submission_knn_mean = pd.DataFrame(data={'Opportunity_ID':df_test_mean_encoding['Opportunity_ID'], 'Target': y_pred_knn_mean})
submission_knn_mean = submission_knn_mean.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_knn_mean.to_csv('submits/mean_knn.csv', index=False)

In [17]:
y_pred_knn_mean_norm = normalized_knn_mean_encoding.predict_proba(df_test_mean_encoding)[:,1]
submission_knn_mean_norm = pd.DataFrame(data={'Opportunity_ID':df_test_mean_encoding['Opportunity_ID'], 'Target': y_pred_knn_mean_norm})
submission_knn_mean_norm = submission_knn_mean_norm.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_knn_mean_norm.to_csv('submits/mean_knn_norm.csv', index=False)

In [18]:
y_pred_knn_binary = knn_bin_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_knn_bin = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred_knn_binary})
submission_knn_bin = submission_knn_bin.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_knn_bin.to_csv('submits/bin_knn.csv', index=False)

In [19]:
y_pred_knn_binary_norm = normalized_knn_bin_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_knn_bin_norm = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred_knn_binary_norm})
submission_knn_bin_norm = submission_knn_bin_norm.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_knn_bin_norm.to_csv('submits/bin_knn_nor.csv', index=False)