In [1]:
import pandas as pd

from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

from sklearn import model_selection, metrics, preprocessing, neighbors

In [2]:
df_train_binary_encoding = pd.read_csv('../../Feature_Encoding/data/train_binary_encoding.csv')
df_test_binary_encoding = pd.read_csv('../../Feature_Encoding/data/test_binary_encoding.csv')
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')

In [3]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [4]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = metrics.log_loss(y_test, predictions)
    accuracy = metrics.accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [5]:
#Realiza busqueda completa combinando los parametros
def knn(x_train, y_train, x_validation, y_validation):
    knn_classifier = neighbors.KNeighborsClassifier()
    params_knn = {'weights':['uniform','distance'],'leaf_size':[10,15,30,45],'n_neighbors':[5,10,25,50],'p':[1,2,3,4]}    
    knn_gs = model_selection.GridSearchCV(knn_classifier, params_knn, cv=2,verbose=3)
    knn_gs.fit(x_train, y_train)
    knn_best = knn_gs.best_estimator_
    print(knn_gs.best_params_)
    print('knn: {}'.format(knn_best.score(x_validation, y_validation)))
    return knn_best

In [6]:
y = train.Target
x_train_binary_encoding, x_validation_binary_encoding, y_train_binary_encoding, y_validation_binary_encoding = model_selection.train_test_split(df_train_binary_encoding, y, test_size=0.3, stratify=y)

In [7]:
#Para normalizar las columnas
normalized_bin = preprocessing.normalize(df_train_binary_encoding,axis=0)

y = train.Target
normalized_x_train_binary_encoding, normalized_x_validation_binary_encoding, normalized_y_train_binary_encoding, normalized_y_validation_binary_encoding = model_selection.train_test_split(normalized_bin, y, test_size=0.3, stratify=y)

## Binary encoding

In [8]:
knn_bin_encoding = knn(x_train_binary_encoding, y_train_binary_encoding, x_validation_binary_encoding, y_validation_binary_encoding)
test_model(knn_bin_encoding,x_validation_binary_encoding,y_validation_binary_encoding)
cross_val(knn_bin_encoding, x_train_binary_encoding, y_train_binary_encoding)

Fitting 2 folds for each of 128 candidates, totalling 256 fits
[CV 1/2] END leaf_size=10, n_neighbors=5, p=1, weights=uniform; total time=   6.4s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=1, weights=uniform; total time=   6.4s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=1, weights=distance; total time=   6.2s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=1, weights=distance; total time=   5.9s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=2, weights=uniform; total time=   1.3s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=2, weights=uniform; total time=   1.3s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=2, weights=distance; total time=   1.1s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=2, weights=distance; total time=   1.1s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=3, weights=uniform; total time= 1.5min
[CV 2/2] END leaf_size=10, n_neighbors=5, p=3, weights=uniform; total time= 1.5min
[CV 1/2] END leaf_size=10, n_neighbors=5, p=3, weights=distance; total time= 1.5min
[CV 2/2] END leaf_s

[CV 2/2] END leaf_size=15, n_neighbors=25, p=1, weights=uniform; total time=  12.5s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=1, weights=distance; total time=  12.5s
[CV 2/2] END leaf_size=15, n_neighbors=25, p=1, weights=distance; total time=  13.0s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=2, weights=uniform; total time=   2.9s
[CV 2/2] END leaf_size=15, n_neighbors=25, p=2, weights=uniform; total time=   2.8s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=2, weights=distance; total time=   2.6s
[CV 2/2] END leaf_size=15, n_neighbors=25, p=2, weights=distance; total time=   2.9s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=3, weights=uniform; total time= 3.6min
[CV 2/2] END leaf_size=15, n_neighbors=25, p=3, weights=uniform; total time= 3.5min
[CV 1/2] END leaf_size=15, n_neighbors=25, p=3, weights=distance; total time= 3.6min
[CV 2/2] END leaf_size=15, n_neighbors=25, p=3, weights=distance; total time= 3.5min
[CV 1/2] END leaf_size=15, n_neighbors=25, p=4, weights=uniform; total

[CV 2/2] END leaf_size=45, n_neighbors=5, p=1, weights=distance; total time=  11.1s
[CV 1/2] END leaf_size=45, n_neighbors=5, p=2, weights=uniform; total time=   2.6s
[CV 2/2] END leaf_size=45, n_neighbors=5, p=2, weights=uniform; total time=   2.6s
[CV 1/2] END leaf_size=45, n_neighbors=5, p=2, weights=distance; total time=   2.2s
[CV 2/2] END leaf_size=45, n_neighbors=5, p=2, weights=distance; total time=   2.2s
[CV 1/2] END leaf_size=45, n_neighbors=5, p=3, weights=uniform; total time= 3.2min
[CV 2/2] END leaf_size=45, n_neighbors=5, p=3, weights=uniform; total time= 3.0min
[CV 1/2] END leaf_size=45, n_neighbors=5, p=3, weights=distance; total time= 3.3min
[CV 2/2] END leaf_size=45, n_neighbors=5, p=3, weights=distance; total time= 3.1min
[CV 1/2] END leaf_size=45, n_neighbors=5, p=4, weights=uniform; total time= 3.1min
[CV 2/2] END leaf_size=45, n_neighbors=5, p=4, weights=uniform; total time= 3.3min
[CV 1/2] END leaf_size=45, n_neighbors=5, p=4, weights=distance; total time= 3.1mi

In [9]:
normalized_knn_bin_encoding = knn(normalized_x_train_binary_encoding, normalized_y_train_binary_encoding, normalized_x_validation_binary_encoding, normalized_y_validation_binary_encoding)
test_model(normalized_knn_bin_encoding,normalized_x_validation_binary_encoding,normalized_y_validation_binary_encoding)
cross_val(normalized_knn_bin_encoding, normalized_x_train_binary_encoding, normalized_y_train_binary_encoding)

Fitting 2 folds for each of 128 candidates, totalling 256 fits
[CV 1/2] END leaf_size=10, n_neighbors=5, p=1, weights=uniform; total time=   8.3s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=1, weights=uniform; total time=   8.3s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=1, weights=distance; total time=   8.0s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=1, weights=distance; total time=   8.0s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=2, weights=uniform; total time=   1.7s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=2, weights=uniform; total time=   1.5s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=2, weights=distance; total time=   1.4s
[CV 2/2] END leaf_size=10, n_neighbors=5, p=2, weights=distance; total time=   1.3s
[CV 1/2] END leaf_size=10, n_neighbors=5, p=3, weights=uniform; total time= 2.7min
[CV 2/2] END leaf_size=10, n_neighbors=5, p=3, weights=uniform; total time= 2.6min
[CV 1/2] END leaf_size=10, n_neighbors=5, p=3, weights=distance; total time= 2.8min
[CV 2/2] END leaf_s

[CV 2/2] END leaf_size=15, n_neighbors=25, p=1, weights=uniform; total time=   8.3s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=1, weights=distance; total time=   8.3s
[CV 2/2] END leaf_size=15, n_neighbors=25, p=1, weights=distance; total time=   9.3s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=2, weights=uniform; total time=   2.4s
[CV 2/2] END leaf_size=15, n_neighbors=25, p=2, weights=uniform; total time=   2.0s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=2, weights=distance; total time=   2.0s
[CV 2/2] END leaf_size=15, n_neighbors=25, p=2, weights=distance; total time=   1.7s
[CV 1/2] END leaf_size=15, n_neighbors=25, p=3, weights=uniform; total time= 2.8min
[CV 2/2] END leaf_size=15, n_neighbors=25, p=3, weights=uniform; total time= 2.8min
[CV 1/2] END leaf_size=15, n_neighbors=25, p=3, weights=distance; total time= 2.8min
[CV 2/2] END leaf_size=15, n_neighbors=25, p=3, weights=distance; total time= 2.8min
[CV 1/2] END leaf_size=15, n_neighbors=25, p=4, weights=uniform; total

[CV 2/2] END leaf_size=45, n_neighbors=5, p=1, weights=distance; total time=   8.2s
[CV 1/2] END leaf_size=45, n_neighbors=5, p=2, weights=uniform; total time=   2.1s
[CV 2/2] END leaf_size=45, n_neighbors=5, p=2, weights=uniform; total time=   1.9s
[CV 1/2] END leaf_size=45, n_neighbors=5, p=2, weights=distance; total time=   1.8s
[CV 2/2] END leaf_size=45, n_neighbors=5, p=2, weights=distance; total time=   1.6s
[CV 1/2] END leaf_size=45, n_neighbors=5, p=3, weights=uniform; total time= 2.7min
[CV 2/2] END leaf_size=45, n_neighbors=5, p=3, weights=uniform; total time= 3.0min
[CV 1/2] END leaf_size=45, n_neighbors=5, p=3, weights=distance; total time= 2.6min
[CV 2/2] END leaf_size=45, n_neighbors=5, p=3, weights=distance; total time= 2.5min
[CV 1/2] END leaf_size=45, n_neighbors=5, p=4, weights=uniform; total time= 2.7min
[CV 2/2] END leaf_size=45, n_neighbors=5, p=4, weights=uniform; total time= 2.5min
[CV 1/2] END leaf_size=45, n_neighbors=5, p=4, weights=distance; total time= 2.6mi

In [10]:
y_pred_knn_binary = knn_bin_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_knn_bin = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred_knn_binary})
submission_knn_bin = submission_knn_bin.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_knn_bin.to_csv('../submits/bin_knn.csv', index=False)

In [11]:
y_pred_knn_binary_norm = normalized_knn_bin_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_knn_bin_norm = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred_knn_binary_norm})
submission_knn_bin_norm = submission_knn_bin_norm.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_knn_bin_norm.to_csv('../submits/bin_knn_nor.csv', index=False)