In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import BaseNEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.feature_selection import SelectFromModel

In [2]:
# Read the cleaned train and test tables from the Feature_Engineering data folder.
train, test = (
    pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv'),
    pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv'),
)

In [3]:
# Remove the 'Unnamed: 0' column from both frames (presumably the row index
# that was written into the CSV — confirm against the export step).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe
# to re-run: a second execution no longer raises KeyError.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Independent copies of the loaded frames; the feature matrices are built from these.
X_train, X_test = train.copy(), test.copy()

In [5]:
# Columns of `train` that are NOT BaseN-encoded (identifiers, numeric/date
# parts and the label). Every other column is treated as categorical.
train_passthrough_columns = [
    "Opportunity_ID", "ID", "Pricing, Delivery_Terms_Quote_Appr",
    "Bureaucratic_Code_0_Approval", "Bureaucratic_Code_0_Approved",
    "Submitted_for_Approval", "ASP", "ASP_(converted)", "TRF", "Total_Amount",
    "Total_Taxable_Amount", "diferencia_en_dias", "Last_Modified_DOY", "Last_Modified_Year",
    "Opportunity_Created_DOY", "Opportunity_Created_Year", "Quote_Expiry_DOY",
    "Quote_Expiry_Year", "Planned_Delivery_Start_DOY", "Planned_Delivery_Start_Year",
    "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year",
    "Target",
]
categ_columns = train.drop(columns=train_passthrough_columns).columns

# Encode each categorical column with its own BaseNEncoder(base=5) and collect
# the resulting frames; column names get a '_base_n' suffix.
train_encoded_parts = []
for column in categ_columns:
    encoder = BaseNEncoder(base = 5)
    encoder.fit(train[column], train['Target'])
    train_encoded_parts.append(encoder.transform(train[column]).add_suffix('_base_n'))

# Build X_train from `train` in one step (pass-through columns first, encoded
# columns after, in categ_columns order). Deriving it from `train` rather than
# mutating X_train column by column makes the cell idempotent on re-run.
X_train = pd.concat([train.drop(columns=categ_columns)] + train_encoded_parts, axis=1)

In [6]:
# Columns of `test` that are NOT BaseN-encoded (identifiers and numeric/date
# parts). Every other column is treated as categorical.
test_passthrough_columns = [
    "Opportunity_ID", "ID", "Pricing, Delivery_Terms_Quote_Appr",
    "Bureaucratic_Code_0_Approval", "Bureaucratic_Code_0_Approved",
    "Submitted_for_Approval", "ASP", "ASP_(converted)", "TRF", "Total_Amount",
    "Total_Taxable_Amount", "diferencia_en_dias", "Last_Modified_DOY", "Last_Modified_Year",
    "Opportunity_Created_DOY", "Opportunity_Created_Year", "Quote_Expiry_DOY",
    "Quote_Expiry_Year", "Planned_Delivery_Start_DOY", "Planned_Delivery_Start_Year",
    "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year",
]
categ_columns = test.drop(columns=test_passthrough_columns).columns

# Each encoder is fitted on the TRAIN column and only applied to the test
# column, so test rows are encoded with the mapping learned from train.
test_encoded_parts = []
for column in categ_columns:
    encoder = BaseNEncoder(base = 5)
    encoder.fit(train[column], train['Target'])
    test_encoded_parts.append(encoder.transform(test[column]).add_suffix('_base_n'))

# Build X_test from `test` in one step (pass-through columns first, encoded
# columns after). Deriving it from `test` rather than mutating X_test column
# by column makes the cell idempotent on re-run.
X_test = pd.concat([test.drop(columns=categ_columns)] + test_encoded_parts, axis=1)

In [7]:
# Total_Amount: coerce to numeric FIRST, then impute with the mean of the
# coerced values. Taking the mean of the raw column (as before) breaks when
# the column holds non-numeric entries, which is the case errors='coerce'
# is there to handle.
train_total_amount = pd.to_numeric(X_train["Total_Amount"], errors='coerce')
X_train["Total_Amount"] = train_total_amount.fillna(train_total_amount.mean())

# Date-part columns: unparseable or missing entries become 0.
for column in ["Opportunity_Created_Year", "Quote_Expiry_DOY", "Quote_Expiry_Year",
               "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year"]:
    X_train[column] = pd.to_numeric(X_train[column], errors='coerce').fillna(0)

# The label must not be part of the feature matrix. errors='ignore' keeps the
# cell re-runnable once 'Target' has already been removed.
X_train = X_train.drop(columns = 'Target', errors='ignore')

In [8]:
# Total_Amount: coerce to numeric FIRST, then impute with the mean of the
# coerced test values (the raw-column mean used before breaks when the
# column holds non-numeric entries).
# NOTE(review): the fill value is the test set's own mean, while the train
# matrix is filled with the train mean — consider reusing the train statistic.
test_total_amount = pd.to_numeric(X_test["Total_Amount"], errors='coerce')
X_test["Total_Amount"] = test_total_amount.fillna(test_total_amount.mean())

# Date-part columns: unparseable or missing entries become 0.
for column in ["Opportunity_Created_Year", "Quote_Expiry_DOY", "Quote_Expiry_Year",
               "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year"]:
    X_test[column] = pd.to_numeric(X_test[column], errors='coerce').fillna(0)

## Model: K-Nearest Neighbors (KNN)

In [9]:
def cross_val(model, x_train, y_train, cv=5):
    """Print and return the mean cross-validated score of ``model``.

    Parameters
    ----------
    model : estimator passed straight to ``model_selection.cross_val_score``.
    x_train, y_train : features and labels to cross-validate on.
    cv : cross-validation setting forwarded to ``cross_val_score``
        (defaults to 5, the value that used to be hard-coded).

    Returns
    -------
    float
        Mean of the per-fold scores (also printed, as before).
    """
    fold_scores = model_selection.cross_val_score(model, x_train, y_train, cv=cv)
    mean_score = fold_scores.mean()
    print(mean_score)
    return float(mean_score)

In [10]:
# Runs an exhaustive search over every combination of the parameters
def knn(x_train, y_train, x_validation, y_validation):
    """Grid-search a KNeighborsClassifier and return the best estimator found.

    The search is fitted on ``x_train``/``y_train`` with ``cv=2``. The best
    parameter set and the best estimator's score on
    ``x_validation``/``y_validation`` are printed before returning.
    """
    search_space = {
        'weights': ['uniform', 'distance'],
        'leaf_size': [10, 15, 30],
        'n_neighbors': [5, 10, 25],
        'p': [1, 2, 3, 4],
    }
    grid_search = GridSearchCV(KNeighborsClassifier(), search_space, cv=2, verbose=3)
    grid_search.fit(x_train, y_train)

    best_model = grid_search.best_estimator_
    print(grid_search.best_params_)

    validation_score = best_model.score(x_validation, y_validation)
    print('knn: {}'.format(validation_score))
    return best_model

In [11]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [12]:
def best_features(model, train, top_n=15):
    """Return the names of the ``top_n`` most important features.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    train : DataFrame whose columns correspond, in order, to the importances.
    top_n : how many feature names to return (defaults to 15, the value that
        used to be hard-coded).

    Returns
    -------
    pandas.Series
        Feature names ordered by decreasing importance; the Series index holds
        each feature's original column position.

    Raises
    ------
    ValueError
        If the number of importances differs from the number of columns
        (previously the mismatch was silently padded with NaN).
    """
    ranking = pd.DataFrame({
        "Feature": list(train.columns),
        "Importance": list(model.feature_importances_),
    })
    # Stable sort so that features with equal importance keep column order.
    ranking = ranking.sort_values(by='Importance', ascending=False, kind='mergesort')
    return ranking.head(top_n)["Feature"]
    
    
def plot_features(model,train):
    """Draw a horizontal bar chart of the model's feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    train : DataFrame whose columns correspond, in order, to the importances.

    Notes
    -----
    The previous implementation called ``model.plot_importance(booster=model)``
    (not a method of the estimator), requested a 350x350 inch figure, computed
    an unused ``SelectFromModel`` transform and set the figure size only after
    plotting. This version plots ``feature_importances_`` directly with
    matplotlib and keeps the original axis labels and font sizes.
    """
    # Ascending order so the most important feature ends up at the top of the
    # chart (barh draws the first entry at the bottom).
    ranking = pd.Series(
        list(model.feature_importances_), index=list(train.columns)
    ).sort_values(kind="mergesort")

    _, ax = plt.subplots(figsize=(40, 20))
    ax.barh(list(ranking.index), list(ranking.values))
    ax.set_xlabel("\nFeature importance", fontsize=40)
    ax.set_ylabel("Features", fontsize=35)
    ax.tick_params(axis="both", labelsize=20)
    plt.show()

In [13]:
# Labels come from the original train frame; X_train holds the features.
y = train.Target
# 70/30 stratified split. random_state is fixed so the split (and every score
# computed from it) is identical after Restart & Run All.
x_train, x_validation, y_train, y_validation = train_test_split(
    X_train, y, test_size=0.3, stratify=y, random_state=42
)

In [14]:
# Grid-search the KNN hyper-parameters on the training split; knn() also prints
# the best parameters and the best estimator's score on the validation split.
knn_model = knn(x_train, y_train, x_validation, y_validation)
# Print accuracy and log-loss of the selected model on the validation split.
test_model(knn_model,x_validation,y_validation)
# Print the mean cross-validation score of the selected model on the training split.
# NOTE(review): these are the same rows the grid search tuned on, so this
# figure is presumably optimistic — confirm before quoting it.
cross_val(knn_model, x_train, y_train)

Fitting 2 folds for each of 72 candidates, totalling 144 fits
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.725, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=uniform ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=uniform, score=0.735, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.745, total=   0.2s
[CV] leaf_size=10, n_neighbors=5, p=1, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=1, weights=distance, score=0.751, total=   0.2s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.719, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=uniform, score=0.729, total=   0.3s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.737, total=   0.2s
[CV] leaf_size=10, n_neighbors=5, p=2, weights=distance ..............
[CV]  leaf_size=10, n_neighbors=5, p=2, weights=distance, score=0.745, total=   0.2s
[CV] leaf_size=10, n_neighbors=5, p=3, weights=uniform ...............
[CV]  leaf_size=10, n_neighbors=5, p=3, weights=uniform, score=0.7

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:  1.4min finished


knn: 0.7865745310957551
Accuracy: 78.66%, Logloss: 0.46
0.7766118666726816


In [15]:
# Select the test columns in exactly the order the model was fitted on, so the
# prediction never depends on X_test happening to share X_train's column
# order; a missing feature now fails loudly with a KeyError.
y_pred = knn_model.predict_proba(X_test[x_train.columns])[:,1]
submission_knn = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
# One row per opportunity: average the predicted probability over its rows.
submission_knn = submission_knn.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_knn.to_csv('../submits/knn_base_n_encoding.csv', index=False)