In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [8]:
# Read the cleaned train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Feature_Engineering/data/other-cleaned_train.csv',
        '../../Feature_Engineering/data/other-cleaned_test.csv',
    )
)

In [9]:
# Remove the 'Unnamed: 0' column from both frames.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises a KeyError.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Work on copies so the loaded `train` / `test` frames stay untouched.
X_train, X_test = train.copy(), test.copy()

In [11]:
# Columns that are NOT WOE-encoded in the training frame (including the label).
passthrough_columns = [
    "Opportunity_ID", "ID", "Pricing, Delivery_Terms_Quote_Appr",
    "Bureaucratic_Code_0_Approval", "Bureaucratic_Code_0_Approved",
    "Submitted_for_Approval", "ASP", "ASP_(converted)", "TRF", "Total_Amount",
    "Total_Taxable_Amount", "diferencia_en_dias", "Last_Modified_DOY", "Last_Modified_Year",
    "Opportunity_Created_DOY", "Opportunity_Created_Year", "Quote_Expiry_DOY",
    "Quote_Expiry_Year", "Planned_Delivery_Start_DOY", "Planned_Delivery_Start_Year",
    "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year",
    "Target",
]

# Every remaining column is treated as categorical.
categ_columns = train.drop(columns=passthrough_columns).columns

for column in categ_columns:
    # One encoder per column, fitted on the training column against the label.
    encoder = WOEEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    # Append the encoded '<column>_woe' column and remove the raw one.
    X_train = X_train.join(feature_encoded.add_suffix('_woe')).drop(columns=[column])

In [12]:
# Columns that are NOT WOE-encoded in the test frame (no 'Target' here).
passthrough_columns = [
    "Opportunity_ID", "ID", "Pricing, Delivery_Terms_Quote_Appr",
    "Bureaucratic_Code_0_Approval", "Bureaucratic_Code_0_Approved",
    "Submitted_for_Approval", "ASP", "ASP_(converted)", "TRF", "Total_Amount",
    "Total_Taxable_Amount", "diferencia_en_dias", "Last_Modified_DOY", "Last_Modified_Year",
    "Opportunity_Created_DOY", "Opportunity_Created_Year", "Quote_Expiry_DOY",
    "Quote_Expiry_Year", "Planned_Delivery_Start_DOY", "Planned_Delivery_Start_Year",
    "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year",
]

# Every remaining column is treated as categorical.
categ_columns = test.drop(columns=passthrough_columns).columns

for column in categ_columns:
    # The encoder is fitted on the TRAIN column/label and only applied to test.
    encoder = WOEEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    # Append the encoded '<column>_woe' column and remove the raw one.
    X_test = X_test.join(feature_encoded.add_suffix('_woe')).drop(columns=[column])

In [13]:
# Coerce columns to numeric dtype; values that cannot be parsed become NaN.

# Total_Amount: fill gaps with the mean of the *coerced* column. The mean of
# the raw column is not reliable when the column still holds non-numeric values.
total_amount_train = pd.to_numeric(X_train["Total_Amount"], errors='coerce')
X_train["Total_Amount"] = total_amount_train.fillna(total_amount_train.mean())

# Remaining columns: unparseable / missing entries are replaced by 0.
for zero_filled_column in [
    "Opportunity_Created_Year",
    "Quote_Expiry_DOY",
    "Quote_Expiry_Year",
    "Planned_Delivery_End_DOY",
    "Planned_Delivery_End_Year",
]:
    X_train[zero_filled_column] = pd.to_numeric(
        X_train[zero_filled_column], errors='coerce'
    ).fillna(0)

# The label must not be part of the feature matrix.
# errors='ignore' keeps the cell re-runnable once 'Target' is already gone.
X_train = X_train.drop(columns='Target', errors='ignore')

In [14]:
# Coerce columns to numeric dtype; values that cannot be parsed become NaN.

# Total_Amount: impute with the TRAINING-set mean (computed on the coerced
# column) so train and test rows share one fill value and nothing is derived
# from the test distribution.
total_amount_fill = pd.to_numeric(train["Total_Amount"], errors='coerce').mean()
X_test["Total_Amount"] = pd.to_numeric(
    X_test["Total_Amount"], errors='coerce'
).fillna(total_amount_fill)

# Remaining columns: unparseable / missing entries are replaced by 0.
for zero_filled_column in [
    "Opportunity_Created_Year",
    "Quote_Expiry_DOY",
    "Quote_Expiry_Year",
    "Planned_Delivery_End_DOY",
    "Planned_Delivery_End_Year",
]:
    X_test[zero_filled_column] = pd.to_numeric(
        X_test[zero_filled_column], errors='coerce'
    ).fillna(0)


## Model: XGBOOST

In [15]:
def cross_val(model, x_train, y_train):
    """Print the mean 5-fold cross-validation score of ``model``.

    Parameters
    ----------
    model : estimator accepted by ``model_selection.cross_val_score``.
    x_train : feature matrix.
    y_train : labels matching ``x_train``.

    Returns
    -------
    None. The mean score is printed.
    """
    fold_scores = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    mean_score = fold_scores.mean()
    print(mean_score)

In [16]:
def xgboost(x_train, y_train, x_validation, y_validation):
    """Grid-search an ``XGBClassifier`` and return the best estimator.

    The search uses 2-fold cross-validation over a fixed grid of
    ``n_estimators``, ``learning_rate``, ``gamma`` and ``max_depth``.
    The best parameters and the best estimator's score on the validation
    data are printed.

    Parameters
    ----------
    x_train, y_train : data the grid search is fitted on.
    x_validation, y_validation : held-out data used only for the printed score.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    param_grid = {
        'n_estimators': [50, 75, 100],
        'learning_rate': [0.01, 0.05, 0.1],
        'gamma': [0, 1, 5],
        'max_depth': [3, 10, 6, 15],
    }

    search = GridSearchCV(XGBClassifier(), param_grid, cv=2)
    search.fit(x_train, y_train)

    best_model = search.best_estimator_
    print(search.best_params_)

    validation_score = best_model.score(x_validation, y_validation)
    print('xgb: {}'.format(validation_score))
    return best_model


In [17]:
def xgboost2(x_train, y_train, x_validation, y_validation,
             n_iter=150, cv=2, random_state=None):
    """Randomized hyper-parameter search for an ``XGBClassifier``.

    Candidates are scored with ``neg_log_loss``. The best parameters and the
    best estimator's ``score`` on the validation data are printed.

    Parameters
    ----------
    x_train, y_train : data the randomized search is fitted on.
    x_validation, y_validation : held-out data used only for the printed score.
    n_iter : int, default 150
        Number of sampled parameter settings.
    cv : int, default 2
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to ``RandomizedSearchCV``. Pass an integer to make the
        sampled candidates (and therefore the result) reproducible. ``None``
        keeps the previous, unseeded behaviour.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples integers from [low, high).
    params_xgb_2 = {
        'n_estimators': stats.randint(10, 300),
        'learning_rate': stats.uniform(0.01, 0.3),
        'subsample': stats.uniform(0.3, 0.7),
        'min_child_weight': [1, 5, 10],
        'max_depth': [3, 10, 6, 15],
        'gamma': stats.randint(0, 10),
        'colsample_bytree': stats.uniform(0., 0.6),
    }

    xgb_rs = RandomizedSearchCV(xgb.XGBClassifier(n_jobs=-1),
                                param_distributions=params_xgb_2,
                                cv=cv,
                                scoring='neg_log_loss',
                                verbose=1,
                                n_iter=n_iter,
                                random_state=random_state)

    xgb_rs.fit(x_train, y_train)
    xgb_best = xgb_rs.best_estimator_
    print(xgb_rs.best_params_)
    print('xgb: {}'.format(xgb_best.score(x_validation, y_validation)))
    return xgb_best

In [18]:
def test_model(model, x_test, y_test):
    """Print accuracy and log-loss of ``model`` on the given data.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    x_test : feature matrix to score.
    y_test : true labels for ``x_test``.

    Returns
    -------
    None. The metrics are printed.
    """
    # Second column of predict_proba (class index 1).
    class1_proba = model.predict_proba(x_test)[:, 1]
    # Rounded probabilities serve as hard labels for the accuracy.
    hard_labels = class1_proba.round()

    accuracy = accuracy_score(y_test, hard_labels)
    logloss = log_loss(y_test, class1_proba)
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [30]:
def best_features(model, train, top_n=15):
    """Return the names of the most important features of a fitted model.

    Parameters
    ----------
    model : object exposing ``feature_importances_`` (one value per feature).
    train : DataFrame whose columns correspond, in order, to those importances.
    top_n : int, default 15
        How many feature names to return.

    Returns
    -------
    list
        Column names of ``train`` ordered from most to least important,
        truncated to ``top_n`` entries.

    Raises
    ------
    ValueError
        If the number of importances differs from the number of columns
        (pairing them would otherwise silently misalign or pad with NaN).
    """
    importance = np.asarray(model.feature_importances_, dtype=float)
    if len(importance) != len(train.columns):
        raise ValueError(
            "model has {} feature importances but train has {} columns".format(
                len(importance), len(train.columns)
            )
        )

    # Rank as a float Series (not an object-dtype frame); the stable sort keeps
    # tied features in their original column order.
    ranked = pd.Series(importance, index=train.columns).sort_values(
        ascending=False, kind="stable"
    )
    return ranked.head(top_n).index.to_list()
    
    
def plot_features(model,train):
    """Plot the feature importances of a fitted XGBoost model.

    Parameters
    ----------
    model : fitted XGBoost model accepted by ``xgb.plot_importance``.
    train : unused; kept so existing calls ``plot_features(model, train)``
        continue to work.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # Create the figure at its final size up front: changing rcParams after
    # drawing, or resizing an unrelated "current" figure, has no effect on the plot.
    fig, ax = plt.subplots(figsize=(40, 20))

    # plot_importance is a module-level xgboost function, not an estimator method.
    xgb.plot_importance(model, ax=ax)

    ax.set_xlabel("\nFeature importance", fontsize=40)
    ax.set_ylabel("Features", fontsize=35)
    ax.tick_params(axis="both", labelsize=20)
    plt.show()

### xgb 1 with all features

In [20]:
# Hold out 30% of the rows for validation, stratified on the label.
# A fixed random_state makes the split (and the scores below) reproducible.
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(
    X_train, y, test_size=0.3, stratify=y, random_state=42
)

In [21]:
# Grid-search on the training split, then report validation metrics and the
# 5-fold cross-validation score on the training split.
xgb_model = xgboost(
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)
test_model(model=xgb_model, x_test=x_validation, y_test=y_validation)
cross_val(model=xgb_model, x_train=x_train, y_train=y_train)

{'gamma': 0, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 100}
xgb: 0.9293188548864758
Accuracy: 92.93%, Logloss: 0.18
0.9251147305505849


In [22]:
best_features = best_features(xgb_model,X_train)
if "Opportunity_ID" not in best_features: 
    best_features.append("Opportunity_ID")

In [23]:
y_pred = xgb_model.predict_proba(X_test)[:,1]
submission_xgb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_xgb = submission_xgb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_xgb.to_csv('../submits/xgb_with_woe_encoding.csv', index=False)

### xgb 1 with best features

In [24]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [25]:
# 70/30 stratified split of the reduced feature set.
# A fixed random_state makes the split reproducible across runs.
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(
    X_train_best_features, y, test_size=0.3, stratify=y, random_state=42
)

In [26]:
# Same grid search as before, restricted to the selected features.
xgb_model_2 = xgboost(
    x_train=x_best_train,
    y_train=y_best_train,
    x_validation=x_best_validation,
    y_validation=y_best_validation,
)
test_model(model=xgb_model_2, x_test=x_best_validation, y_test=y_best_validation)
cross_val(model=xgb_model_2, x_train=x_best_train, y_train=y_best_train)

{'gamma': 0, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 100}
xgb: 0.9249753208292202
Accuracy: 92.50%, Logloss: 0.19
0.9167370194617671


In [27]:
# Class-1 probability for every test row (second predict_proba column).
y_pred_2 = xgb_model_2.predict_proba(X_test_best_features)[:, 1]

# One row per Opportunity_ID: average the row-level probabilities.
submission_xgb_2 = (
    pd.DataFrame(data={'Opportunity_ID': X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
    .groupby("Opportunity_ID")
    .agg({"Target": "mean"})
    .reset_index()
)
submission_xgb_2.to_csv('../submits/xgb_best_features_woe_encoding.csv', index=False)

### xgb 2 with all features

In [28]:
# Randomized search on the full feature set, then validation metrics and the
# 5-fold cross-validation score on the training split.
xgb_model_3 = xgboost2(
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)
test_model(model=xgb_model_3, x_test=x_validation, y_test=y_validation)
cross_val(model=xgb_model_3, x_train=x_train, y_train=y_train)

Fitting 2 folds for each of 150 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 10.2min finished


{'colsample_bytree': 0.5674542619617974, 'gamma': 4, 'learning_rate': 0.07217205748199952, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 225, 'subsample': 0.9088612748963232}
xgb: 0.9255676209279369
Accuracy: 92.56%, Logloss: 0.18
0.92282978633516


In [31]:
# Most important features according to xgb_model_3.
best_features_2 = best_features(model=xgb_model_3, train=X_train)
# Opportunity_ID is needed later to group predictions per opportunity.
if not ("Opportunity_ID" in best_features_2):
    best_features_2 += ["Opportunity_ID"]

In [32]:
# Class-1 probability for every test row (second predict_proba column).
y_pred_3 = xgb_model_3.predict_proba(X_test)[:, 1]

# One row per Opportunity_ID: average the row-level probabilities.
submission_xgb_3 = (
    pd.DataFrame(data={'Opportunity_ID': X_test['Opportunity_ID'], 'Target': y_pred_3})
    .groupby("Opportunity_ID")
    .agg({"Target": "mean"})
    .reset_index()
)
submission_xgb_3.to_csv('../submits/xgb_2_with_woe_encoding.csv', index=False)

### xgb 2 with best features

In [33]:
# Restrict both feature matrices to the columns listed in best_features_2.
X_train_best_features_2, X_test_best_features_2 = (
    X_train.loc[:, best_features_2],
    X_test.loc[:, best_features_2],
)

In [34]:
# 70/30 stratified split of the second reduced feature set.
# A fixed random_state makes the split reproducible across runs.
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(
    X_train_best_features_2, y, test_size=0.3, stratify=y, random_state=42
)

In [36]:
# Randomized search restricted to the selected features.
xgb_model_4 = xgboost2(
    x_train=x_best_train_2,
    y_train=y_best_train_2,
    x_validation=x_best_validation_2,
    y_validation=y_best_validation_2,
)
test_model(model=xgb_model_4, x_test=x_best_validation_2, y_test=y_best_validation_2)
cross_val(model=xgb_model_4, x_train=x_best_train_2, y_train=y_best_train_2)

Fitting 2 folds for each of 150 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  3.7min finished


{'colsample_bytree': 0.4070202587614009, 'gamma': 3, 'learning_rate': 0.16589516205536506, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 114, 'subsample': 0.8773801786765878}
xgb: 0.9062191510365252
Accuracy: 90.62%, Logloss: 0.22
0.9081061457194352


In [37]:
# Class-1 probability for every test row (second predict_proba column).
y_pred_4 = xgb_model_4.predict_proba(X_test_best_features_2)[:, 1]

# One row per Opportunity_ID: average the row-level probabilities.
submission_xgb_4 = (
    pd.DataFrame(data={'Opportunity_ID': X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
    .groupby("Opportunity_ID")
    .agg({"Target": "mean"})
    .reset_index()
)
submission_xgb_4.to_csv('../submits/xgb_2_best_features_with_woe_encoding.csv', index=False)