In [147]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from sklearn.metrics import log_loss
import sklearn.metrics 
import xgboost as xgb
import category_encoders
import pandas as pd

In [148]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [149]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
features = pd.read_csv('data/features.csv')

In [150]:
target = features["Stage"]
features.drop(columns=["Stage"], inplace=True)

In [151]:
best_features = features.loc[:,["Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved","Bureaucratic_Code_0_Approval",\
                       "Bureaucratic_Code_0_Approved","Submitted_for_Approval","ASP","ASP_(converted)","TRF",\
                        "Total_Amount","Total_Taxable_Amount"]]
best_features.head(1)

Unnamed: 0,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,ASP,ASP_(converted),TRF,Total_Amount,Total_Taxable_Amount
0,0,0,0,0,0,0.315,0.35629,2,662287.5,662287.5


In [152]:
xtrb,xteb,ytrb,yteb=train_test_split(best_features,target)

In [153]:
#params={
 #   'n_estimators':stats.randint(10,500),
  #  'learning_rate':stats.uniform(0.01,0.3),
  #  'subsample':stats.uniform(0.3,0.7),
 #   'min_child_weight':[1,5,10],
 #   'max_depth':[3,10,6],
  #  'gamma':stats.randint(0,10),
   # 'colsample_bytree':stats.uniform(0.,0.6)
#}

params={
    'n_estimators':stats.randint(5,10),
    'learning_rate':stats.uniform(0.1,0.2),
    'max_depth':[3,5,7],
    'gamma':stats.randint(8,12),
    'alpha':stats.randint(8,12),
    'colsample_bytree':stats.uniform(0.2,0.4)
}


xgboost_search = RandomizedSearchCV(xgb.XGBClassifier(n_jobs=-1),
                          param_distributions=params,
                          cv=2,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=100)

In [154]:
xgboost_search.fit(xtrb,ytrb)

Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    4.2s finished


RandomizedSearchCV(cv=2, estimator=XGBClassifier(n_jobs=-1), n_iter=100,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7feee360c210>,
                                        'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7feee361afd0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7feee360cc90>,
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7feee360cf50>,
                                        'max_depth': [3, 5, 7],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7feee16e5910>},
                   scoring='neg_log_loss', verbose=1)

In [155]:
print(xgboost_search.best_score_, xgboost_search.best_estimator_)

-0.4539473525681987 XGBClassifier(alpha=11, colsample_bytree=0.5663535118998535, gamma=8,
              learning_rate=0.29021836075911334, max_depth=7, n_estimators=8,
              n_jobs=-1)


In [156]:
xgboost_search.score(X=xteb,y=yteb)

-0.4413017254923791

In [157]:
xgboost_predictor = xgboost_search.best_estimator_
importance = xgboost_predictor.feature_importances_
pd.DataFrame([best_features.columns,importance])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,ASP,ASP_(converted),TRF,Total_Amount,Total_Taxable_Amount
1,0.0280644,0.379716,0.0482786,0.049367,0,0.0634855,0.0585002,0.274041,0.0317251,0.0668222


In [158]:
xgboost_predictor.fit(best_features,target)

XGBClassifier(alpha=11, colsample_bytree=0.5663535118998535, gamma=8,
              learning_rate=0.29021836075911334, max_depth=7, n_estimators=8,
              n_jobs=-1)

In [159]:
to_predict = df_test.loc[:,["Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved","Bureaucratic_Code_0_Approval",\
                       "Bureaucratic_Code_0_Approved","Submitted_for_Approval","ASP","ASP_(converted)","TRF",\
                        "Total_Amount","Total_Taxable_Amount"]]

prediction = xgboost_predictor.predict(to_predict)
df_test["Target"] = prediction
df_test.head()

subm = df_test.groupby("Opportunity_ID").mean()
subm = subm.loc[:,["Target"]]
subm.head()
subm.to_csv('data/submit_xgboost.csv')