In [1]:
import catboost as cb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
from sklearn.metrics import confusion_matrix, accuracy_score
import skopt
import scipy
from skopt.space import Real

PREDICCION_REAL = False
MAXIMIZAR_HIPERPARAMETROS = True
PARAMETROS = {
    "iterations" : 6,
    'learning_rate': 0.015085957893870945,
    "random_seed" : 9355,
    "l2_leaf_reg" : 6.150356581299571e-06,
    'subsample': 0.31586225948456137,
    "random_strength" : 0.27365314292727794,
    'depth': 15,
    "rsm" : 0.36719138525672734,
    "early_stopping_rounds" : 3,
    "border_count" : 21324,
    "loss_function":'Logloss',
    "verbose":True,
    "has_time" : True,
    'bagging_temperature': 7.77711454463817e-07,
    'scale_pos_weight': 0.3982269802506704
}

In [2]:
#APERTURA DE ARCHIVO DE ARCHIVOS
entrenamiento = pd.read_csv("entrenamiento-listo.csv")
test = pd.read_csv("test-listo.csv")

In [3]:
#FILTRADO DE COLUMNAS - NO REMOVER STAGE O FECHA
fugas = ['ID','Opportunity_Name','Sales_Contract_No','Account_Name','Account_Owner','Opportunity_Owner','Last_Modified_By','Product_Family','Product_Name','ASP','ASP_(converted)']
#'Total_Taxable_Amount'
columnas_fecha = ['Month','Last_Modified_Date','Account_Created_Date','Opportunity_Created_Date','Quote_Expiry_Date','Planned_Delivery_Start_Date','Planned_Delivery_End_Date']
#entrenamiento = entrenamiento.drop(columns=fugas)
#test = test.drop(columns=fugas)
#test = test.drop(columns=columnas_fecha)
#entrenamiento = entrenamiento.drop(columns=columnas_fecha)

In [4]:
entrenamiento.sort_values("Fecha")
objetivo = (entrenamiento['Stage'] == 'Closed Won').astype(int)
entrenamiento = entrenamiento.drop(columns=['Stage',"Fecha"])
columnas_object = list(entrenamiento.select_dtypes(include=['object']).columns)
if 'Stage' in columnas_object : columnas_object.remove('Stage')

In [5]:
if not PREDICCION_REAL:
    test_label = (test['Stage'] == 'Closed Won').astype(int)
    test = test.drop(columns=['Stage'])

In [6]:
entrenamiento_pool = cb.Pool(entrenamiento, objetivo,cat_features=columnas_object)
test_pool = cb.Pool(test, cat_features=columnas_object)

In [7]:
model = cb.CatBoostClassifier(**PARAMETROS)

model.fit(entrenamiento_pool)
# make the prediction using the resulting model
preds = model.predict(test_pool)
print("class = ", preds)

0:	learn: 0.6251398	total: 8.98s	remaining: 44.9s
1:	learn: 0.5615255	total: 20.6s	remaining: 41.2s
2:	learn: 0.5015362	total: 31.2s	remaining: 31.2s
3:	learn: 0.4551814	total: 39.8s	remaining: 19.9s
4:	learn: 0.4105875	total: 49.1s	remaining: 9.82s
5:	learn: 0.3730502	total: 53.3s	remaining: 0us
class =  [1 1 0 ... 1 0 0]


In [8]:
resultados = test[['Opportunity_ID']].copy()
resultados['Target'] = pd.Series(preds)
resultados = resultados.groupby('Opportunity_ID').mean()
resultados = resultados.reset_index()
#resultados['Target'] = resultados['Target'].apply(lambda x: int(x >= 0.5))    
resultados.to_csv("prediccion.csv", index=False)
resultados['Target'].value_counts()

1.000000    1447
0.000000    1090
0.500000      17
0.333333       2
0.666667       1
0.857143       1
0.833333       1
0.800000       1
0.777778       1
0.250000       1
Name: Target, dtype: int64

In [9]:
if not PREDICCION_REAL:
    cm = confusion_matrix(test_label, preds)
    print(cm)
    accuracy_score(test_label, preds)

[[1533  182]
 [ 184 2531]]


In [10]:
accuracy_score(test_label, preds)

0.9173814898419864

In [11]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,Sales_Contract_No,57.115252
1,Delivery_Terms,4.695373
2,Account_Name,3.328441
3,Account_Type,3.104905
4,Source,2.552632
5,Bureaucratic_Code,2.375937
6,Last_Modified_By,2.305174
7,Region,2.281342
8,ASP,2.20044
9,Planned_Delivery_End_Date,2.106708


In [12]:
preds = model.predict(entrenamiento_pool)
accuracy_score(objetivo, preds)

0.9958880139982502

In [13]:
if(not MAXIMIZAR_HIPERPARAMETROS): sys.exit()

In [19]:
from catboost import CatBoostClassifier
from skopt import BayesSearchCV
from skopt.space import Real
from sklearn.model_selection import StratifiedKFold

# Classifier
bayes_cv_tuner = BayesSearchCV(
estimator = CatBoostClassifier(
silent=True,cat_features=columnas_object, bootstrap_type='MVS'
),
search_spaces = {
        "iterations" : (5, 20),
        'learning_rate': Real(low=0.01, high=1, prior='log-uniform'),
        "random_seed" : (1,40000),
        "l2_leaf_reg" : Real(low=1e-9, high=1000, prior='log-uniform'),
        'subsample': Real(low=0.01, high=1, prior='uniform'),
        "random_strength" : Real(low=1e-9, high=1000, prior='log-uniform'),
        'depth': (1, 16),
        "rsm" : Real(low=0.01, high=1, prior='uniform'),
        "early_stopping_rounds" : (1, 5),
        "border_count" : (1,65535),
        "bagging_temperature" : Real(low=1e-9,high=1000,prior='log-uniform'),
        "scale_pos_weight" : Real(low = 0.01,high = 1.0,prior="uniform")
},
cv = skl.model_selection.TimeSeriesSplit(),
n_jobs = 1,
n_iter = 5,
verbose = 1,
refit = True,
random_state = 72
)

In [20]:
def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    

    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

In [21]:
resultCAT = bayes_cv_tuner.fit(entrenamiento, objetivo, callback=status_print)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Model #1
Best ROC-AUC: 0.9566
Best params: OrderedDict([('bagging_temperature', 7.385095136875193e-08), ('border_count', 60371), ('depth', 5), ('early_stopping_rounds', 2), ('iterations', 12), ('l2_leaf_reg', 0.10952643394246851), ('learning_rate', 0.030724310795859633), ('random_seed', 30688), ('random_strength', 0.013384337685341417), ('rsm', 0.23362736443976684), ('scale_pos_weight', 0.5319710298103609), ('subsample', 0.7693297128407879)])

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Model #2
Best ROC-AUC: 0.9566
Best params: OrderedDict([('bagging_temperature', 7.385095136875193e-08), ('border_count', 60371), ('depth', 5), ('early_stopping_rounds', 2), ('iterations', 12), ('l2_leaf_reg', 0.10952643394246851), ('learning_rate', 0.030724310795859633), ('random_seed', 30688), ('random_strength', 0.013384337685341417), ('rsm', 0.23362736443976684), ('scale_pos_weight', 0.5319710298103609), ('subsample', 0.7693297128407879)])

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Model #3
Best ROC-AUC: 0.9566
Best params: OrderedDict([('bagging_temperature', 7.385095136875193e-08), ('border_count', 60371), ('depth', 5), ('early_stopping_rounds', 2), ('iterations', 12), ('l2_leaf_reg', 0.10952643394246851), ('learning_rate', 0.030724310795859633), ('random_seed', 30688), ('random_strength', 0.013384337685341417), ('rsm', 0.23362736443976684), ('scale_pos_weight', 0.5319710298103609), ('subsample', 0.7693297128407879)])

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Model #4
Best ROC-AUC: 0.9636
Best params: OrderedDict([('bagging_temperature', 1.1063407351624084), ('border_count', 7137), ('depth', 15), ('early_stopping_rounds', 4), ('iterations', 14), ('l2_leaf_reg', 0.17260178161990627), ('learning_rate', 0.027307511040591478), ('random_seed', 5712), ('random_strength', 2.3464965049108726e-07), ('rsm', 0.819600044656442), ('scale_pos_weight', 0.9880746674599722), ('subsample', 0.23970990732871852)])

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.2s finished


Model #5
Best ROC-AUC: 0.9636
Best params: OrderedDict([('bagging_temperature', 1.1063407351624084), ('border_count', 7137), ('depth', 15), ('early_stopping_rounds', 4), ('iterations', 14), ('l2_leaf_reg', 0.17260178161990627), ('learning_rate', 0.027307511040591478), ('random_seed', 5712), ('random_strength', 2.3464965049108726e-07), ('rsm', 0.819600044656442), ('scale_pos_weight', 0.9880746674599722), ('subsample', 0.23970990732871852)])



In [23]:
print(bayes_cv_tuner.best_score_)
print(bayes_cv_tuner.best_params_)

0.9635695538057742
OrderedDict([('bagging_temperature', 1.1063407351624084), ('border_count', 7137), ('depth', 15), ('early_stopping_rounds', 4), ('iterations', 14), ('l2_leaf_reg', 0.17260178161990627), ('learning_rate', 0.027307511040591478), ('random_seed', 5712), ('random_strength', 2.3464965049108726e-07), ('rsm', 0.819600044656442), ('scale_pos_weight', 0.9880746674599722), ('subsample', 0.23970990732871852)])


In [24]:
preds = bayes_cv_tuner.predict(test.values)

In [25]:
accuracy_score(test_label, preds)

0.9489841986455982