In [1]:
import sys
import catboost as cb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import skopt
import scipy
import seaborn as sns
from skopt.space import Real

PREDICCION_REAL = False
MAXIMIZAR_HIPERPARAMETROS = False
PARAMETROS = {
    'task_type' : 'GPU',
    'devices' : '0:1',
    'bootstrap_type' : 'MVS',
    'verbose' : True,
    'has_time' : True,
    
    #"rsm" : 0.36719138525672734,
    'bagging_temperature': 1.1063407351624084,
    'border_count': 7137,
    'depth': 9,
    'early_stopping_rounds': 4,
     'iterations': 100,
    'l2_leaf_reg': 0.17260178161990627,
    'learning_rate': 0.027307511040591478,
    'random_seed': 5712,
    'random_strength': 2.3464965049108726e-07,
    'scale_pos_weight': 0.819600044656442,
    'subsample': 0.9880746674599722
}

In [2]:
#APERTURA DE ARCHIVO DE ARCHIVOS
entrenamiento = pd.read_pickle("../Archivos/Arboles_entrenamiento.pkl")
test = pd.read_pickle("../Archivos/Arboles_validacion.pkl")

if (PREDICCION_REAL):
    entrenamiento = entrenamiento.append(test)
    test = pd.read_pickle("../Archivos/Arboles_test.pkl")

In [3]:
#FILTRADO DE COLUMNAS - NO REMOVER STAGE O FECHA

#columnas_fecha = ['Month','Last_Modified_Date','Account_Created_Date','Opportunity_Created_Date','Quote_Expiry_Date','Planned_Delivery_Start_Date','Planned_Delivery_End_Date']
fugas = ['Sales_Contract_No','ID','Account_Name','Account_Owner','Opportunity_Owner','Last_Modified_By','ASP','ASP_(converted)']
otros = ['Currency']

entrenamiento = entrenamiento.drop(columns=fugas)
test = test.drop(columns=fugas)

#entrenamiento = entrenamiento.drop(columns=columnas_fecha)
#test = test.drop(columns=columnas_fecha)

In [4]:
#FECHAS A DIAS

columnas_fecha = ['Year-Month','Last_Modified_Date','Account_Created_Date','Opportunity_Created_Date','Quote_Expiry_Date','Planned_Delivery_Start_Date','Planned_Delivery_End_Date']
def fecha_a_dias(x):
    fecha_origen = pd.to_datetime('01/01/2000', format='%m/%d/%Y')
    for columna in columnas_fecha:
        x[columna] = x[columna].apply(lambda x : (x - fecha_origen).days)

fecha_a_dias(entrenamiento)
fecha_a_dias(test)

In [5]:
objetivo = entrenamiento['Stage']
entrenamiento = entrenamiento.drop(columns=['Stage'])
columnas_category = list(entrenamiento.select_dtypes(include=['category']).columns)
if 'Stage' in columnas_category : columnas_category.remove('Stage')

In [6]:
if not PREDICCION_REAL:
    test_label = test['Stage']
    test = test.drop(columns=['Stage'])

In [7]:
entrenamiento_pool = cb.Pool(entrenamiento, objetivo,cat_features=columnas_category)
test_pool = cb.Pool(test, cat_features=columnas_category)

In [8]:
model = cb.CatBoostClassifier(**PARAMETROS)
model.fit(entrenamiento_pool)
preds_temp = model.predict_proba(test_pool)

#Array de probabilidad [0,1] a lista de probabilidades de 1
preds = []
for x in  preds_temp:
    preds.append(x[1])

0:	learn: 0.6436846	total: 69.3ms	remaining: 6.86s
1:	learn: 0.6261029	total: 152ms	remaining: 7.47s
2:	learn: 0.6131353	total: 226ms	remaining: 7.32s
3:	learn: 0.6016845	total: 298ms	remaining: 7.16s
4:	learn: 0.5914834	total: 365ms	remaining: 6.94s
5:	learn: 0.5735644	total: 434ms	remaining: 6.8s
6:	learn: 0.5600564	total: 498ms	remaining: 6.62s
7:	learn: 0.5508455	total: 561ms	remaining: 6.46s
8:	learn: 0.5439811	total: 620ms	remaining: 6.27s
9:	learn: 0.5369880	total: 683ms	remaining: 6.14s
10:	learn: 0.5015042	total: 748ms	remaining: 6.05s
11:	learn: 0.4701994	total: 820ms	remaining: 6.01s
12:	learn: 0.4432040	total: 890ms	remaining: 5.96s
13:	learn: 0.4192764	total: 963ms	remaining: 5.91s
14:	learn: 0.3985177	total: 1.03s	remaining: 5.83s
15:	learn: 0.3814873	total: 1.1s	remaining: 5.76s
16:	learn: 0.3660204	total: 1.17s	remaining: 5.72s
17:	learn: 0.3528781	total: 1.24s	remaining: 5.65s
18:	learn: 0.3405750	total: 1.31s	remaining: 5.58s
19:	learn: 0.3299763	total: 1.38s	remainin

In [9]:
if not (PREDICCION_REAL):
    print(log_loss(test_label, preds))

0.42146099279299004


In [10]:
if not (PREDICCION_REAL):
    resultados = pd.DataFrame()
    resultados['Opportunity_ID'] = test['Opportunity_ID']
    resultados['Target'] = pd.Series(preds)
    #resultados['Target'] = resultados['Target'].apply(lambda x: int(x >= 0.5))    
    resultados.to_csv("../Archivos/prediccion_arbol_validacion.csv", index=False)
    resultados['Target'].value_counts()

In [11]:
model.get_feature_importance(prettified=True).head(30)

Unnamed: 0,Feature Id,Importances
0,Opportunity_Name,35.458509
1,Planned_Opportunity_Duration,7.162253
2,Quote_Expiry_Date,5.10471
3,Territory,5.052795
4,Planned_Deliver_Duration,4.106573
5,Account_Created_Date,4.083699
6,Total_Amount(USD),3.278363
7,Bureaucratic_Code,2.840443
8,Opportunity_Total_Amount_Region_avg_Ratio,2.255358
9,Billing_Country,1.751939


In [12]:
if (PREDICCION_REAL):
    resultados = pd.DataFrame()
    resultados['Opportunity_ID'] = test['Opportunity_ID']
    resultados['Target'] = pd.Series(preds)
    resultados = resultados.groupby('Opportunity_ID').mean()
    resultados = resultados.reset_index()
    #resultados['Target'] = resultados['Target'].apply(lambda x: int(x >= 0.5))    
    resultados.to_csv("../Archivos/prediccion_arbol_test.csv", index=False)
    resultados['Target'].value_counts()

In [13]:
preds = model.predict(entrenamiento_pool)
log_loss(objetivo, preds)

0.022871998008338633

In [14]:
if(not MAXIMIZAR_HIPERPARAMETROS): sys.exit()

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
from catboost import CatBoostClassifier
from skopt import BayesSearchCV
from skopt.space import Real
from sklearn.model_selection import StratifiedKFold

# Classifier
bayes_cv_tuner = BayesSearchCV(
estimator = CatBoostClassifier(
silent=True,cat_features=columnas_category,task_type="GPU", devices='0:1', bootstrap_type='MVS',
      bagging_temperature= 1.1063407351624084, border_count= 7137, depth= 9, early_stopping_rounds= 4,
    l2_leaf_reg= 0.17260178161990627, learning_rate= 0.027307511040591478, random_seed= 5712,
    random_strength= 2.3464965049108726e-07, scale_pos_weight= 0.819600044656442, subsample= 0.9880746674599722
),
search_spaces = {
        "iterations" : (5, 450),
        'learning_rate': Real(low=0.01, high=5, prior='log-uniform'),
        "l2_leaf_reg" : Real(low=1e-9, high=5000, prior='log-uniform'),
        'subsample': Real(low=0.01, high=1, prior='uniform'),
        "random_strength" : Real(low=1e-9, high=5000, prior='log-uniform'),
        'depth': (1, 10),
        #"rsm" : Real(low=0.01, high=1, prior='uniform'),
        "border_count" : (1,65535),
        "bagging_temperature" : Real(low=1e-9,high=5000,prior='log-uniform'),
        "scale_pos_weight" : Real(low = 0.01,high = 1.0,prior="uniform")
},
cv = skl.model_selection.TimeSeriesSplit(),
n_jobs = 1,
n_iter = 6,
verbose = 1,
refit = True,
random_state = 72
)

In [None]:
def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    

    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

In [None]:
resultCAT = bayes_cv_tuner.fit(entrenamiento, objetivo, callback=status_print)

In [None]:
print(bayes_cv_tuner.best_score_)
print(bayes_cv_tuner.best_params_)

In [None]:
preds = bayes_cv_tuner.predict(test.values)

In [None]:
accuracy_score(test_label, preds)

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

logistic = cb.CatBoostClassifier(silent=True,cat_features=columnas_category,task_type="GPU", devices='0:1',bootstrap_type='MVS')
distributions = {"iterations" : (5, 450),
        'learning_rate': Real(low=0.01, high=1, prior='log-uniform'),
        "l2_leaf_reg" : Real(low=1e-9, high=5000, prior='log-uniform'),
        'subsample': Real(low=0.01, high=1, prior='uniform'),
        "random_strength" : Real(low=1e-9, high=5000, prior='log-uniform'),
        'depth': (1, 10),
        "rsm" : Real(low=0.01, high=1, prior='uniform'),
        "border_count" : (1,65535),
        "bagging_temperature" : Real(low=1e-9,high=5000,prior='log-uniform'),
        "scale_pos_weight" : Real(low = 0.01,high = 1.0,prior="uniform")
}
clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(entrenamiento, objetivo)

In [None]:
search.best_params_