In [558]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import category_encoders as ce


In [559]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

In [560]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [561]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

cat = ctb.CatBoostClassifier(silent=True)

cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x12470b5e0>

In [562]:
print(log_loss(y_test, cat.predict_proba(X_test)))
f1_score(y_test, cat.predict(X_test))

0.2568603189981852


0.8911079410366144

In [563]:
print(log_loss(test_set.Target, cat.predict_proba(test_set.drop(columns=['Target', 'Opportunity_ID']))))
f1_score(test_set.Target, cat.predict(test_set.drop(columns=['Target', 'Opportunity_ID'])))

0.4998636703073566


0.8131067961165048

In [564]:
f_imp = cat.feature_importances_
f = X_test.columns
fts = ['Opportunity_ID', 'Target']

for i in range (len(f_imp)):
    print("{: >10}\t\t{: >50}".format(f[i],f_imp[i]))
    if f_imp[i]>=1:
        fts.append(f[i])

Pricing, Delivery_Terms_Quote_Appr		                               0.17158873091029692
Pricing, Delivery_Terms_Approved		                                0.2921818180324234
Bureaucratic_Code_0_Approval		                                0.6168487628029252
Bureaucratic_Code_0_Approved		                               0.08208175041822692
Quote_Type		                              0.058011010086005003
Convertibility		                                2.1453769616128793
Total_Amount_Sum_USD		                                1.0261316704869852
Total_Taxable_Amount_USD		                                1.5015943887978114
Year_Creation		                                1.5136448688338762
Month_Creation		                                1.3273446120597159
Year_Delivery		                                1.3249450300331205
Month_Delivery		                                0.7807357054004878
Days_Passed		                                3.3838996329723643
Wait_Time_Days		                                 6.66163

In [565]:
train_filtered = train_set.loc[:, fts]

In [566]:
X = train_filtered.drop(columns=['Opportunity_ID', 'Target'])
y = train_filtered.Target

In [567]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [568]:
cat_filtered = ctb.CatBoostClassifier(silent=True)
cat_filtered.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x124446430>

In [569]:
print(log_loss(y_test, cat_filtered.predict_proba(X_test)))
f1_score(y_test, cat_filtered.predict(X_test))

0.25785224914054666


0.8944866920152091

In [570]:
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200,250]
          #'ctr_border_count':[50,5,10,20,100,200]
            }

#con train

#from sklearn.model_selection import RandomizedSearchCV

#cat_tuning = ctb.CatBoostClassifier(silent=True)

#random_search = RandomizedSearchCV(cat_tuning,param_distributions=params\
#                                   ,n_iter=100, scoring='f1',n_jobs=-1,cv=5,verbose=3)

#random_search.fit(X_train,y_train)

In [571]:
#random_search.best_params_

In [572]:
test_filtered = test_set.loc[:, fts]

In [573]:
test_filtered

Unnamed: 0,Opportunity_ID,Target,Convertibility,Total_Amount_Sum_USD,Total_Taxable_Amount_USD,Year_Creation,Month_Creation,Year_Delivery,Days_Passed,Wait_Time_Days,...,Bureaucratic_Code_target,Account_Name_target,Account_Owner_target,Opportunity_Owner_target,Or,Last_Modified_By_Owners,Expiry_Days_Create,Expiry_Days_Modified,Days_Modified_Diff_to_Start_Delivery,Days_Modified_Diff_to_End_Delivery
0,10689,1,1.131098,4.155868e+05,415586.217750,2019,4,2019,0.0,19.0,...,0.593111,0.428571,0.586683,0.573316,0,True,18.0,18.0,-19.0,-49.0
1,10690,1,1.131096,8.571262e+05,757783.500000,2019,4,2019,13.0,50.0,...,0.593111,0.428571,0.586683,0.573316,0,True,18.0,5.0,-37.0,-64.0
2,10691,1,1.000000,2.103750e+04,189.135216,2019,4,2019,4.0,172.0,...,0.593111,0.515395,0.354839,0.515395,0,True,,,-168.0,-168.0
3,10692,1,1.000000,2.169106e+06,19501.101707,2019,4,2019,4.0,242.0,...,0.283333,0.780282,0.571654,0.562254,0,True,,,-238.0,-238.0
4,10693,1,1.000000,5.752500e+03,51.717188,2019,4,2019,1.0,20.0,...,0.593111,0.944444,0.682692,0.673469,0,True,30.0,29.0,-19.0,-19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,12364,1,1.000000,1.477500e+05,167118.681818,2019,4,2019,0.0,5.0,...,0.593111,0.599998,0.586683,0.573316,0,True,28.0,28.0,-5.0,-14.0
1563,12365,1,1.000000,4.505490e+04,50961.689339,2019,4,2019,1.0,158.0,...,0.593111,0.349142,0.348416,0.343373,0,True,28.0,27.0,-157.0,-248.0
1564,12366,1,1.000000,1.001220e+05,100122.000000,2019,4,2019,1.0,158.0,...,0.593111,0.349142,0.348416,0.343373,0,True,28.0,27.0,-157.0,-248.0
1565,12367,0,1.000000,1.432200e+05,161995.861176,2019,4,2019,0.0,97.0,...,0.593111,0.515395,0.354839,0.515395,0,True,30.0,30.0,-97.0,-97.0


In [574]:
#test_pred_CB = random_search.predict_proba(test_filtered.drop(columns=['Opportunity_ID', 'Target']))

In [575]:
#test_set['Pred_Tuned'] = pd.DataFrame(test_pred_CB)[1].to_list()
#test_set

In [576]:
#log_loss(test_set.Target, test_pred_CB)

In [577]:
best = {'learning_rate': 0.01,
 'l2_leaf_reg': 5,
 'iterations': 300,
 'depth': 7,
 'border_count': 135}

cat = ctb.CatBoostClassifier(silent=True, **best)

cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x129c0f550>

In [578]:
print(log_loss(y_test, cat.predict_proba(X_test)))
f1_score(y_test,cat.predict(X_test))

0.3175762383687645


0.8697718631178707

In [579]:
cat.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))

array([[0.29123087, 0.70876913],
       [0.5696416 , 0.4303584 ],
       [0.66548206, 0.33451794],
       ...,
       [0.78046059, 0.21953941],
       [0.59544165, 0.40455835],
       [0.86397998, 0.13602002]])

In [580]:
print(log_loss(test_filtered.Target, cat.predict_proba\
               (test_filtered.drop(columns=['Opportunity_ID', 'Target']))))
f1_score(test_filtered.Target, cat.predict(test_filtered.drop(columns=['Opportunity_ID', 'Target'])))

0.43648457997588985


0.8170212765957446

In [581]:
final = test_filtered.loc[:, ['Opportunity_ID', 'Target']].copy()
final['Target'] = pd.DataFrame(cat.predict_proba(test_filtered.drop(columns=['Opportunity_ID', 'Target']))[:, 1])
final.to_csv('predictions/new_cat.csv', index=False)

In [582]:
X_train.columns

Index(['Convertibility', 'Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD',
       'Year_Creation', 'Month_Creation', 'Year_Delivery', 'Days_Passed',
       'Wait_Time_Days', 'Delivery_Window', 'Account_LifeSpan_at_Creation',
       'Account_LifeSpan_at_Deliv', 'Wait_Delivery_Cmp', 'Days_Left_Cmp',
       'USD_Per_Day_Waited', 'USD_Per_Day_Passed', 'USD_Per_Account_Day',
       'Total_Products', 'Product_Price_Mean', 'Product_Price_Std',
       'Product_Price_Min', 'Product_Family_target', 'Product_Name_target',
       'Territory_target', 'Billing_Country_target', 'Delivery_Terms_target',
       'Account_Type_target', 'Bureaucratic_Code_target',
       'Account_Name_target', 'Account_Owner_target',
       'Opportunity_Owner_target', 'Or', 'Last_Modified_By_Owners',
       'Expiry_Days_Create', 'Expiry_Days_Modified',
       'Days_Modified_Diff_to_Start_Delivery',
       'Days_Modified_Diff_to_End_Delivery'],
      dtype='object')

In [583]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import make_column_selector
from sklearn.impute import KNNImputer

#numeric_cols = X_train.select_dtypes(include=['int', 'float64']).columns.to_list()
numeric_cols = ['Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD']

# Transformaciones para las variables numéricas
numeric_transformer = Pipeline(
                        steps=[
                            ('imputer', SimpleImputer()),
                            ('scaler', KBinsDiscretizer(n_bins=5, strategy='quantile'))
                        ]
                      )

preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', numeric_transformer, numeric_cols)
                        ],
                    remainder='passthrough'
                    )

In [584]:
cat = ctb.CatBoostClassifier(silent=True, **best)

pipe = Pipeline([('preprocessing', preprocessor),('rf',cat)])

X_train = X_train.replace([np.inf,-np.inf], np.nan)
#X_test = X_test.replace([np.inf,-np.inf], np.nan)

#test_filtered = test_filtered.replace([np.inf,-np.inf], np.nan)
# Train
# ==============================================================================
# Se asigna el resultado a _ para que no se imprima por pantalla
_ = pipe.fit(X=X_train, y=y_train)

In [585]:
print(log_loss(y_test, pipe.predict_proba(X_test)))
f1_score(y_test,pipe.predict(X_test))

0.3162949930822135


0.8682834046600095

In [586]:
print(log_loss(test_filtered.Target, pipe.predict_proba\
               (test_filtered.drop(columns=['Opportunity_ID', 'Target']))))
f1_score(test_filtered.Target, pipe.predict(test_filtered.drop(columns=['Opportunity_ID', 'Target'])))

0.439228729394903


0.8132271892222903

In [587]:
best = {'learning_rate': 0.01,
 'l2_leaf_reg': 5,
 'iterations': 250,
 'depth': 7,
 'border_count': 135}

cat_bag = ctb.CatBoostClassifier(silent=True, **best)

cat_bagging = BaggingClassifier(base_estimator= pipe, n_estimators=10, random_state=0)

cat_bagging.fit(X_train, y_train)

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
print(log_loss(y_test, cat_bagging.predict_proba(X_test)))
f1_score(y_test,cat_bagging.predict(X_test))

In [None]:
print(log_loss(test_filtered.Target, cat_bagging.predict_proba\
               (test_filtered.drop(columns=['Opportunity_ID', 'Target']))))
f1_score(test_filtered.Target, cat_bagging.predict(test_filtered.drop(columns=['Opportunity_ID', 'Target'])))