In [94]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import category_encoders as ce

In [95]:
# Load the prepared train/test CSVs; +/-inf values are turned into NaN
# right away so that later steps only have to deal with missing values.
INFINITIES = [np.inf, -np.inf]

train_set = pd.read_csv('train_prepared.csv', encoding='utf-8').replace(INFINITIES, np.nan)
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8').replace(INFINITIES, np.nan)

In [96]:
# Preview the first rows of the training frame as the cell output.
train_set.head()

Unnamed: 0,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Opportunity_ID,Quote_Type,Target,Convertibility,Total_Amount_Sum_USD,Total_Taxable_Amount_USD,...,Opportunity_Type_target,Source _target,Region_target,Or,Expiry_Days_Create,Expiry_Days_Modified,Days_Modified_Diff_to_Start_Delivery,Days_Modified_Diff_to_End_Delivery,Quarter_Creation,Quarter_Delivery
0,1,1,1,1,0,1,0,1.131096,5964043.8,5964043.8,...,0.517054,0.509776,0.46282,0,54.0,-135.0,43.0,-17.0,4,2
1,0,0,0,0,1,1,1,1.131094,54552.68,54552.68,...,0.517054,0.509776,0.46282,0,44.0,5.0,-3.0,-5.0,4,1
2,0,0,0,0,2,1,1,1.0,83865.6,83865.6,...,0.517054,0.438859,0.444896,0,297.0,1.0,248.0,248.0,4,1
3,1,0,1,0,3,1,0,1.0,7421881.5,7421881.5,...,0.156954,0.632822,0.444896,1,,,54.0,-4.0,4,1
4,1,0,1,0,4,1,0,1.0,13357192.5,13357192.5,...,0.156954,0.632822,0.444896,1,542.0,-298.0,54.0,27.0,4,1


In [97]:
# Label vector, then the feature matrix: every column of train_set except
# the label itself and the Opportunity_ID column.
y = train_set['Target']
X = train_set.drop(columns=['Target', 'Opportunity_ID'])

In [98]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Baseline CatBoost with default hyperparameters; the next cell reads its
# feature importances to decide which columns to keep.
cat = ctb.CatBoostClassifier(silent=True)
cat.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0xbdc84f0>

In [99]:
# Print every feature with its importance in the baseline model and collect
# the names whose importance is at least 1.
f_imp = cat.feature_importances_
f = X_test.columns
# 'Opportunity_ID' and 'Target' are always kept so the filtered frames still
# carry them; the later cells drop them again before fitting/predicting.
fts = ['Opportunity_ID', 'Target']

for name, importance in zip(f, f_imp):
    print("{: >10}\t\t{: >50}".format(name, importance))
    if importance >= 1:
        fts.append(name)


Pricing, Delivery_Terms_Quote_Appr		                               0.21392452698740255
Pricing, Delivery_Terms_Approved		                                0.5275456635330713
Bureaucratic_Code_0_Approval		                               0.49755601801424015
Bureaucratic_Code_0_Approved		                               0.13713396866727937
Quote_Type		                               0.03917718277168097
Convertibility		                                 3.363900280649805
Total_Amount_Sum_USD		                                1.1015124095825457
Total_Taxable_Amount_USD		                                 2.405097020450264
Year_Creation		                                1.6368461099199387
Month_Creation		                                1.5012355691918575
Year_Delivery		                                1.6764926731210048
Month_Delivery		                                 1.032218663699171
Days_Passed		                                 4.069970428874594
Wait_Time_Days		                                 7.97529

In [100]:
# Keep only the columns listed in fts (ID, label and the selected features).
train_filtered = train_set.loc[:, fts]

In [101]:
# Rebuild label and feature matrix from the reduced frame; the ID and label
# columns are removed so only the selected features remain in X.
y = train_filtered['Target']
X = train_filtered.drop(columns=['Opportunity_ID', 'Target'])

In [102]:
# Split the reduced feature matrix with the same test_size and random_state
# that were used for the first split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [103]:
# Apply the same column selection to the test frame. fts contains 'Target',
# so test_set must provide that column as well.
test_filtered = test_set.loc[:, fts]

In [104]:
# Preview the first rows of the filtered test frame as the cell output.
test_filtered.head()

Unnamed: 0,Opportunity_ID,Target,Convertibility,Total_Amount_Sum_USD,Total_Taxable_Amount_USD,Year_Creation,Month_Creation,Year_Delivery,Month_Delivery,Days_Passed,...,Delivery_Terms_target,Bureaucratic_Code_target,Opportunity_Type_target,Source _target,Region_target,Or,Expiry_Days_Create,Expiry_Days_Modified,Days_Modified_Diff_to_Start_Delivery,Days_Modified_Diff_to_End_Delivery
0,10689,1,1.131098,415586.8,415586.8,2019,4,2019,5,0.0,...,0.479354,0.593111,0.702041,0.438859,0.46282,0,18.0,18.0,-19.0,-49.0
1,10690,1,1.131096,857126.2,857126.2,2019,4,2019,6,13.0,...,0.479354,0.593111,0.702041,0.438859,0.46282,0,18.0,5.0,-37.0,-64.0
2,10691,1,1.0,21037.5,21037.5,2019,4,2019,10,4.0,...,0.543384,0.593111,0.687187,0.556701,0.444896,0,,,-168.0,-168.0
3,10692,1,1.0,2169106.0,2169106.0,2019,4,2019,12,4.0,...,0.543384,0.283333,0.517054,0.632822,0.444896,0,,,-238.0,-238.0
4,10693,1,1.0,5752.5,5752.5,2019,4,2019,5,1.0,...,0.543384,0.593111,0.687187,0.556701,0.444896,0,30.0,29.0,-19.0,-19.0


In [105]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer


# Columns routed through the numeric branch: float64 and integer dtypes.
numeric_cols = list(X_train.select_dtypes(include=['float64', 'int']).columns)

# Transformations for the numeric variables: impute missing values with
# SimpleImputer's defaults, then bucket every column into 5 quantile bins.
# The second step keeps the name 'scaler' although it holds a discretizer.
imputer_step = ('imputer', SimpleImputer())
binning_step = ('scaler', KBinsDiscretizer(n_bins=5, strategy='quantile'))
numeric_transformer = Pipeline(steps=[imputer_step, binning_step])

# Any column not listed in numeric_cols is passed through unchanged.
numeric_branch = ('numeric', numeric_transformer, numeric_cols)
preprocessor = ColumnTransformer(
    transformers=[numeric_branch],
    remainder='passthrough',
)

In [None]:
# The preprocessing steps and the model are combined in a single pipeline.
best = dict(
    learning_rate=0.01,
    l2_leaf_reg=5,
    iterations=500,
    depth=7,
    border_count=200,
)

cat = ctb.CatBoostClassifier(silent=True, **best)

# The final step keeps the name 'rf' even though the estimator is CatBoost.
pipe = Pipeline(steps=[('preprocessing', preprocessor), ('rf', cat)])

# Train
# ==============================================================================
# The result is assigned to _ so that it is not echoed as the cell output.
_ = pipe.fit(X=X_train, y=y_train)



In [None]:
# Log loss of the pipeline on the held-out validation split.
val_proba_pipe = pipe.predict_proba(X_test)
log_loss(y_test, val_proba_pipe)

In [None]:
# Log loss of the pipeline on the prepared test file.
# labels= is pinned to the classes the pipeline's classifier was trained on,
# in the column order predict_proba uses. Without it log_loss infers the
# labels from y_true and raises ValueError when y_true holds a single class.
# NOTE(review): the previewed test rows all show Target == 1 -- confirm the
# column holds real labels, otherwise this score is not meaningful.
test_features = test_filtered.drop(columns=['Opportunity_ID', 'Target'])
log_loss(test_filtered.Target, pipe.predict_proba(test_features), labels=pipe.classes_)

# CatBoost solo

In [None]:
# CatBoost on its own: same hyperparameter values as the pipeline model, but
# fitted directly on the selected features without the preprocessing step.
# Rebinding `cat` creates a new estimator; the one stored inside `pipe` is a
# different object and is not affected.
best = dict(
    learning_rate=0.01,
    l2_leaf_reg=5,
    iterations=500,
    depth=7,
    border_count=200,
)
cat = ctb.CatBoostClassifier(silent=True, **best)
cat.fit(X_train, y_train)

In [None]:
# Log loss of the stand-alone CatBoost model on the held-out validation split.
val_proba_cat = cat.predict_proba(X_test)
log_loss(y_test, val_proba_cat)

In [None]:
# Log loss of the stand-alone CatBoost model on the prepared test file.
# labels= is pinned to the classes the model was trained on, in the column
# order predict_proba uses. Without it log_loss infers the labels from y_true
# and raises ValueError when y_true holds a single class.
# NOTE(review): the previewed test rows all show Target == 1 -- confirm the
# column holds real labels, otherwise this score is not meaningful.
test_features = test_filtered.drop(columns=['Opportunity_ID', 'Target'])
log_loss(test_filtered.Target, cat.predict_proba(test_features), labels=cat.classes_)

# Guardado de la predicción

In [None]:
# Write the submission file: one row per opportunity with the probability the
# pipeline assigns to the second class (column 1 of predict_proba).
# NOTE(review): this uses `pipe`, not the stand-alone `cat` model trained in
# the previous section -- confirm that is the intended model.
test_features = test_filtered.drop(columns=['Opportunity_ID', 'Target'])

final = test_filtered.loc[:, ['Opportunity_ID', 'Target']].copy()
# Assign the ndarray directly: it is matched to the rows by position.
# Wrapping it in a new DataFrame would align on index labels and silently
# produce NaN/misplaced values if test_filtered's index is not 0..n-1.
final['Target'] = pipe.predict_proba(test_features)[:, 1]
final.to_csv('new_cat.csv', index=False)