In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import category_encoders as ce

In [2]:
# Load the prepared train and test CSV files (both read as UTF-8).
train_set, test_set = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('train_prepared.csv', 'test_prepared.csv')
)

In [3]:
# Preview the first five rows of the training table.
train_set.head(n=5)

Unnamed: 0,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Opportunity_ID,Quote_Type,Target,Convertibility,Total_Amount_Sum_USD,Total_Taxable_Amount_USD,...,Opportunity_Type_target,Source _target,Region_target,Or,Expiry_Days_Create,Expiry_Days_Modified,Days_Modified_Diff_to_Start_Delivery,Days_Modified_Diff_to_End_Delivery,Quarter_Creation,Quarter_Delivery
0,1,1,1,1,0,1,0,1.131096,5964043.8,5964043.8,...,0.517054,0.509776,0.46282,0,54.0,-135.0,43.0,-17.0,4,2
1,0,0,0,0,1,1,1,1.131094,54552.68,54552.68,...,0.517054,0.509776,0.46282,0,44.0,5.0,-3.0,-5.0,4,1
2,0,0,0,0,2,1,1,1.0,83865.6,83865.6,...,0.517054,0.438859,0.444896,0,297.0,1.0,248.0,248.0,4,1
3,1,0,1,0,3,1,0,1.0,7421881.5,7421881.5,...,0.156954,0.632822,0.444896,1,,,54.0,-4.0,4,1
4,1,0,1,0,4,1,0,1.0,13357192.5,13357192.5,...,0.156954,0.632822,0.444896,1,542.0,-298.0,54.0,27.0,4,1


In [4]:
# Label vector and predictor matrix: every column except the label and the
# opportunity identifier is used as a predictor.
y = train_set['Target']
X = train_set.drop(columns=['Opportunity_ID', 'Target'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# CatBoost classifier with default hyper-parameters and logging silenced;
# its feature importances are read in the next cell.
cat = ctb.CatBoostClassifier(silent=True)
cat.fit(X_train, y_train)

In [None]:
# Importance score of each feature in the fitted CatBoost model, paired with
# the column names of the data it was trained on.
f_imp = cat.feature_importances_
f = X_test.columns

# Columns to keep: the identifier and the label are always retained, and a
# feature joins them when its importance is at least 1.
fts = ['Opportunity_ID', 'Target']

for feature_name, importance in zip(f, f_imp):
    print("{: >10}\t\t{: >50}".format(feature_name, importance))
    if importance >= 1:
        fts.append(feature_name)

In [None]:
# Keep only the selected columns and turn +/-inf into NaN (missing values).
train_filtered = (
    train_set
    .loc[:, fts]
    .replace([np.inf, -np.inf], np.nan)
)

In [None]:
# Rebuild the label and predictors from the importance-filtered frame.
y = train_filtered['Target']
X = train_filtered.drop(columns=['Target', 'Opportunity_ID'])

In [None]:
# Re-split the filtered data with the same test fraction (20%) and seed (123)
# as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [None]:
# Disabled: randomized hyper-parameter search for CatBoost (5-fold CV,
# 200 sampled configurations, scored by negative log loss).
# NOTE(review): every value in the `best` dict used further down lies on this
# grid, so it presumably came from this search - confirm before relying on it.
#params = {'depth':[3,1,2,6,4,5,7,8,9,10],
#          'iterations':[250,100,500,1000],
#          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
#          'l2_leaf_reg':[3,1,5,10,100],
#          'border_count':[32,5,10,20,50,100,200]
          #'ctr_border_count':[50,5,10,20,100,200]
#            }

# fitted on the training split (X_train, y_train)

#from sklearn.model_selection import RandomizedSearchCV

#cat_tuning = ctb.CatBoostClassifier(silent=True)

#random_search = RandomizedSearchCV(cat_tuning,param_distributions=params,n_iter=200, scoring='neg_log_loss',n_jobs=-1,cv=5,verbose=3)

#random_search.fit(X_train,y_train)

In [None]:
#random_search

In [None]:
# Apply the same column selection AND the same +/-inf -> NaN clean-up that the
# training frame receives. Without the replacement an infinite value in the
# test data would reach the SimpleImputer step of the pipeline, whose input
# validation rejects infinities, while the training data had them removed.
test_filtered = (
    test_set
    .loc[:, fts]
    .replace([np.inf, -np.inf], np.nan)
)

In [None]:
# Preview the filtered test frame (first rows only, like the training preview)
# instead of rendering the whole table.
test_filtered.head()

In [None]:
#test_pred_CB = random_search.predict_proba(test_filtered.drop(columns=['Opportunity_ID', 'Target']))

In [None]:
#test_set['Pred_Tuned'] = pd.DataFrame(test_pred_CB)[1].to_list()
#test_set

In [None]:
#log_loss(test_set.Target, test_pred_CB)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

# Columns that get the numeric treatment: float64 and integer dtypes.
numeric_cols = list(
    X_train.select_dtypes(include=['float64', 'int']).columns
)

# Transformations for the numeric variables: impute missing values with the
# SimpleImputer defaults, then discretise each column into 5 quantile bins.
# (The second step is labelled 'scaler' although it is a discretiser.)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', KBinsDiscretizer(n_bins=5, strategy='quantile')),
])

# Numeric columns go through the transformer above; any other column is
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('numeric', numeric_transformer, numeric_cols)],
    remainder='passthrough',
)

In [None]:
# Hyper-parameters passed to the CatBoost classifier.
best = dict(
    learning_rate=0.01,
    l2_leaf_reg=5,
    iterations=500,
    depth=7,
    border_count=200,
)

cat = ctb.CatBoostClassifier(silent=True, **best)

# The preprocessing steps and the model are combined into a single pipeline.
# (The model step is labelled 'rf' although it holds the CatBoost classifier.)
pipe = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('rf', cat),
])

# Train
# ==============================================================================
# The result is assigned to _ so that it is not echoed as cell output.
_ = pipe.fit(X_train, y_train)

In [None]:
# Log loss of the preprocessing + CatBoost pipeline on the hold-out split.
log_loss(y_true=y_test, y_pred=pipe.predict_proba(X_test))

In [None]:
# Log loss of the pipeline on the filtered test set.
log_loss(
    y_true=test_filtered['Target'],
    y_pred=pipe.predict_proba(
        test_filtered.drop(columns=['Opportunity_ID', 'Target'])
    ),
)

In [None]:
# Same hyper-parameters as the pipeline model, but fitted directly on the
# filtered features without the imputing/binning preprocessing.
best = dict(
    learning_rate=0.01,
    l2_leaf_reg=5,
    iterations=500,
    depth=7,
    border_count=200,
)

# Rebinding `cat` creates a new estimator; the one already stored inside
# `pipe` is a separate object and is not affected.
cat = ctb.CatBoostClassifier(silent=True, **best)
cat.fit(X_train, y_train)

In [None]:
# Log loss of the standalone CatBoost model on the hold-out split.
log_loss(y_true=y_test, y_pred=cat.predict_proba(X_test))

In [None]:
# Log loss of the standalone CatBoost model on the filtered test set.
log_loss(
    y_true=test_filtered['Target'],
    y_pred=cat.predict_proba(
        test_filtered.drop(columns=['Opportunity_ID', 'Target'])
    ),
)

In [None]:
# Class probabilities from the standalone CatBoost model on the test data.
# The model was fitted on the importance-filtered columns, so the prediction
# input must be built from `test_filtered` (same columns, same order) rather
# than from the unfiltered `test_set`, whose extra columns do not match the
# features the model was trained on.
cat.predict_proba(test_filtered.drop(columns=['Opportunity_ID', 'Target']))

In [None]:
# Build the submission file: one row per opportunity with the pipeline's
# predicted probability in the 'Target' column.
submission_features = test_filtered.drop(columns=['Opportunity_ID', 'Target'])

# Column 1 of predict_proba is the second class in the classifier's class
# order (the positive class if the labels are 0/1 - verify against the data).
positive_proba = pipe.predict_proba(submission_features)[:, 1]

final = test_filtered.loc[:, ['Opportunity_ID']].copy()
# Assign the array positionally. Wrapping it in a fresh DataFrame would align
# on index labels and silently produce NaN if the test frame's index were ever
# not 0..n-1 (e.g. after filtering rows).
final['Target'] = positive_proba
final.to_csv('new_cat.csv', index=False)