In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb
from sklearn.metrics import log_loss
import numpy as np
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

In [57]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

train_set = train_set.replace([-np.inf, np.inf], np.nan)
test_set = test_set.replace([-np.inf, np.inf], np.nan)

train_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                          "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

test_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                         "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

In [58]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [60]:
final = ctb.CatBoostClassifier(silent=True)

In [61]:
model_rf = RandomForestClassifier(n_estimators= 50, min_samples_split= 2, 
                           min_samples_leaf=5, max_features= 'sqrt', 
                           max_depth= 25,bootstrap= False)

In [62]:
best = {'learning_rate': 0.01,
 'l2_leaf_reg': 2.45,
 'iterations': 500,
 'depth': 5,
 'border_count': 200}

model_cat = ctb.CatBoostClassifier(silent=True)

In [63]:
xgb_best = {'colsample_bytree': 0.84, 
        'gamma': 0.49, 'learning_rate': 0.05, 'max_depth': 23,
        'min_child_weight': 6.0, 'n_estimators': 50, 'subsample': 0.84}

model_xgb = xgb.XGBClassifier()

In [64]:
lgbm_best = {'subsample_for_bin': 250000, 'subsample': 0.8131313131313131, 
        'reg_lambda': 0.2857142857142857, 'reg_alpha': 0.18367346938775508, 
        'num_leaves': 35, 'min_child_samples': 85, 'learning_rate': 0.07016445423929361, 
        'colsample_bytree': 0.3, 'boosting_type': 'gbdt'}

model_lgbm = lgb.LGBMClassifier(**lgbm_best)

In [65]:
model_svm = svm.SVC(probability=True)

In [66]:
best_lr = {'C': 1.8017093069504055, 'penalty': 'l1', 'solver': 'liblinear'}

model_lr = LogisticRegression(max_iter=1000)

In [67]:
model_knn = KNeighborsClassifier(n_neighbors=33)

In [68]:
from sklearn.ensemble import StackingClassifier

rf_cat_xgb_lgbm = StackingClassifier(estimators=[('rf', model_rf), \
                                        ('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm), 
                                        ('svm', model_svm), ('knn', model_knn), ('lr', model_lr)]
                              , final_estimator = LogisticRegression())

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

numeric_cols = X_train.select_dtypes(include=['int', 'float64']).columns.to_list()
#numeric_cols = ['Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD']

# Transformaciones para las variables numéricas
numeric_transformer = Pipeline(
                        steps=[
                            ('imputer', SimpleImputer()),
                            ('scaler', StandardScaler())
                        ]
                      )

preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', numeric_transformer, numeric_cols)
                        ],
                    remainder='passthrough'
                    )

In [70]:
pipe1 = Pipeline([('preprocessing', preprocessor),('stacking', rf_cat_xgb_lgbm)])

In [71]:
pipe1.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [72]:
log_loss(y_test, pipe1.predict_proba(X_test))

0.30943095759988076

In [73]:
p1 = pd.DataFrame(pipe1.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))[:, 1])
p1['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p1.columns = ['Target', 'Opportunity_ID']
p1.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.934331
1,10690,0.766277
2,10691,0.469640
3,10692,0.149215
4,10693,0.944905
...,...,...
1562,12364,0.948986
1563,12365,0.171831
1564,12366,0.158789
1565,12367,0.736404


In [74]:
p1.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/stacking_all_models.csv', index=False)

In [78]:
from sklearn.ensemble import StackingClassifier

final = ctb.CatBoostClassifier(silent=True)

knn_lr_svm = StackingClassifier(estimators=[('knn', model_rf), \
                                        ('lr', model_lr),('svm',model_lr)], final_estimator = LogisticRegression())

pipe2 = Pipeline([('preprocessing', preprocessor),('stacking', knn_lr_svm)])

In [79]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [80]:
log_loss(y_test, pipe2.predict_proba(X_test))

0.33470995568856454

In [82]:
p2 = pd.DataFrame(pipe2.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))[:, 1])
p2['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p2.columns = ['Target', 'Opportunity_ID']
p2.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.924333
1,10690,0.676623
2,10691,0.330864
3,10692,0.196808
4,10693,0.944083
...,...,...
1562,12364,0.947195
1563,12365,0.212245
1564,12366,0.208684
1565,12367,0.359329


In [83]:
p2.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/stacking_worst_models.csv', index=False)

In [None]:
sub.to_csv('predictions/stacking.csv', index=False)

In [84]:
from sklearn.metrics import f1_score
print(f1_score(test_set.Target, pipe2.predict(test_set.drop(columns=['Opportunity_ID', 'Target']))))
log_loss(test_set.Target, pipe2.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target'])))

0.7963525835866262


0.475769215860681

In [85]:
cat_xgb_lgbm = StackingClassifier(estimators=[('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm)]
                              , final_estimator = LogisticRegression())

pipe3 = Pipeline([('preprocessing', preprocessor),('stacking', cat_xgb_lgbm)])

pipe3.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [87]:
print(log_loss(y_test, pipe3.predict_proba(X_test)))
f1_score(y_test, pipe3.predict(X_test))

0.3114943763995922


0.8696069031639501

In [89]:
p3 = pd.DataFrame(pipe3.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))[:, 1])
p3['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p3.columns = ['Target', 'Opportunity_ID']
p3.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.951260
1,10690,0.814998
2,10691,0.525687
3,10692,0.094356
4,10693,0.940114
...,...,...
1562,12364,0.953515
1563,12365,0.131826
1564,12366,0.138901
1565,12367,0.827806


In [90]:
p3.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/stacking_best_models.csv', index=False)

In [91]:
print(f1_score(test_set.Target, pipe.predict(test_set.drop(columns=['Opportunity_ID', 'Target']))))
log_loss(test_set.Target, pipe.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))) 

0.7973462002412545


0.46950702655436927