In [88]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb
from sklearn.metrics import log_loss
import numpy as np
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

In [89]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

train_set = train_set.replace([-np.inf, np.inf], np.nan)
test_set = test_set.replace([-np.inf, np.inf], np.nan)

train_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                          "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

test_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                         "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

In [90]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [91]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [92]:
final = ctb.CatBoostClassifier(silent=True)

In [93]:
model_rf = RandomForestClassifier(n_estimators= 2000, min_samples_split= 2, 
                           min_samples_leaf=1, max_features= 'sqrt', 
                           max_depth= 30,bootstrap= False)

In [94]:
best = {'learning_rate': 0.01,
 'l2_leaf_reg': 2.55,
 'iterations': 265,
 'depth': 7,
 'border_count': 135}

model_cat = ctb.CatBoostClassifier(silent=True)

In [95]:
xgb_best = {'colsample_bytree': 0.84, 
        'gamma': 0.49, 'learning_rate': 0.04, 'max_depth': 23,
        'min_child_weight': 6.0, 'n_estimators': 55, 'subsample': 0.84}

model_xgb = xgb.XGBClassifier()

In [96]:
lgbm_best = {'subsample_for_bin': 250000, 'subsample': 0.8131313131313131, 
        'reg_lambda': 0.2857142857142857, 'reg_alpha': 0.18367346938775508, 
        'num_leaves': 35, 'min_child_samples': 83, 'learning_rate': 0.07016445423929361, 
        'colsample_bytree': 0.6, 'boosting_type': 'gbdt'}

model_lgbm = lgb.LGBMClassifier()

In [97]:
model_svm = svm.SVC(probability=True)

In [98]:
best_lr = {'C': 2.976522963488366, 'penalty': 'l1', 'solver': 'liblinear'}

model_lr = LogisticRegression()

In [99]:
model_knn = KNeighborsClassifier()

In [100]:
model_nb = GaussianNB()

In [101]:
bag_knn = BaggingClassifier(base_estimator= model_knn, n_estimators=8, random_state=0)
bag_cat = BaggingClassifier(base_estimator= model_cat, n_estimators=8, random_state=0)
bag_lr = BaggingClassifier(base_estimator= model_lr, n_estimators=8, random_state=0)
bag_rf = BaggingClassifier(base_estimator= model_rf, n_estimators=8, random_state=0)
bag_svm = BaggingClassifier(base_estimator= model_svm, n_estimators=8, random_state=0)
bag_nb = BaggingClassifier(base_estimator= model_nb, n_estimators=8, random_state=0)

In [102]:
from sklearn.ensemble import StackingClassifier

rf_cat_xgb_lgbm = StackingClassifier(estimators=[('rf', model_rf), \
                                        ('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm)]
                              , final_estimator = final)

In [103]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

numeric_cols = X_train.select_dtypes(include=['int', 'float64']).columns.to_list()
#numeric_cols = ['Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD']

# Transformaciones para las variables num√©ricas
numeric_transformer = Pipeline(
                        steps=[
                            ('imputer', SimpleImputer()),
                            ('scaler', StandardScaler())
                        ]
                      )

preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', numeric_transformer, numeric_cols)
                        ],
                    remainder='passthrough'
                    )

In [104]:
pipe = Pipeline([('preprocessing', preprocessor),('stacking', rf_cat_xgb_lgbm)])

In [105]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [106]:
log_loss(y_test, pipe.predict_proba(X_test))

0.28620704828374965

In [107]:
from sklearn.metrics import f1_score
print(f1_score(test_set.Target, pipe.predict(test_set.drop(columns=['Opportunity_ID', 'Target']))))
log_loss(test_set.Target, pipe.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target'])))

0.792654028436019


0.5434076954418902

In [108]:
from sklearn.ensemble import StackingClassifier

final = ctb.CatBoostClassifier(silent=True)

rf_lr_svm = StackingClassifier(estimators=[('rf', model_rf), \
                                        ('lr', model_lr),('svm',model_lr)], final_estimator = final)

pipe = Pipeline([('preprocessing', preprocessor),('stacking', rf_lr_svm)])

In [109]:
pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [110]:
log_loss(y_test, pipe.predict_proba(X_test))

0.3108944794039116

In [111]:
from sklearn.metrics import f1_score
print(f1_score(test_set.Target, pipe.predict(test_set.drop(columns=['Opportunity_ID', 'Target']))))
log_loss(test_set.Target, pipe.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target'])))

0.7851851851851852


0.4778058844547014

In [112]:
best_finl = {'learning_rate': 0.01,
 'l2_leaf_reg': 5,
 'iterations': 350,
 'depth': 7,
 'border_count': 135}

cat_xgb_lgbm = StackingClassifier(estimators=[('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm)]
                              , final_estimator = svm.SVC(probability=True))

pipe = Pipeline([('preprocessing', preprocessor),('stacking', cat_xgb_lgbm)])

pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [113]:
print(log_loss(y_test, pipe.predict_proba(X_test)))
f1_score(y_test, pipe.predict(X_test))

0.350355595248929


0.8812589413447782

In [114]:
print(f1_score(test_set.Target, pipe.predict(test_set.drop(columns=['Opportunity_ID', 'Target']))))
log_loss(test_set.Target, pipe.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))) 

0.7971360381861575


0.5368685820046675

In [115]:
#sub.to_csv('predictions/stacking.csv', index=False)