In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb
from sklearn.metrics import log_loss
import numpy as np
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

In [2]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

train_set = train_set.replace([-np.inf, np.inf], np.nan)
test_set = test_set.replace([-np.inf, np.inf], np.nan)

train_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                          "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

test_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                         "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

In [3]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [5]:
final = ctb.CatBoostClassifier(silent=True)

In [6]:
model_rf = RandomForestClassifier(n_estimators= 50, min_samples_split= 2, 
                           min_samples_leaf=5, max_features= 'sqrt', 
                           max_depth= 25,bootstrap= False)

In [7]:
best = {'learning_rate': 0.01,
 'l2_leaf_reg': 2.45,
 'iterations': 500,
 'depth': 5,
 'border_count': 200}

model_cat = ctb.CatBoostClassifier(silent=True)

In [8]:
xgb_best = {'colsample_bytree': 0.84, 
        'gamma': 0.49, 'learning_rate': 0.05, 'max_depth': 23,
        'min_child_weight': 6.0, 'n_estimators': 50, 'subsample': 0.84}

model_xgb = xgb.XGBClassifier()

In [9]:
lgbm_best = {'subsample_for_bin': 250000, 'subsample': 0.8131313131313131, 
        'reg_lambda': 0.2857142857142857, 'reg_alpha': 0.18367346938775508, 
        'num_leaves': 35, 'min_child_samples': 85, 'learning_rate': 0.07016445423929361, 
        'colsample_bytree': 0.3, 'boosting_type': 'gbdt'}

model_lgbm = lgb.LGBMClassifier(**lgbm_best)

In [10]:
model_svm = svm.SVC(probability=True)

In [11]:
best_lr = {'C': 1.8017093069504055, 'penalty': 'l1', 'solver': 'liblinear'}

model_lr = LogisticRegression(max_iter=1000)

In [12]:
model_knn = KNeighborsClassifier(n_neighbors=33)

In [13]:
from sklearn.ensemble import StackingClassifier

rf_cat_xgb_lgbm = StackingClassifier(estimators=[('rf', model_rf), \
                                        ('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm), 
                                        ('svm', model_svm), ('knn', model_knn), ('lr', model_lr)]
                              , final_estimator = LogisticRegression())

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

numeric_cols = X_train.select_dtypes(include=['int', 'float64']).columns.to_list()
#numeric_cols = ['Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD']

# Transformaciones para las variables numéricas
numeric_transformer = Pipeline(
                        steps=[
                            ('imputer', SimpleImputer()),
                            ('scaler', StandardScaler())
                        ]
                      )

preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', numeric_transformer, numeric_cols)
                        ],
                    remainder='passthrough'
                    )

In [15]:
pipe1 = Pipeline([('preprocessing', preprocessor),('stacking', rf_cat_xgb_lgbm)])

In [16]:
pipe1.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [17]:
log_loss(y_test, pipe1.predict_proba(X_test))

0.307545552194805

In [18]:
p1 = pd.DataFrame(pipe1.predict_proba(test_set.drop(columns=['Opportunity_ID']))[:, 1])
p1['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p1.columns = ['Target', 'Opportunity_ID']
p1.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.930978
1,10690,0.760158
2,10691,0.462854
3,10692,0.143901
4,10693,0.943738
...,...,...
1562,12364,0.949985
1563,12365,0.169665
1564,12366,0.179067
1565,12367,0.765002


In [19]:
p1.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/stacking_all_models.csv', index=False)

In [20]:
from sklearn.ensemble import StackingClassifier

final = ctb.CatBoostClassifier(silent=True)

knn_lr_svm = StackingClassifier(estimators=[('knn', model_rf), \
                                        ('lr', model_lr),('svm',model_lr)], final_estimator = LogisticRegression())

pipe2 = Pipeline([('preprocessing', preprocessor),('stacking', knn_lr_svm)])

In [21]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [22]:
log_loss(y_test, pipe2.predict_proba(X_test))

0.3360201562928716

In [23]:
p2 = pd.DataFrame(pipe2.predict_proba(test_set.drop(columns=['Opportunity_ID']))[:, 1])
p2['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p2.columns = ['Target', 'Opportunity_ID']
p2.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.905333
1,10690,0.738675
2,10691,0.503509
3,10692,0.249574
4,10693,0.919733
...,...,...
1562,12364,0.951655
1563,12365,0.259333
1564,12366,0.199016
1565,12367,0.486106


In [24]:
p2.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/stacking_worst_models.csv', index=False)

In [None]:
cat_xgb_lgbm = StackingClassifier(estimators=[('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm)]
                              , final_estimator = LogisticRegression())

pipe3 = Pipeline([('preprocessing', preprocessor),('stacking', cat_xgb_lgbm)])

pipe3.fit(X_train, y_train)

In [None]:
print(log_loss(y_test, pipe3.predict_proba(X_test)))
f1_score(y_test, pipe3.predict(X_test))

In [None]:
p3 = pd.DataFrame(pipe3.predict_proba(test_set.drop(columns=['Opportunity_ID']))[:, 1])
p3['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p3.columns = ['Target', 'Opportunity_ID']
p3.loc[:, ['Opportunity_ID', 'Target']]

In [None]:
p3.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/stacking_best_models.csv', index=False)