In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb
from sklearn.metrics import log_loss
import numpy as np
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

In [2]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

train_set = train_set.replace([-np.inf, np.inf], np.nan)
test_set = test_set.replace([-np.inf, np.inf], np.nan)

train_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                          "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

test_set.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", 
                         "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

In [3]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [5]:
model_rf = RandomForestClassifier(n_estimators= 50, min_samples_split= 2, 
                           min_samples_leaf=5, max_features= 'sqrt', 
                           max_depth= 25,bootstrap= False)

In [6]:
best = {'learning_rate': 0.01,
 'l2_leaf_reg': 2.45,
 'iterations': 500,
 'depth': 5,
 'border_count': 200}

model_cat = ctb.CatBoostClassifier(silent=True)

In [7]:
xgb_best = {'colsample_bytree': 0.84, 
        'gamma': 0.49, 'learning_rate': 0.05, 'max_depth': 23,
        'min_child_weight': 6.0, 'n_estimators': 50, 'subsample': 0.84}

model_xgb = xgb.XGBClassifier()

In [8]:
lgbm_best = {'subsample_for_bin': 250000, 'subsample': 0.8131313131313131, 
        'reg_lambda': 0.2857142857142857, 'reg_alpha': 0.18367346938775508, 
        'num_leaves': 35, 'min_child_samples': 85, 'learning_rate': 0.07016445423929361, 
        'colsample_bytree': 0.3, 'boosting_type': 'gbdt'}

model_lgbm = lgb.LGBMClassifier(**lgbm_best)

In [9]:
model_svm = svm.SVC(probability=True)

In [10]:
best_lr = {'C': 1.8017093069504055, 'penalty': 'l1', 'solver': 'liblinear'}

model_lr = LogisticRegression(max_iter=1000)

In [11]:
model_knn = KNeighborsClassifier(n_neighbors=33)

In [12]:
from sklearn.ensemble import StackingClassifier

rf_cat_xgb_lgbm = VotingClassifier(estimators=[('rf', model_rf), \
                                        ('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm), 
                                        ('svm', model_svm), ('knn', model_knn), ('lr', model_lr)]
                              , voting='soft')

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

numeric_cols = X_train.select_dtypes(include=['int', 'float64']).columns.to_list()
#numeric_cols = ['Total_Amount_Sum_USD', 'Total_Taxable_Amount_USD']

# Transformaciones para las variables numéricas
numeric_transformer = Pipeline(
                        steps=[
                            ('imputer', SimpleImputer()),
                            ('scaler', StandardScaler())
                        ]
                      )

preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', numeric_transformer, numeric_cols)
                        ],
                    remainder='passthrough'
                    )

In [14]:
pipe1 = Pipeline([('preprocessing', preprocessor),('voting', rf_cat_xgb_lgbm)])

In [15]:
pipe1.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [16]:
log_loss(y_test, pipe1.predict_proba(X_test))

0.3323680794514107

In [17]:
p1 = pd.DataFrame(pipe1.predict_proba(test_set.drop(columns=['Opportunity_ID']))[:, 1])
p1['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p1.columns = ['Target', 'Opportunity_ID']
p1.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.889292
1,10690,0.752738
2,10691,0.488283
3,10692,0.369197
4,10693,0.924817
...,...,...
1562,12364,0.925412
1563,12365,0.309756
1564,12366,0.321415
1565,12367,0.670238


In [18]:
p1.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/voting_all_models.csv', index=False)

In [19]:
from sklearn.ensemble import StackingClassifier

final = ctb.CatBoostClassifier(silent=True)

knn_lr_svm = VotingClassifier(estimators=[('knn', model_rf), \
                                        ('lr', model_lr),('svm',model_lr)],
                              voting='soft')

pipe2 = Pipeline([('preprocessing', preprocessor),('voting', knn_lr_svm)])

In [20]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [21]:
log_loss(y_test, pipe2.predict_proba(X_test))

0.3839560087149259

In [22]:
p2 = pd.DataFrame(pipe2.predict_proba(test_set.drop(columns=['Opportunity_ID']))[:, 1])
p2['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p2.columns = ['Target', 'Opportunity_ID']
p2.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.854241
1,10690,0.770478
2,10691,0.511125
3,10692,0.469132
4,10693,0.873491
...,...,...
1562,12364,0.896275
1563,12365,0.387518
1564,12366,0.377121
1565,12367,0.585545


In [23]:
p2.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/voting_worst_models.csv', index=False)

In [24]:
cat_xgb_lgbm = VotingClassifier(estimators=[('cat', model_cat),('xgb',model_xgb), ('lgbm', model_lgbm)]
                              , voting='soft')

pipe3 = Pipeline([('preprocessing', preprocessor),('voting', cat_xgb_lgbm)])

pipe3.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Amount_Sum_USD',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Month_Delivery',
                    

In [26]:
print(log_loss(y_test, pipe3.predict_proba(X_test)))

0.29928859562598303


In [27]:
p3 = pd.DataFrame(pipe3.predict_proba(test_set.drop(columns=['Opportunity_ID']))[:, 1])
p3['Opportunity_ID'] = test_set.Opportunity_ID.to_list()
p3.columns = ['Target', 'Opportunity_ID']
p3.loc[:, ['Opportunity_ID', 'Target']]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.963660
1,10690,0.744223
2,10691,0.514019
3,10692,0.149301
4,10693,0.946525
...,...,...
1562,12364,0.990031
1563,12365,0.183628
1564,12366,0.194174
1565,12367,0.760209


In [28]:
p3.loc[:, ['Opportunity_ID', 'Target']].to_csv('predictions/voting_best_models.csv', index=False)