In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import os
import yaml

In [2]:
df = pd.read_csv("../data/processed/train_processed.csv")
X = df.drop('outcome', axis=1)
y = df.outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [3]:
with open("../params.yaml", 'r') as file:
    params = yaml.safe_load(file)
    
rfc_params = params['random_forest_classifier']
lgbm_params = params['lightgbm']
xgboost_params = params['xgboost']

In [4]:
model_rfc = RandomForestClassifier(**rfc_params, random_state=123)
model_lgbm = LGBMClassifier(**lgbm_params, random=123)
model_xgboost = XGBClassifier(**xgboost_params, random_state=123)

In [5]:
model_logreg = LogisticRegression(max_iter=1000, solver='lbfgs')

In [6]:
model_knn = KNeighborsClassifier(n_neighbors=5)

# Model Stacking

In [None]:
stacking_model = StackingClassifier(
    estimators = [
        ('random_forest_classifier', model_rfc),
        ('xgboost', model_xgboost),
        ('lightgbm', model_lgbm)
    ],
    final_estimator=model_logreg,
    cv=5
)
stacking_model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 2067, number of negative: 2065
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1620
[LightGBM] [Info] Number of data points in the train set: 4132, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500242 -> initscore=0.000968
[LightGBM] [Info] Start training from score 0.000968
[LightGBM] [Info] Number of positive: 1653, number of negative: 1652
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1592
[LightGBM] [Info] Number of data points in the train set: 3305, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500151 -> initscore=0.000605
[LightGBM] [Info] Start training from score 0.000605
[LightGBM] [Info] Numb

0,1,2
,estimators,"[('random_forest_classifier', ...), ('xgboost', ...), ...]"
,final_estimator,LogisticRegre...max_iter=1000)
,cv,5
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,n_estimators,364
,criterion,'gini'
,max_depth,10
,min_samples_split,19
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,8
,learning_rate,0.028
,n_estimators,75
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [8]:
y_pred = stacking_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.73      0.71      0.72       526
         1.0       0.71      0.72      0.72       508

    accuracy                           0.72      1034
   macro avg       0.72      0.72      0.72      1034
weighted avg       0.72      0.72      0.72      1034



In [9]:
df_test = pd.read_csv("../data/processed/test_processed.csv")
Xtest = df.drop('outcome', axis=1)
ytest = df.outcome

In [10]:
y_pred = stacking_model.predict(Xtest)
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

         0.0       0.77      0.76      0.76      2591
         1.0       0.76      0.77      0.77      2575

    accuracy                           0.76      5166
   macro avg       0.76      0.76      0.76      5166
weighted avg       0.76      0.76      0.76      5166



In [13]:
import pickle as pkl
with open("model2.pkl", "wb") as file:
    pkl.dump(stacking_model, file)