In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier

In [23]:
from sklearn.ensemble import AdaBoostClassifier

In [3]:
data = pd.read_csv("../data/raw/turnover.csv")
data.head()

Unnamed: 0,stag,event,gender,age,industry,profession,traffic,coach,head_gender,greywage,way,extraversion,independ,selfcontrol,anxiety,novator
0,7.030801,1,m,35.0,Banks,HR,rabrecNErab,no,f,white,bus,6.2,4.1,5.7,7.1,8.3
1,22.965092,1,m,33.0,Banks,HR,empjs,no,m,white,bus,6.2,4.1,5.7,7.1,8.3
2,15.934292,1,f,35.0,PowerGeneration,HR,rabrecNErab,no,m,white,bus,6.2,6.2,2.6,4.8,8.3
3,15.934292,1,f,35.0,PowerGeneration,HR,rabrecNErab,no,m,white,bus,5.4,7.6,4.9,2.5,6.7
4,8.410678,1,m,32.0,Retail,Commercial,youjs,yes,f,white,bus,3.0,4.1,8.0,7.1,3.7


In [4]:
X = data.drop(columns = ["event"])
y = data["event"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)

In [6]:
cat_cols = X_train.select_dtypes(include = ["object"]).columns

In [7]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols)  # Apply to categorical
    ]
)

In [9]:
pipeline_log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [10]:
pipeline_log_reg.fit(X_train, y_train)
y_pred = pipeline_log_reg.predict(X_test)
score = accuracy_score(y_test, y_pred)
report_log_reg = classification_report(y_test, y_pred, output_dict = True)
print(score)
print(report_log_reg)

0.613941018766756
{'0': {'precision': 0.6111111111111112, 'recall': 0.5978260869565217, 'f1-score': 0.6043956043956044, 'support': 184.0}, '1': {'precision': 0.616580310880829, 'recall': 0.6296296296296297, 'f1-score': 0.6230366492146597, 'support': 189.0}, 'accuracy': 0.613941018766756, 'macro avg': {'precision': 0.6138457109959701, 'recall': 0.6137278582930756, 'f1-score': 0.6137161268051321, 'support': 373.0}, 'weighted avg': {'precision': 0.6138823678308877, 'recall': 0.613941018766756, 'f1-score': 0.6138410667838121, 'support': 373.0}}


In [11]:
pipeline_svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC())
])

In [12]:
pipeline_svc.fit(X_train, y_train)
y_pred = pipeline_svc.predict(X_test)
score = accuracy_score(y_test, y_pred)
report_svc = classification_report(y_test, y_pred, output_dict = True)
print(score)
print(report_svc)

0.6300268096514745
{'0': {'precision': 0.6385542168674698, 'recall': 0.5760869565217391, 'f1-score': 0.6057142857142858, 'support': 184.0}, '1': {'precision': 0.6231884057971014, 'recall': 0.6825396825396826, 'f1-score': 0.6515151515151515, 'support': 189.0}, 'accuracy': 0.6300268096514745, 'macro avg': {'precision': 0.6308713113322857, 'recall': 0.6293133195307108, 'f1-score': 0.6286147186147186, 'support': 373.0}, 'weighted avg': {'precision': 0.6307683233224307, 'recall': 0.6300268096514745, 'f1-score': 0.6289216949270569, 'support': 373.0}}


In [13]:
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
])

In [14]:
pipeline_xgb.fit(X_train, y_train)
y_pred = pipeline_xgb.predict(X_test)
score = accuracy_score(y_test, y_pred)
report_xgb = classification_report(y_test, y_pred, output_dict = True)
print(score)
print(report_xgb)

0.6005361930294906
{'0': {'precision': 0.5956284153005464, 'recall': 0.592391304347826, 'f1-score': 0.5940054495912807, 'support': 184.0}, '1': {'precision': 0.6052631578947368, 'recall': 0.6084656084656085, 'f1-score': 0.6068601583113457, 'support': 189.0}, 'accuracy': 0.6005361930294906, 'macro avg': {'precision': 0.6004457865976416, 'recall': 0.6004284564067173, 'f1-score': 0.6004328039513132, 'support': 373.0}, 'weighted avg': {'precision': 0.6005103626203909, 'recall': 0.6005361930294906, 'f1-score': 0.600518961516461, 'support': 373.0}}


In [15]:
pipeline_rfc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

In [16]:
pipeline_rfc.fit(X_train, y_train)
y_pred = pipeline_rfc.predict(X_test)
score = accuracy_score(y_test, y_pred)
report_rfc = classification_report(y_test, y_pred, output_dict = True)
print(score)
print(report_rfc)

0.6434316353887399
{'0': {'precision': 0.6440677966101694, 'recall': 0.6195652173913043, 'f1-score': 0.631578947368421, 'support': 184.0}, '1': {'precision': 0.6428571428571429, 'recall': 0.6666666666666666, 'f1-score': 0.6545454545454545, 'support': 189.0}, 'accuracy': 0.6434316353887399, 'macro avg': {'precision': 0.6434624697336562, 'recall': 0.6431159420289855, 'f1-score': 0.6430622009569378, 'support': 373.0}, 'weighted avg': {'precision': 0.6434543554323625, 'recall': 0.6434316353887399, 'f1-score': 0.6432161319701887, 'support': 373.0}}


In [17]:
est = RandomForestClassifier()
pipeline_bagging = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', BaggingClassifier(est))
])

In [18]:
pipeline_bagging.fit(X_train, y_train)
y_pred = pipeline_bagging.predict(X_test)
score = accuracy_score(y_test, y_pred)
report_bagging = classification_report(y_test, y_pred, output_dict = True)
print(score)
print(report_bagging)

0.6514745308310992
{'0': {'precision': 0.6467391304347826, 'recall': 0.6467391304347826, 'f1-score': 0.6467391304347826, 'support': 184.0}, '1': {'precision': 0.656084656084656, 'recall': 0.656084656084656, 'f1-score': 0.656084656084656, 'support': 189.0}, 'accuracy': 0.6514745308310992, 'macro avg': {'precision': 0.6514118932597193, 'recall': 0.6514118932597193, 'f1-score': 0.6514118932597193, 'support': 373.0}, 'weighted avg': {'precision': 0.6514745308310992, 'recall': 0.6514745308310992, 'f1-score': 0.6514745308310992, 'support': 373.0}}


In [19]:
import mlflow

In [20]:
log_reg_params = pipeline_log_reg.get_params
svc_params = pipeline_svc.get_params
xgb_params = pipeline_xgb.get_params
rfc_params = pipeline_rfc.get_params
bagging_params = pipeline_bagging.get_params

In [22]:
mlflow.set_experiment("Logistic Regression Baseline")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

with mlflow.start_run():
    #mlflow.log_params(log_reg_params)
    mlflow.log_metrics({
        "accuracy": report_log_reg["accuracy"],
        "recall_class_1": report_log_reg["1"]["recall"],
        "f1_score": report_log_reg["1"]["f1-score"]
    })
    mlflow.sklearn.log_model(pipeline_log_reg, "prep + base log_reg")



🏃 View run lyrical-chimp-550 at: http://127.0.0.1:5000/#/experiments/924626959793777994/runs/a8a8b7273d6441dc8fd0742c220cb7e1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/924626959793777994


In [24]:
est = RandomForestClassifier()
pipeline_adaboost = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(est))
])

In [25]:
pipeline_adaboost.fit(X_train, y_train)
y_pred = pipeline_adaboost.predict(X_test)
score = accuracy_score(y_test, y_pred)
report_adaboost = classification_report(y_test, y_pred, output_dict = True)
print(score)
print(report_adaboost)

0.6407506702412868
{'0': {'precision': 0.6329787234042553, 'recall': 0.6467391304347826, 'f1-score': 0.6397849462365591, 'support': 184.0}, '1': {'precision': 0.6486486486486487, 'recall': 0.6349206349206349, 'f1-score': 0.6417112299465241, 'support': 189.0}, 'accuracy': 0.6407506702412868, 'macro avg': {'precision': 0.640813686026452, 'recall': 0.6408298826777088, 'f1-score': 0.6407480880915416, 'support': 373.0}, 'weighted avg': {'precision': 0.6409187123350606, 'recall': 0.6407506702412868, 'f1-score': 0.6407609988402679, 'support': 373.0}}


In [26]:
mlflow.set_experiment("Bagging RFC Baseline")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

with mlflow.start_run():
    #mlflow.log_params(bagging_params)
    mlflow.log_metrics({
        "accuracy": report_bagging["accuracy"],
        "recall_class_1": report_bagging["1"]["recall"],
        "f1_score": report_bagging["1"]["f1-score"]
    })
    mlflow.sklearn.log_model(pipeline_bagging, "prep + base bagging_rfc")

2025/03/06 22:17:25 INFO mlflow.tracking.fluent: Experiment with name 'Bagging RFC Baseline' does not exist. Creating a new experiment.


🏃 View run vaunted-bat-416 at: http://127.0.0.1:5000/#/experiments/448668056508548912/runs/26e840e01d6e43008c565a61c29dc6ed
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/448668056508548912


## Bagging Experiments

In [34]:
rfc_bagging = RandomForestClassifier(random_state = 19)
xgb_bagging = XGBClassifier()
svc_bagging = SVC()

models_bagging = [rfc_bagging, xgb_bagging, svc_bagging]

pipeline_bagging_rfc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', BaggingClassifier(rfc_bagging, random_state = 19))
])

pipeline_bagging_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', BaggingClassifier(xgb_bagging, random_state = 19))
])

pipeline_bagging_svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', BaggingClassifier(svc_bagging, random_state = 19))
])


In [30]:
pipelines_bagging = [pipeline_rfc, pipeline_xgb, pipeline_svc]

In [31]:
reports_baging_exp = []

for pipeline in pipelines_bagging:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict = True)
    reports_baging_exp.append(report)

In [32]:
reports_baging_exp

[{'0': {'precision': 0.6338797814207651,
   'recall': 0.6304347826086957,
   'f1-score': 0.6321525885558583,
   'support': 184.0},
  '1': {'precision': 0.6421052631578947,
   'recall': 0.6455026455026455,
   'f1-score': 0.6437994722955145,
   'support': 189.0},
  'accuracy': 0.6380697050938338,
  'macro avg': {'precision': 0.6379925222893299,
   'recall': 0.6379687140556706,
   'f1-score': 0.6379760304256864,
   'support': 373.0},
  'weighted avg': {'precision': 0.6380476528639756,
   'recall': 0.6380697050938338,
   'f1-score': 0.6380540926491425,
   'support': 373.0}},
 {'0': {'precision': 0.5956284153005464,
   'recall': 0.592391304347826,
   'f1-score': 0.5940054495912807,
   'support': 184.0},
  '1': {'precision': 0.6052631578947368,
   'recall': 0.6084656084656085,
   'f1-score': 0.6068601583113457,
   'support': 189.0},
  'accuracy': 0.6005361930294906,
  'macro avg': {'precision': 0.6004457865976416,
   'recall': 0.6004284564067173,
   'f1-score': 0.6004328039513132,
   'suppor

In [41]:
mlflow.set_experiment("Bagging Experiment 1")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

for i, element in enumerate(pipelines_bagging):
    pipeline_name = element[0]
    pipeline = element[1]
    report = reports_baging_exp[i]
    model = models_bagging[i]

with mlflow.start_run(run_name = f"pipeline_bagging_{model}"):
    mlflow.log_params({
        "pipeline_name": pipeline_name,
        "estimator": model
    })
    mlflow.log_metrics({
        "accuracy": report["accuracy"],
        "recall_class_1": report["1"]["recall"],
        "f1_score": report["1"]["f1-score"]
    })
    mlflow.sklearn.log_model(pipeline, "pipeline")



🏃 View run pipeline_bagging_SVC() at: http://127.0.0.1:5000/#/experiments/592260611624601794/runs/854071d9f9cc4f6fb0c0545184785a65
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/592260611624601794
