In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

import pandas as pd
from IPython.display import display, Markdown

# Load the pre-encoded dataset from parquet.
# NOTE(review): `use_columns` is not referenced anywhere in this cell; it is kept
# because other cells may rely on it — confirm before removing.
use_columns = ['month_oct', 'high_balance', 'age_group_60+', 'education_tertiary', 'job_student', 'month_mar', 'month_jun']
df = pd.read_parquet('../data/data_encoded.parquet')

# Sanity-check the load: report the shape and column names, then show a small
# random sample of rows and the summary statistics.
display(Markdown(f'The dataset has {df.shape[0]:,} rows and {df.shape[1]:,} columns.'))
display(Markdown(f'The dataset has the following columns: {", ".join(df.columns)}.'))
# Seed the sample so that Restart Kernel -> Run All shows the same rows every time.
display(df.sample(3, random_state=42))
display(df.describe())


The dataset has 26,295 rows and 39 columns.

The dataset has the following columns: campaign, y, job_blue-collar, job_entrepreneur, job_housemaid, job_management, job_retired, job_self-employed, job_services, job_student, job_technician, job_unemployed, marital_married, marital_single, education_secondary, education_tertiary, default_yes, housing_yes, loan_yes, contact_telephone, month_aug, month_dec, month_feb, month_jan, month_jul, month_jun, month_mar, month_may, month_nov, month_oct, high_balance, age_group_18-30, age_group_60+, age_balance_interaction, young_single_tertiary, married_high_balance, age_tertiary_interaction, student_high_balance, retired_high_balance.

Unnamed: 0,campaign,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,month_oct,high_balance,age_group_18-30,age_group_60+,age_balance_interaction,young_single_tertiary,married_high_balance,age_tertiary_interaction,student_high_balance,retired_high_balance
25216,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
30,5,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
26046,3,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


Unnamed: 0,campaign,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,month_oct,high_balance,age_group_18-30,age_group_60+,age_balance_interaction,young_single_tertiary,married_high_balance,age_tertiary_interaction,student_high_balance,retired_high_balance
count,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,...,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0
mean,2.901921,0.08804,0.199544,0.035178,0.027572,0.233162,0.034569,0.038296,0.092261,0.013387,...,0.001978,0.399696,0.147823,0.007492,0.004868,0.03514,0.248564,0.00232,0.005552,0.015706
std,3.128301,0.283358,0.399665,0.184232,0.163746,0.422853,0.18269,0.191914,0.289399,0.114926,...,0.044427,0.489845,0.354931,0.086233,0.069601,0.184136,0.432189,0.04811,0.074309,0.124339
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,50.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Group every run from this notebook under one named MLflow experiment.
mlflow.set_experiment("customer_subscription_classification")

# Hyperparameters are declared once so that the estimator and the parameters
# logged to MLflow cannot drift apart.
logreg_params = {"random_state": 42, "class_weight": "balanced"}

# Train, evaluate and log a logistic regression inside a single MLflow run.
with mlflow.start_run():
    # NOTE(review): X_train_sm, y_train_sm, X_test and y_test are not defined in
    # this cell; they must already exist from cells that ran earlier.
    # NOTE(review): the `_sm` suffix presumably marks resampled training data —
    # if so, class_weight='balanced' may correct for imbalance twice; confirm.
    logreg = LogisticRegression(**logreg_params)
    logreg.fit(X_train_sm, y_train_sm)

    # Hard labels feed accuracy and the classification report; the probability
    # of the second class (`logreg.classes_[1]`) feeds ROC AUC.
    y_pred_test = logreg.predict(X_test)
    y_proba_test = logreg.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_proba_test)
    accuracy = accuracy_score(y_test, y_pred_test)

    # Log the model label plus the actual hyperparameters, and both test metrics,
    # so the run can be compared and reproduced from the MLflow UI alone.
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_params(logreg_params)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", accuracy)

    # Persist the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(logreg, "logreg_model")

    print(f"Logistic Regression ROC AUC Score: {roc_auc}")
    print(classification_report(y_test, y_pred_test))
