In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (precision_recall_curve,
                             f1_score, classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, accuracy_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
import json
import matplotlib.pyplot as plt
import seaborn as sns





In [20]:
train=pd.read_csv("../data/results/data_clean.csv")

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       1000 non-null   int64  
 1   Age              1000 non-null   int64  
 2   Gender           1000 non-null   object 
 3   MaritalStatus    1000 non-null   object 
 4   IncomeLevel      1000 non-null   object 
 5   TotalSpent       1000 non-null   float64
 6   NumTransactions  1000 non-null   int64  
 7   NumInteractions  1000 non-null   float64
 8   UnresolvedCount  1000 non-null   float64
 9   LastLoginDate    1000 non-null   object 
 10  LoginFrequency   1000 non-null   int64  
 11  ServiceUsage     1000 non-null   object 
 12  ChurnStatus      1000 non-null   int64  
 13  Churn            1000 non-null   object 
 14  AgeGroup         1000 non-null   object 
dtypes: float64(3), int64(5), object(7)
memory usage: 117.3+ KB


## Pre-processing

In [None]:
# Oridnal encode Income Level

train['IncomeLevelEncoded'] = train['IncomeLevel'].map({
    'Low': 0,
    'Medium': 2,
    'High': 3
})

# Drop columns that aren't helpful or are now redundant
drop_cols = [
    'CustomerID', 'LastLoginDate', 'Churn', 'AgeGroup', 'IncomeLevel'  # drop original as it's now encoded
]
train.drop(columns=drop_cols, inplace=True)

# Separate features and target
X = train.drop(columns='ChurnStatus')
y = train['ChurnStatus']

# One-hot encode other categorical variables (leave ordinal ones)
X = pd.get_dummies(X, drop_first=True)

# Scale numeric values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert target to categorical
num_classes = len(y.unique())


# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Build the model


## Model Training and Evaluation

Algorithm: Random Forest

Hyper-parameter tuning: Grid Search

Optimization: Precision-Recall curve

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    precision_recall_curve, classification_report, confusion_matrix,
    roc_auc_score, accuracy_score
)

# 1. Data
X, y = make_classification(
    n_samples=1000, n_features=20, weights=[0.7, 0.3], random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 2. Pipeline with SMOTE + Tomek
pipeline = Pipeline([
    ('smote', SMOTETomek(random_state=42)),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# 3. GridSearch with better scoring (area under PR curve = more recall-robust)
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [5, 10],
    'rf__min_samples_split': [2, 5]
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='average_precision',
    cv=3,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# 4. Extract tuned model + reapply resampling manually for calibration
best_pipeline = grid.best_estimator_
rf_best = best_pipeline.named_steps['rf']
smote_X_train, smote_y_train = best_pipeline.named_steps['smote'].fit_resample(X_train, y_train)

# 5. Calibrate
calibrated_rf = CalibratedClassifierCV(estimator=rf_best, method='sigmoid', cv=3)
calibrated_rf.fit(smote_X_train, smote_y_train)

# 6. Predict calibrated probabilities
y_proba = calibrated_rf.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# 7. Threshold for Recall ≥ 80% + Best F1
best_thresh = 0.5
best_f1 = 0
for p, r, t in zip(precision, recall, thresholds):
    if r >= 0.80:
        f1 = 2 * p * r / (p + r + 1e-10)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = t

print(f"Selected threshold: {best_thresh:.2f} (Recall ≥ 0.80, F1 = {best_f1:.3f})")

# 8. Final Prediction
y_pred = (y_proba >= best_thresh).astype(int)

# 9. Evaluation
report = classification_report(y_test, y_pred, output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

print(json.dumps(report, indent=2))
print(f"ROC AUC: {roc_auc:.3f}")
print(f"Accuracy: {accuracy:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")

# 10. Save
metrics_output = {
    "best_params": grid.best_params_,
    "chosen_threshold": float(best_thresh),
    "target_recall_f1": best_f1,
    "roc_auc": roc_auc,
    "accuracy": accuracy,
    "confusion_matrix": conf_matrix.tolist(),
    "classification_report": report
}

with open("model_metrics_balanced_smote_tomek.json", "w") as f:
    json.dump(metrics_output, f, indent=2)


Selected threshold: 0.45 (Recall ≥ 0.80, F1 = 0.789)
{
  "0": {
    "precision": 0.9182692307692307,
    "recall": 0.9009433962264151,
    "f1-score": 0.9095238095238095,
    "support": 212.0
  },
  "1": {
    "precision": 0.7717391304347826,
    "recall": 0.8068181818181818,
    "f1-score": 0.7888888888888889,
    "support": 88.0
  },
  "accuracy": 0.8733333333333333,
  "macro avg": {
    "precision": 0.8450041806020067,
    "recall": 0.8538807890222984,
    "f1-score": 0.8492063492063492,
    "support": 300.0
  },
  "weighted avg": {
    "precision": 0.8752870680044592,
    "recall": 0.8733333333333333,
    "f1-score": 0.8741375661375662,
    "support": 300.0
  }
}
ROC AUC: 0.905
Accuracy: 0.873
Confusion Matrix:
[[191  21]
 [ 17  71]]


## Model Evaluation: Precisin, Recall, F1-score, AUC-ROC

In [27]:
# --- 5. Evaluate the model ---
print("\nClassification Report:\n", classification_report(y_test, y_pred_opt))

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred_opt)
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC Curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()


ValueError: Found input variables with inconsistent numbers of samples: [300, 200]