In [4]:
# Load clean dataset & prepare features
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load clean data
df = pd.read_csv("../data/clean_data.csv")
print("Clean data loaded from ./data/clean_data.csv")

Clean data loaded from ./data/clean_data.csv


In [5]:
# Convert suitable columns to categorical dtype
cat_cols = ["Product", "Issue", "Company response", "Consumer disputed?", "Timely response?"]
for col in cat_cols:
    if col in df.columns:
        df[col] = df[col].astype("category")

# For missing values, fill with a new category 'Missing' in categorical columns
for col in cat_cols:
    if df[col].isna().any():
        df[col] = df[col].cat.add_categories("Missing").fillna("Missing")

# Confirm transformation
print(df[cat_cols].info())
print(df[cat_cols].head())

# Prepare features and target
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Feature columns (excluding target)
feature_cols = ["Product", "Issue", "Company response", "Consumer disputed?"]
target_col = "Timely response?"

# Separate features and target
X = df[feature_cols]
y = df[target_col].cat.codes  # Encoding target as 0/1

# Train-test split with stratification on target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display shapes to confirm
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28156 entries, 0 to 28155
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Product             28156 non-null  category
 1   Issue               28156 non-null  category
 2   Company response    28156 non-null  category
 3   Consumer disputed?  28156 non-null  category
 4   Timely response?    28156 non-null  category
dtypes: category(5)
memory usage: 141.2 KB
None
           Product                                     Issue  \
0  Debt collection                     Communication tactics   
1  Debt collection     Cont'd attempts collect debt not owed   
2         Mortgage  Application, originator, mortgage broker   
3      Credit card                                     Other   
4  Debt collection     Cont'd attempts collect debt not owed   

          Company response Consumer disputed? Timely response?  
0              In progress            Missin

In [6]:

# Train baseline models with cross-validation
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Create encoder
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Define models to compare
models = {
    "Logistic Regression": Pipeline([
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
    ]),
    "Random Forest": Pipeline([
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
    ]),
    "XGBoost": Pipeline([
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', 
                                   scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(), 
                                   random_state=42))
    ])
}

# Stratified 5-fold CV setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to evaluate
scoring = {
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'roc_auc': 'roc_auc'
}

# Evaluate each model
results = {}
for name, model in models.items():
    cv_results = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    results[name] = {metric: (np.mean(cv_results[f'test_{metric}']), np.std(cv_results[f'test_{metric}'])) for metric in scoring}

# Display results
print("Model comparison (mean ± std):\n")
for name, metrics in results.items():
    print(f"{name}:")
    for metric, (mean_score, std_score) in metrics.items():
        print(f"  {metric}: {mean_score:.4f} ± {std_score:.4f}")
    print()

# Select best model based on F1 score
best_model_name = max(results.keys(), key=lambda x: results[x]['f1'][0])
print(f"Best model: {best_model_name}")

# Train the best model on full training data
best_pipeline = models[best_model_name]
best_pipeline.fit(X_train, y_train)

# Evaluate on test set
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_pipeline.predict(X_test)
print(f"\nTest Set Performance for {best_model_name}:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Model comparison (mean ± std):

Logistic Regression:
  f1: 0.8664 ± 0.0024
  precision: 0.9878 ± 0.0012
  recall: 0.7717 ± 0.0037
  roc_auc: 0.7639 ± 0.0247

Random Forest:
  f1: 0.9293 ± 0.0140
  precision: 0.9899 ± 0.0014
  recall: 0.8760 ± 0.0260
  roc_auc: 0.8614 ± 0.0117

XGBoost:
  f1: 0.9099 ± 0.0200
  precision: 0.9913 ± 0.0017
  recall: 0.8416 ± 0.0360
  roc_auc: 0.8845 ± 0.0137

Best model: Random Forest

Test Set Performance for Random Forest:
              precision    recall  f1-score   support

           0       0.11      0.65      0.19       142
           1       0.99      0.87      0.93      5490

    accuracy                           0.86      5632
   macro avg       0.55      0.76      0.56      5632
weighted avg       0.97      0.86      0.91      5632


Confusion Matrix:
[[  92   50]
 [ 722 4768]]


In [7]:
# Save the trained pipeline
import os

os.makedirs('../models', exist_ok=True)
joblib.dump(best_pipeline, '../models/best_pipeline.pkl')

# Also save the feature columns for reference
joblib.dump(feature_cols, '../models/feature_columns.pkl')

print(f"\nBest pipeline ({best_model_name}) saved successfully.")


Best pipeline (Random Forest) saved successfully.
