In [19]:
# model_training.ipynb

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score
import joblib

# Load preprocessed data
X_train = pd.read_csv("data/processed/X_train.csv")
y_train = pd.read_csv("data/processed/y_train.csv").squeeze()
X_test = pd.read_csv("data/processed/X_test.csv")
y_test = pd.read_csv("data/processed/y_test.csv").squeeze()

# Calculate class weights for imbalanced data
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
print(f"\nCalculated class weights: {class_weight_dict}")

# Define models with and without class weights
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Logistic Regression (Balanced)': LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Random Forest (Balanced)': RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced'),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'XGBoost (Balanced)': xgb.XGBClassifier(random_state=42, eval_metric='logloss', 
                                           scale_pos_weight=class_weights[0]/class_weights[1])
}

# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Evaluate a model and return metrics"""
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Class-specific metrics
    precision_per_class = precision_score(y_test, y_pred, average=None)
    recall_per_class = recall_score(y_test, y_pred, average=None)
    f1_per_class = f1_score(y_test, y_pred, average=None)
    
    if y_pred_proba is not None:
        auc = roc_auc_score(y_test, y_pred_proba)
    else:
        auc = None
    
    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Precision_Declined': precision_per_class[0],
        'Recall_Declined': recall_per_class[0],
        'F1_Declined': f1_per_class[0],
        'Precision_Approved': precision_per_class[1],
        'Recall_Approved': recall_per_class[1],
        'F1_Approved': f1_per_class[1],
        'Predictions': y_pred,
        'Probabilities': y_pred_proba,
        'Model_Object': model
    }

# Store all results
all_results = []

# Train and evaluate all models
print("Training and evaluating models...")
print("-" * 50)

for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Evaluate model
    result = evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test, model_name)
    all_results.append(result)
    
    print(f"  Overall Metrics:")
    print(f"    Accuracy: {result['Accuracy']:.4f}")
    print(f"    Precision: {result['Precision']:.4f}")
    print(f"    Recall: {result['Recall']:.4f}")
    print(f"    F1-Score: {result['F1-Score']:.4f}")
    if result['AUC']:
        print(f"    AUC: {result['AUC']:.4f}")
    
    print(f"  Class-specific Metrics:")
    print(f"    Declined - Precision: {result['Precision_Declined']:.4f}, Recall: {result['Recall_Declined']:.4f}, F1: {result['F1_Declined']:.4f}")
    print(f"    Approved - Precision: {result['Precision_Approved']:.4f}, Recall: {result['Recall_Approved']:.4f}, F1: {result['F1_Approved']:.4f}")
    print()

# =================================================================================
# 6. RESULTS COMPARISON AND VISUALIZATION
# =================================================================================

print("\n" + "="*50)
print("RESULTS COMPARISON")
print("="*50)

# Create results DataFrame
results_df = pd.DataFrame(all_results)
results_summary = results_df[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 
                             'Precision_Declined', 'Recall_Declined', 'F1_Declined',
                             'Precision_Approved', 'Recall_Approved', 'F1_Approved']].copy()

print("Complete Results Summary:")
print(results_summary.round(4))

# Find best model for each metric
print("\nBest Models by Metric:")
for metric in ['Accuracy', 'F1-Score', 'F1_Declined', 'F1_Approved']:
    best_idx = results_summary[metric].idxmax()
    best_model = results_summary.loc[best_idx]
    print(f"{metric}: {best_model['Model']} (Score: {best_model[metric]:.4f})")

# Visualize overall results
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Overall metrics
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
for i, metric in enumerate(metrics):
    ax = axes[i//2, i%2]
    results_summary.plot(x='Model', y=metric, kind='bar', ax=ax, color='skyblue')
    ax.set_title(f'{metric} Comparison')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)
    ax.legend().remove()

plt.tight_layout()
plt.show()

# Class-specific performance comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Declined class performance
declined_metrics = ['Precision_Declined', 'Recall_Declined', 'F1_Declined']
declined_data = results_summary[['Model'] + declined_metrics].set_index('Model')
declined_data.plot(kind='bar', ax=axes[0], color=['red', 'orange', 'pink'])
axes[0].set_title('Performance on Declined Class')
axes[0].set_xlabel('Model')
axes[0].set_ylabel('Score')
axes[0].tick_params(axis='x', rotation=45)
axes[0].legend(['Precision', 'Recall', 'F1-Score'])

# Approved class performance
approved_metrics = ['Precision_Approved', 'Recall_Approved', 'F1_Approved']
approved_data = results_summary[['Model'] + approved_metrics].set_index('Model')
approved_data.plot(kind='bar', ax=axes[1], color=['green', 'lightgreen', 'darkgreen'])
axes[1].set_title('Performance on Approved Class')
axes[1].set_xlabel('Model')
axes[1].set_ylabel('Score')
axes[1].tick_params(axis='x', rotation=45)
axes[1].legend(['Precision', 'Recall', 'F1-Score'])

plt.tight_layout()
plt.show()

# =================================================================================
# 7. CONFUSION MATRICES FOR ALL MODELS
# =================================================================================

print("\n" + "="*50)
print("CONFUSION MATRICES")
print("="*50)

# Plot confusion matrices for all models
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for i, result in enumerate(all_results):
    if i < len(axes):
        cm = confusion_matrix(y_test, result['Predictions'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i],
                   xticklabels=target_encoder.classes_, 
                   yticklabels=target_encoder.classes_)
        axes[i].set_title(f'{result["Model"]}')
        axes[i].set_xlabel('Predicted')
        axes[i].set_ylabel('Actual')

# Remove empty subplots
for j in range(len(all_results), len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

# Detailed classification reports for best models
best_overall_idx = results_summary['F1-Score'].idxmax()
best_declined_idx = results_summary['F1_Declined'].idxmax()

print(f"\nBest Overall Model: {results_summary.loc[best_overall_idx]['Model']}")
print("Classification Report:")
print(classification_report(y_test, all_results[best_overall_idx]['Predictions'], 
                          target_names=target_encoder.classes_))

print(f"\nBest Model for Declined Class: {results_summary.loc[best_declined_idx]['Model']}")
print("Classification Report:")
print(classification_report(y_test, all_results[best_declined_idx]['Predictions'], 
                          target_names=target_encoder.classes_))

# =================================================================================
# 8. ROC CURVES COMPARISON
# =================================================================================

print("\n" + "="*50)
print("ROC CURVES COMPARISON")
print("="*50)

plt.figure(figsize=(12, 8))

colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink']

for i, result in enumerate(all_results):
    if result['Probabilities'] is not None:
        fpr, tpr, _ = roc_curve(y_test, result['Probabilities'])
        auc_score = result['AUC']
        plt.plot(fpr, tpr, color=colors[i % len(colors)], 
                label=f"{result['Model']} (AUC = {auc_score:.3f})")

plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()
 

NameError: name 'compute_class_weight' is not defined

[]


## 🔧 Recommended Solutions:
#### Step 1: Check for Data Leakage

In [11]:
# Check if any features are perfectly correlated with target
correlation_with_target = X_train.corrwith(pd.Series(y_train))
print("Features highly correlated with target:")
print(correlation_with_target[abs(correlation_with_target) > 0.8])

# Check for identical values between features and target
for col in X_train.columns:
    if X_train[col].dtype in ['int64', 'float64']:
        correlation = X_train[col].corr(pd.Series(y_train))
        if abs(correlation) > 0.9:
            print(f"High correlation found: {col} -> {correlation:.3f}")


Features highly correlated with target:
Series([], dtype: float64)


##  Add Regularization and Cross-Validation

In [12]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

# Updated models with regularization
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, 
        random_state=42, 
        C=0.1,  # Add regularization
        class_weight='balanced'  # Handle imbalance
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,  # Limit depth to prevent overfitting
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=42,
        class_weight='balanced'
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,  # Limit depth
        learning_rate=0.1,
        reg_alpha=0.1,  # L1 regularization
        reg_lambda=0.1,  # L2 regularization
        scale_pos_weight=5,  # Handle imbalance (32705/6474 ≈ 5)
        random_state=42,
        eval_metric='logloss'
    )
}

# Use cross-validation instead of single train-test
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, model in models.items():
    # Cross-validation scores
    f1_scores = cross_val_score(model, X_train, y_train, 
                               cv=cv, scoring=make_scorer(f1_score))
    
    print(f"\n{name}:")
    print(f"CV F1 scores: {f1_scores}")
    print(f"Mean F1: {f1_scores.mean():.3f} (+/- {f1_scores.std() * 2:.3f})")
    
    # Train on full training set and evaluate on test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    test_f1 = f1_score(y_test, y_pred)
    print(f"Test F1: {test_f1:.3f}")
    
    results.append({
        'Model': name,
        'CV F1 Mean': f1_scores.mean(),
        'CV F1 Std': f1_scores.std(),
        'Test F1': test_f1
    })

results_df = pd.DataFrame(results)
print("\n" + "="*50)
print("CROSS-VALIDATION RESULTS:")
print(results_df)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Logistic Regression:
CV F1 scores: [0.7779981  0.76848232 0.7838443  0.79503739 0.78208441]
Mean F1: 0.781 (+/- 0.017)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test F1: 0.920

Random Forest:
CV F1 scores: [1. 1. 1. 1. 1.]
Mean F1: 1.000 (+/- 0.000)
Test F1: 1.000

XGBoost:
CV F1 scores: [1. 1. 1. 1. 1.]
Mean F1: 1.000 (+/- 0.000)
Test F1: 1.000

CROSS-VALIDATION RESULTS:
                 Model  CV F1 Mean  CV F1 Std   Test F1
0  Logistic Regression    0.781489   0.008612  0.920321
1        Random Forest    1.000000   0.000000  1.000000
2              XGBoost    1.000000   0.000000  1.000000
