# ArthiUsaha Tiering Model Training

This notebook trains and evaluates 5 classification algorithms to predict the partner Tier (Bronze, Silver, Gold) based on financial behavior features.

## Algorithms:
1. Logistic Regression
2. Random Forest Classifier
3. Support Vector Machine (SVM)
4. Gradient Boosting Classifier
5. K-Nearest Neighbors (KNN)

## Pipeline:
- Data Loading
- Preprocessing (Scaling, Encoding)
- Model Training
- Evaluation (Accuracy, Precision, Recall, F1-Score, Confusion Matrix)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline

## 1. Load Data

In [None]:
# Training set produced by generate_ml_data.py. The path is relative, so it is
# resolved against the kernel's current working directory.
DATA_PATH = '../HACKATHON_2025_DATA/training_data.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

print(f"Data Shape: {df.shape}")
# Bare trailing expression so the notebook renders the first rows as a table.
df.head(n=5)

In [None]:
# Show the class balance of the target: a bar chart first, then the raw counts.
sns.countplot(data=df, x='tier')
plt.title('Tier Distribution')
plt.show()

print(df['tier'].value_counts())

## 2. Preprocessing & Split

In [None]:
# Feature columns fed to the models, and the column to predict
features = [
    'total_principal',
    'total_outstanding',
    'max_dpd',
    'repayment_rate',
    'bill_count',
    'total_bill_amount',
    'total_paid_amount',
]
target = 'tier'

X, y = df[features], df[target]

# LabelEncoder numbers the classes in sorted label order, so the integer codes
# do NOT follow the tier ranking. le.classes_ (printed below) is the
# code -> label lookup: position i holds the label encoded as i.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"Classes: {le.classes_}")

# Hold out 20% of the rows for testing; stratifying on the encoded target keeps
# the tier proportions approximately equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

print(f"Train Shape: {X_train.shape}")
print(f"Test Shape: {X_test.shape}")

## 3. Define Functions for Training and Evaluation

In [None]:
def create_pipeline(classifier, feature_columns=None):
    """
    Build an unfitted scikit-learn Pipeline: preprocessing followed by `classifier`.

    Every selected column is median-imputed and then standardised with
    StandardScaler. Columns that are not selected are dropped, which is
    ColumnTransformer's default `remainder` behaviour.

    Parameters
    ----------
    classifier : estimator
        Used as the final 'classifier' step. It is not cloned, so fitting the
        returned pipeline fits this very object.
    feature_columns : sequence of str, optional
        Column names passed to the preprocessor. When omitted, the
        notebook-level `features` list is used, so existing calls keep
        working unchanged.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps 'preprocessor' and 'classifier'.
    """
    if feature_columns is None:
        # Backward-compatible default: fall back to the notebook global.
        feature_columns = features
    # Copy so later edits to the caller's list cannot alter this pipeline.
    feature_columns = list(feature_columns)

    # Numerical preprocessing: impute missing values (just in case) + scale
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, feature_columns)
        ])

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

def evaluate_model(name, pipeline, X_test, y_test, le):
    """
    Score a fitted pipeline on the test set, print a report and plot a confusion matrix.

    Parameters
    ----------
    name : str
        Display name used in the printed header, the plot title and the
        returned 'Model' entry.
    pipeline : fitted estimator
        Anything exposing `predict(X_test)`.
    X_test, y_test
        Held-out features and their integer-encoded labels.
    le : fitted LabelEncoder
        Supplies `classes_`, used both as the full list of class codes and as
        human-readable names.

    Returns
    -------
    dict
        Keys 'Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'.
        Precision, recall and F1 are support-weighted averages.
    """
    y_pred = pipeline.predict(X_test)

    # Pin the label set to every class the encoder knows. Without this, a class
    # missing from both y_test and y_pred makes the report's target_names and
    # the heatmap tick labels disagree with the number of rows/columns.
    class_ids = list(range(len(le.classes_)))
    class_names = [str(label) for label in le.classes_]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"--- {name} Results ---")
    # zero_division=0 matches the scalar metrics above and avoids warnings
    # for classes that were never predicted.
    print(classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    ))

    # Confusion matrix: rows are true classes, columns are predicted classes
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return {
        'Model': name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1 Score': f1
    }

## 4. Train and Evaluate Models

In [None]:
# Candidate estimators keyed by the display name used in reports and plots.
# Seeds are fixed wherever the estimator accepts one so runs are repeatable.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# One metrics dict per model, in the order the models are trained
results = []

for name, clf in classifiers.items():
    print(f"Training {name}...")
    # Pipeline.fit returns the pipeline itself, so building and fitting can be chained
    pipeline = create_pipeline(clf).fit(X_train, y_train)
    results.append(evaluate_model(name, pipeline, X_test, y_test, le))
    print("\n" + "="*50 + "\n")

## 5. Model Comparison

In [None]:
# One row per model, ranked by weighted F1 score with the best model first
results_df = pd.DataFrame(results).sort_values(by='F1 Score', ascending=False)

print(results_df)

# Horizontal bars on a fixed 0-1 axis so the models are directly comparable.
# NOTE(review): newer seaborn releases reportedly warn when `palette` is passed
# without `hue` - confirm against the installed version.
plt.figure(figsize=(10, 6))
sns.barplot(data=results_df, x='F1 Score', y='Model', palette='viridis')
plt.title('Model Comparison (F1 Score)')
plt.xlim(0, 1.0)
plt.show()

## 6. Save Best Model
We save the best-performing model (highest weighted F1 score) together with the fitted label encoder for use in the `ml_engine` service.

In [None]:
import pickle

from sklearn.base import clone

# Pick the row with the highest F1 score directly, so this cell does not
# depend on results_df having been sorted by an earlier cell.
best_model_name = results_df.loc[results_df['F1 Score'].idxmax(), 'Model']
print(f"Best Model: {best_model_name}")

# Retrain the winner on the full dataset (train + test) before shipping it.
# clone() yields an unfitted copy with the same hyper-parameters, so the
# estimator stored in `classifiers` is left as it was after evaluation.
# The metrics reported earlier come from the train-only fit; the saved
# artifact has no held-out estimate of its own.
best_clf = clone(classifiers[best_model_name])
best_pipeline = create_pipeline(best_clf)
best_pipeline.fit(X, y_encoded)

# Save the model and the label encoder. The encoder is needed at inference
# time to map predicted integer codes back to tier names.
# Pickle files can execute code when loaded; only unpickle trusted artifacts.
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_pipeline, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

print("Model and Label Encoder saved successfully!")