# Phase 3: Machine Learning Model Development

**Objective:** Connect to the cleaned, ML-ready data view, perform exploratory data analysis, and apply a Machine Learning model.

**Improvements:**
- **Imputation:** Handling missing values using `SimpleImputer`.
- **Bias Mitigation:** Using **SMOTE** (Synthetic Minority Over-sampling Technique) to handle class imbalance.
- **Leakage Prevention:** Removing all rarity-based flags to force the model to learn from actual game stats.

In [None]:
import pandas as pd
import sys
import os
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE # For Bias Mitigation
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent of the current working directory importable so that the
# `from src import config` line below can resolve. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
parent_dir = os.path.abspath(os.pardir)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from src import config

## 1. Load ML-Ready Data
We will load data directly from the `v_ml_ready_cards` view in our PostgreSQL database.

In [None]:
# Build the SQLAlchemy engine from the project's connection string.
engine = create_engine(config.get_db_connection_string())

print("Loading data from v_ml_ready_cards view...")
try:
    # Only the query lives in the try body so that display problems further
    # down are not misreported as a database failure.
    df_ml = pd.read_sql("SELECT * FROM v_ml_ready_cards;", engine)
except Exception as e:
    # Deliberately best-effort: later cells check whether `df_ml` exists and
    # print a notice instead of crashing when the load did not succeed.
    print(f"Error loading ML-ready data: {e}")
    print("Please ensure the database is running and 'v_ml_ready_cards' view exists.")
else:
    print("Data loaded successfully.")
    print(f"Total Rows Loaded: {len(df_ml)}") # EXPLICIT COUNT
    print(df_ml.head())
    # DataFrame.info() writes its summary to stdout itself and returns None;
    # wrapping it in print() emitted an extra "None" line.
    df_ml.info()

## 2. Exploratory Data Analysis (EDA)
A brief look at the data distribution and relationships.

In [None]:
if 'df_ml' in locals():
    # Distribution of elixir costs.
    # discrete=True draws one unit-wide bar centred on each value. The former
    # bins=range(0, 10) produced edges 0..9 whose final bin is closed on both
    # sides, so costs 8 and 9 were counted in the same bar.
    # NOTE(review): assumes elixir_cost holds whole numbers - confirm.
    plt.figure(figsize=(8, 5))
    sns.histplot(df_ml['elixir_cost'].dropna(), discrete=True, kde=False)
    plt.title('Distribution of Elixir Costs')
    plt.xlabel('Elixir Cost')
    plt.ylabel('Number of Cards')
    plt.xticks(range(0, 10))
    plt.show()

    # Count of cards by rarity, most frequent rarity first.
    plt.figure(figsize=(10, 6))
    sns.countplot(y='rarity', data=df_ml, order=df_ml['rarity'].value_counts().index)
    plt.title('Count of Cards by Rarity')
    plt.xlabel('Number of Cards')
    plt.ylabel('Rarity')
    plt.show()
else:
    # The load cell did not create df_ml, so there is nothing to plot.
    print("Dataframe df_ml not available for EDA.")

## 3. Prepare Data for Machine Learning
**Crucial Step:** We drop ALL rarity-related columns. If we keep `rarity_is_common`, `rarity_is_rare`, etc., the model can trivially deduce that a card which is NOT Common, NOT Rare, and NOT Epic (and so on) must be Legendary. This is **Data Leakage**.

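We intend to rely ONLY on `elixir_cost`, `max_level`, and `elixir_tier`. Note that the code keeps every numeric or boolean column that is not in the drop list, so check the printed feature names to confirm nothing else slipped through.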

In [None]:
if 'df_ml' in locals() and not df_ml.empty:
    # Target variable (y): the is_legendary column of the loaded view.
    y = df_ml['is_legendary']

    # DROP ALL rarity info (plus identifiers / free-text columns) to prevent
    # leakage. The list is written out by hand rather than generated.
    drop_cols = [
        'id', 'name', 'description', 'icon_medium', 'updated_at',
        'rarity', 'arena', 'is_legendary', 'is_champion',
        'rarity_is_legendary_flag', 'rarity_is_champion_flag',
        'rarity_is_common', 'rarity_is_rare', 'rarity_is_epic',
    ]

    # Safe drop: only drop the columns that are actually present.
    X = df_ml.drop(columns=[c for c in drop_cols if c in df_ml.columns])

    # One-hot encode the categorical elixir_tier feature; drop_first avoids a
    # redundant indicator column.
    X = pd.get_dummies(X, columns=['elixir_tier'], drop_first=True)

    # Keep only numeric / boolean columns so the estimators receive numbers.
    X = X.select_dtypes(include=['number', 'bool'])
    # NOTE(review): if max_level is determined by a card's rarity it would act
    # as a leakage proxy just like the dropped flags - confirm against the data.

    print(f"Features (X) shape: {X.shape}")
    print(f"Target (y) shape: {y.shape}")
    print("Feature names:", list(X.columns))

    # Stratified 80/20 split keeps the class ratio the same in both sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    # The leading "\n" must be an escape sequence: a literal line break inside
    # a single-quoted f-string is a syntax error.
    print(f"\nTrain Set Size: {len(X_train)}")
    print(f"Test Set Size: {len(X_test)}")
    print("Data split completed.")
else:
    print("Dataframe df_ml not available.")

## 4. Build Pipeline with Imputation and Bias Mitigation (SMOTE)
We use an `ImbPipeline` to correctly apply SMOTE *only* during training, not validation/testing.

In [None]:
if 'X_train' not in locals():
    # The split cell did not run (or failed), so there is nothing to fit.
    print("Training data not available.")
else:
    # Pipeline stages, applied in order:
    #   imputer - median fill for missing values
    #   scaler  - standardise the features
    #   smote   - oversample the minority class in the data being fitted
    #   model   - random forest classifier
    pipeline = ImbPipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=42)),
            (
                'model',
                RandomForestClassifier(n_estimators=100, random_state=42),
            ),
        ]
    )

    print("Training with SMOTE and Imputation...")
    pipeline.fit(X_train, y_train)
    print("Training complete.")

## 5. Evaluate Model Performance
We expect a more realistic score now that leakage is removed.

In [None]:
if 'pipeline' in locals():
    # Score the fitted pipeline on the held-out test split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Each leading "\n" below must be an escape sequence: a literal line break
    # inside a single-quoted string is a syntax error.
    print(f"\nModel Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Feature importance: read it from the classifier step inside the pipeline.
    rf_model = pipeline.named_steps['model']
    if hasattr(rf_model, 'feature_importances_'):
        # Label each importance with the matching column name of X.
        feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
        plt.figure(figsize=(10, 7))
        feature_importances.nlargest(10).plot(kind='barh')
        plt.title('Top 10 Feature Importances')
        plt.show()
else:
    print("Model not trained.")