In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [None]:
# Fix the seed of NumPy's global random state so repeated runs draw the same numbers
np.random.seed(seed=42)

In [None]:
# Load the Pokemon table from the CSV at the relative path ../data/pokemon.csv
df = pd.read_csv(filepath_or_buffer='../data/pokemon.csv')

In [None]:
# Feature engineering: add row-wise aggregates of the base stats as new columns on df
# - total_stats: sum of all six base stats
# - physical_average / special_average: mean of the attack/defense pair of each kind
df['total_stats'] = df.loc[
    :, ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
].sum(axis='columns')
df['physical_average'] = df.loc[:, ['attack', 'defense']].mean(axis='columns')
df['special_average'] = df.loc[:, ['sp_attack', 'sp_defense']].mean(axis='columns')

In [None]:
# Prepare features
# Expand the primary type column into indicator columns named type_<value>
X_type = pd.get_dummies(data=df['type1'], prefix='type')

In [None]:
# Build the design matrix: base stats and engineered aggregates first,
# followed by the one-hot type indicators; the target is the is_legendary column
X_numeric = df.loc[:, [
    'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed',
    'total_stats', 'physical_average', 'special_average',
]]
X = pd.concat(objs=[X_numeric, X_type], axis='columns')
y = df['is_legendary']

In [None]:
# Hold out a quarter of the rows for testing; stratifying on y keeps the
# class proportions the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Sanity-check the split: show the (rows, columns) shape of each part and the
# relative frequency of each target class among the training rows
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name="Model"):
    """Report held-out performance of a fitted classifier.

    Prints a classification report, then draws a confusion-matrix heatmap and
    a ROC curve for the test set. If the estimator (or, for a pipeline-like
    object exposing ``steps``, its final step) has ``feature_importances_``,
    the 15 largest importances are also drawn as a bar chart.

    Args:
        model: Fitted estimator or pipeline providing ``predict`` and
            ``predict_proba``.
        X_train: Training features; only ``X_train.columns`` is read, to label
            the feature-importance plot.
        X_test: Test features passed to the model.
        y_train: Training labels. Not used; kept so existing calls keep working.
        y_test: True labels for ``X_test``.
        model_name: Label used in the printed header and the plot titles.

    Returns:
        None. All results are printed or plotted.
    """
    # Make predictions
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score for the ROC curve
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Print metrics
    print(f"=== {model_name} Performance ===\n")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Plot ROC curve
    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend()
    plt.show()

    # The notebook wraps its classifiers in a Pipeline, and a Pipeline does not
    # expose feature_importances_ itself. Look at the final step instead, so the
    # importance plot is produced for a wrapped tree-based model as well.
    steps = getattr(model, 'steps', None)
    estimator = steps[-1][1] if steps else model

    if hasattr(estimator, 'feature_importances_'):
        # Plot feature importances for tree-based models
        scores = list(estimator.feature_importances_)
        names = list(X_train.columns)
        if len(names) != len(scores):
            # Earlier pipeline steps may have changed the feature set; fall back
            # to positional labels rather than failing on a length mismatch.
            names = [f'feature_{i}' for i in range(len(scores))]

        importances = pd.DataFrame({
            'feature': names,
            'importance': scores
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=importances.head(15), x='importance', y='feature')
        plt.title(f'Top 15 Most Important Features - {model_name}')
        plt.show()

In [None]:
# 1. Baseline Model - Logistic Regression
# Features are standardised inside the pipeline, so the scaler is fitted only
# on whatever data the pipeline itself is fitted on.
print("Training Logistic Regression...")
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42)),
])

In [None]:
# Fit the baseline on the original (unresampled) training split, then report
# its metrics on the held-out test split
lr_pipeline.fit(X=X_train, y=y_train)
evaluate_model(
    lr_pipeline,
    X_train,
    X_test,
    y_train,
    y_test,
    model_name="Logistic Regression",
)

In [None]:
# 2. Random Forest with SMOTE for handling class imbalance
# Progress message only; resampling and fitting happen in the cells that follow
print("Training Random Forest with SMOTE...")

In [None]:
# Oversample the minority class with SMOTE, using the training split only so
# the test split stays untouched.
# NOTE(review): X_train includes the one-hot type_* columns; SMOTE synthesises
# rows by interpolation, so those columns can presumably take fractional values
# in the resampled data - consider SMOTENC if that matters; confirm.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Single-step pipeline holding a 100-tree random forest with a fixed seed
rf_pipeline = Pipeline(steps=[
    (
        'classifier',
        RandomForestClassifier(random_state=42, n_estimators=100),
    ),
])

In [None]:
# Fit the forest on the SMOTE-balanced training data, then report its metrics
# on the untouched test split
rf_pipeline.fit(X=X_train_balanced, y=y_train_balanced)
evaluate_model(
    rf_pipeline,
    X_train_balanced,
    X_test,
    y_train_balanced,
    y_test,
    model_name="Random Forest with SMOTE",
)

In [None]:
def predict_legendary(model, pokemon_stats):
    """Classify a single Pokemon as legendary or not from its stats.

    The stats are turned into a one-row frame, the same engineered columns
    used for training are derived, ``type1`` is one-hot encoded, and the
    columns are arranged to match the notebook-level ``X_train`` before the
    model is queried.

    Args:
        model: Trained model exposing ``predict`` and ``predict_proba``.
        pokemon_stats: Dictionary with the keys ``hp``, ``attack``,
            ``defense``, ``sp_attack``, ``sp_defense``, ``speed`` and ``type1``.

    Returns:
        Tuple ``(prediction, probability)``: the predicted label for the row
        and the value in column 1 of ``predict_proba`` for that row.
    """
    base_stats = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']
    engineered = ['total_stats', 'physical_average', 'special_average']

    # One-row frame built from the supplied stats
    row = pd.DataFrame([pokemon_stats])

    # Same derived columns as the training data
    row['total_stats'] = row[base_stats].sum(axis=1)
    row['physical_average'] = row[['attack', 'defense']].mean(axis=1)
    row['special_average'] = row[['sp_attack', 'sp_defense']].mean(axis=1)

    # Encode the type, then zero-fill every training type column this row lacks
    encoded_type = pd.get_dummies(row['type1'], prefix='type')
    absent = [
        name for name in X_train.columns
        if name.startswith('type_') and name not in encoded_type.columns
    ]
    encoded_type = encoded_type.assign(**{name: 0 for name in absent})

    # Join both parts and select the columns in training order; a type column
    # not present in X_train is dropped by this selection
    features = pd.concat([row[base_stats + engineered], encoded_type], axis=1)
    features = features[X_train.columns]

    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0][1]
    return prediction, probability

In [None]:
# Example usage: a made-up dragon-type Pokemon described by its six base stats
example_pokemon = dict(
    hp=100,
    attack=150,
    defense=140,
    sp_attack=120,
    sp_defense=100,
    speed=90,
    type1='dragon',
)

In [None]:
# Score the example with the random forest pipeline and print the verdict
# together with the predicted probability formatted as a percentage
is_legendary, legendary_prob = predict_legendary(
    model=rf_pipeline,
    pokemon_stats=example_pokemon,
)
print(f"Prediction: {'Legendary' if is_legendary else 'Not Legendary'}")
print(f"Probability of being legendary: {legendary_prob:.2%}")