# Modeling

## Objective
Train multiple classification models to predict video game hits (total_sales >= 1M).

## Models
1. Logistic Regression (baseline linear classifier)
2. Decision Tree
3. Random Forest
4. K-Nearest Neighbors (KNN)

## Process
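- Train baseline models with near-default parameters (balanced class weights where the estimator supports them) and score them with 5-fold cross-validation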
- Hyperparameter tuning with GridSearchCV
- Save trained models and results for evaluation


In [None]:
# NOTE(review): a stray, never-executed `%%sql` cell magic used to live here.
# Nothing in this notebook loads a SQL extension, so running it would raise a
# UsageError and stop "Run All". Delete this empty cell if it is not needed.


In [1]:
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised while
# fitting the models below are hidden as well.
warnings.simplefilter('ignore')


## 1. Load Preprocessed Data


In [2]:
# Unscaled feature matrices (train first, then test).
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/X_train.csv', '../data/processed/X_test.csv')
)

# Targets: each CSV is flattened into a 1-D array.
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ('../data/processed/y_train.csv', '../data/processed/y_test.csv')
)

# Scaled feature matrices, kept separately for the models that are fed scaled inputs.
X_train_scaled, X_test_scaled = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/X_train_scaled.csv', '../data/processed/X_test_scaled.csv')
)

# Class weights: column headers become int class labels, and the first row of
# each column supplies that class's weight.
class_weights_df = pd.read_csv('../data/processed/class_weights.csv')
class_weights = dict(
    (int(label), class_weights_df[label].values[0])
    for label in class_weights_df.columns
)

# Summary of what was loaded: shapes, weights and the training class balance.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class weights: {class_weights}")
print(f"\nTarget distribution in training:")
print(pd.Series(y_train).value_counts())


Training set: (11790, 53)
Test set: (2948, 53)
Class weights: {0: np.float64(0.5557126696832579), 1: np.float64(4.98730964467005)}

Target distribution in training:
0    10608
1     1182
Name: count, dtype: int64


## 2. Baseline Models (Without Tuning)


In [3]:
# Baseline candidates. The three estimators that accept it use balanced class
# weighting; KNN is only given its neighbor count.
models = {
    'Logistic Regression': LogisticRegression(
        class_weight='balanced', max_iter=1000, random_state=42
    ),
    'Decision Tree': DecisionTreeClassifier(
        class_weight='balanced', random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        class_weight='balanced', n_estimators=100, random_state=42
    ),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}

# One metrics row per model; turned into a DataFrame after the loop.
baseline_results = []

# Shared splitter: 5 stratified folds, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Training Baseline Models...")
print("="*80)

for name, estimator in models.items():
    print(f"\n{name}:")

    # Logistic Regression and KNN are fed the scaled matrices; the tree-based
    # models are fed the unscaled ones.
    use_scaled = name in ['Logistic Regression', 'KNN']
    X_fit = X_train_scaled if use_scaled else X_train
    X_eval = X_test_scaled if use_scaled else X_test

    # Per-fold F1 on the training data.
    cv_scores = cross_val_score(estimator, X_fit, y_train, cv=cv, scoring='f1')

    # Fit on the whole training set, then predict the held-out test set.
    estimator.fit(X_fit, y_train)
    test_pred = estimator.predict(X_eval)

    # ROC-AUC is computed from column index 1 of predict_proba when the
    # estimator offers it; otherwise the hard predictions are used instead.
    if hasattr(estimator, 'predict_proba'):
        test_score = estimator.predict_proba(X_eval)[:, 1]
    else:
        test_score = test_pred

    # Test-set metrics.
    accuracy = accuracy_score(y_test, test_pred)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)
    f1 = f1_score(y_test, test_pred)
    roc_auc = roc_auc_score(y_test, test_score)

    summary_row = {
        'Model': name,
        'CV F1 Mean': cv_scores.mean(),
        'CV F1 Std': cv_scores.std(),
        'Test Accuracy': accuracy,
        'Test Precision': precision,
        'Test Recall': recall,
        'Test F1': f1,
        'Test ROC-AUC': roc_auc
    }
    baseline_results.append(summary_row)

    print(f"  CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print(f"  Test Accuracy: {accuracy:.4f}")
    print(f"  Test Precision: {precision:.4f}")
    print(f"  Test Recall: {recall:.4f}")
    print(f"  Test F1: {f1:.4f}")
    print(f"  Test ROC-AUC: {roc_auc:.4f}")

# Side-by-side summary of all baseline models.
baseline_df = pd.DataFrame(baseline_results)
print("\n" + "="*80)
print("BASELINE RESULTS SUMMARY")
print("="*80)
print(baseline_df.to_string(index=False))


Training Baseline Models...

Logistic Regression:
  CV F1: 0.3597 (+/- 0.0076)
  Test Accuracy: 0.7541
  Test Precision: 0.2453
  Test Recall: 0.7017
  Test F1: 0.3635
  Test ROC-AUC: 0.8075

Decision Tree:
  CV F1: 0.3275 (+/- 0.0226)
  Test Accuracy: 0.8453
  Test Precision: 0.3115
  Test Recall: 0.4508
  Test F1: 0.3684
  Test ROC-AUC: 0.6762

Random Forest:
  CV F1: 0.3268 (+/- 0.0293)
  Test Accuracy: 0.8779
  Test Precision: 0.3774
  Test Recall: 0.3390
  Test F1: 0.3571
  Test ROC-AUC: 0.7956

KNN:
  CV F1: 0.3215 (+/- 0.0359)
  Test Accuracy: 0.9043
  Test Precision: 0.5528
  Test Recall: 0.2305
  Test F1: 0.3254
  Test ROC-AUC: 0.7621

BASELINE RESULTS SUMMARY
              Model  CV F1 Mean  CV F1 Std  Test Accuracy  Test Precision  Test Recall  Test F1  Test ROC-AUC
Logistic Regression    0.359670   0.007605       0.754071        0.245261     0.701695 0.363477      0.807484
      Decision Tree    0.327542   0.022632       0.845319        0.311475     0.450847 0.368421      0

## 3. Hyperparameter Tuning


In [4]:
# Search space per model, keyed by the same names used in `models`.
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear'],
    },
    'Decision Tree': {
        'max_depth': [5, 10, 15, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy'],
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
}

# Outputs of this cell: best estimator per model, plus one metrics row each.
tuned_models = {}
tuned_results = []

print("Hyperparameter Tuning with GridSearchCV...")
print("="*80)

for name, base_estimator in models.items():
    print(f"\n{name}:")

    # Same data routing as the baseline run: scaled matrices for Logistic
    # Regression and KNN, unscaled matrices for the tree-based models.
    use_scaled = name in ['Logistic Regression', 'KNN']
    X_fit = X_train_scaled if use_scaled else X_train
    X_eval = X_test_scaled if use_scaled else X_test

    # Exhaustive search over this model's grid, scored by F1 on the shared
    # `cv` splitter and run on all available cores.
    grid_search = GridSearchCV(
        estimator=base_estimator,
        param_grid=param_grids[name],
        scoring='f1',
        cv=cv,
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(X_fit, y_train)

    # Keep the winning estimator for evaluation and for saving later on.
    tuned_models[name] = grid_search.best_estimator_
    best_model = tuned_models[name]

    print(f"  Best parameters: {grid_search.best_params_}")
    print(f"  Best CV F1: {grid_search.best_score_:.4f}")

    # Predict the held-out test set with the winning estimator.
    test_pred = best_model.predict(X_eval)
    if hasattr(best_model, 'predict_proba'):
        # Column index 1 of predict_proba feeds ROC-AUC.
        test_score = best_model.predict_proba(X_eval)[:, 1]
    else:
        test_score = test_pred

    # Test-set metrics.
    accuracy = accuracy_score(y_test, test_pred)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)
    f1 = f1_score(y_test, test_pred)
    roc_auc = roc_auc_score(y_test, test_score)

    summary_row = {
        'Model': name,
        'Best CV F1': grid_search.best_score_,
        'Test Accuracy': accuracy,
        'Test Precision': precision,
        'Test Recall': recall,
        'Test F1': f1,
        'Test ROC-AUC': roc_auc
    }
    tuned_results.append(summary_row)

    print(f"  Test Accuracy: {accuracy:.4f}")
    print(f"  Test Precision: {precision:.4f}")
    print(f"  Test Recall: {recall:.4f}")
    print(f"  Test F1: {f1:.4f}")
    print(f"  Test ROC-AUC: {roc_auc:.4f}")

# Side-by-side summary of all tuned models.
tuned_df = pd.DataFrame(tuned_results)
print("\n" + "="*80)
print("TUNED RESULTS SUMMARY")
print("="*80)
print(tuned_df.to_string(index=False))


Hyperparameter Tuning with GridSearchCV...

Logistic Regression:
  Best parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
  Best CV F1: 0.3617
  Test Accuracy: 0.7578
  Test Precision: 0.2473
  Test Recall: 0.6949
  Test F1: 0.3648
  Test ROC-AUC: 0.8065

Decision Tree:
  Best parameters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10}
  Best CV F1: 0.3571
  Test Accuracy: 0.7910
  Test Precision: 0.2664
  Test Recall: 0.6203
  Test F1: 0.3727
  Test ROC-AUC: 0.7410

Random Forest:
  Best parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
  Best CV F1: 0.4472
  Test Accuracy: 0.8735
  Test Precision: 0.4044
  Test Recall: 0.5593
  Test F1: 0.4694
  Test ROC-AUC: 0.8378

KNN:
  Best parameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
  Best CV F1: 0.3468
  Test Accuracy: 0.9016
  Test Precision: 0.5150
  Test Recall: 0.2915
  Test F1: 0.3723
  Test ROC-AUC: 0.7490


## 4. Save Trained Models and Results


In [5]:
# Persist every fitted estimator and both results tables under ../models/.

# Create the output directory up front: joblib.dump and DataFrame.to_csv both
# raise FileNotFoundError when the target directory does not exist, which
# would otherwise break this cell on a fresh checkout.
Path('../models').mkdir(parents=True, exist_ok=True)

# Baseline models (fitted in section 2) are written first, then the tuned
# models (best estimators from section 3); the file-name suffix tells the two
# sets apart.
for suffix, fitted_models in (('baseline', models), ('tuned', tuned_models)):
    for name, model in fitted_models.items():
        # "Random Forest" -> "random_forest"
        safe_name = name.replace(' ', '_').lower()
        model_path = f'../models/{safe_name}_{suffix}.pkl'
        joblib.dump(model, model_path)
        print(f"Saved: {model_path}")

# Save results dataframes
baseline_df.to_csv('../models/baseline_results.csv', index=False)
tuned_df.to_csv('../models/tuned_results.csv', index=False)
print("\nResults saved:")
print("  - ../models/baseline_results.csv")
print("  - ../models/tuned_results.csv")

print("\n" + "="*80)
print("MODEL TRAINING COMPLETE")
print("="*80)
# Report the real total rather than assuming both dicts have the same size.
print(f"Total models trained: {len(models)} baseline + {len(tuned_models)} tuned = {len(models) + len(tuned_models)}")
print("All models saved to: ../models/")
print("\nNext step: Run 4_evaluation.ipynb for detailed model evaluation and comparison")



Saved: ../models/logistic_regression_baseline.pkl
Saved: ../models/decision_tree_baseline.pkl
Saved: ../models/random_forest_baseline.pkl
Saved: ../models/knn_baseline.pkl
Saved: ../models/logistic_regression_tuned.pkl
Saved: ../models/decision_tree_tuned.pkl
Saved: ../models/random_forest_tuned.pkl
Saved: ../models/knn_tuned.pkl

Results saved:
  - ../models/baseline_results.csv
  - ../models/tuned_results.csv

MODEL TRAINING COMPLETE
Total models trained: 4 baseline + 4 tuned = 8
All models saved to: ../models/

Next step: Run 4_evaluation.ipynb for detailed model evaluation and comparison
