# In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd
# Load the track table from disk (the path is relative to the working directory).
df = pd.read_csv('../data/spotify_dedup.csv')

# Feature matrix and regression target.
# NOTE(review): `features_to_analyze` is not defined in this section --
# presumably a list of column names set up earlier; confirm before running.
X = df[features_to_analyze]
y = df['popularity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, keyed by the label that is printed for each of them.
# Entries are added one at a time so the iteration order is explicit:
# linear baseline first, then the forest, then the stacked ensemble.
models = {}

# Ordinary least-squares baseline with default settings.
models['Linear Regression'] = LinearRegression()

# Depth- and leaf-size-limited forest; seeded for reproducibility, all cores.
models['Random Forest'] = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

# Stacked ensemble: a linear model and a (default-depth) forest as base
# learners, combined by a linear meta-model using 5-fold internal CV.
models['Stacking Regressor'] = StackingRegressor(
    estimators=[
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ],
    final_estimator=LinearRegression(),  # Meta-model
    cv=5,
    n_jobs=-1,
)

# Train and evaluate
# For every candidate in `models`: fit on the training split, score on both
# splits, run 5-fold cross-validation on the training data, and collect the
# metrics as one row of the comparison table built afterwards.
results = []

for name, model in models.items():
    print(f"\n{'='*70}")
    print(f"Training {name}...")
    print('='*70)

    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Evaluate
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_mae = mean_absolute_error(y_test, y_pred_test)
    # Train-minus-test R²: a large positive gap suggests overfitting.
    overfit_gap = train_r2 - test_r2

    # Cross-validation on the training split only. cross_val_score fits
    # clones of the estimator, so `model` keeps the fit from above.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
    # Formatted once and reused for both the table row and the console line.
    cv_summary = f"{cv_scores.mean():.4f} ± {cv_scores.std():.4f}"

    results.append({
        'Model': name,
        'Train R²': train_r2,
        'Test R²': test_r2,
        'CV R² (mean±std)': cv_summary,
        'RMSE': test_rmse,
        'MAE': test_mae,
        'Overfit Gap': overfit_gap
    })

    print(f"✓ Train R²: {train_r2:.4f}")
    print(f"✓ Test R²:  {test_r2:.4f}")
    print(f"✓ CV R²:    {cv_summary}")
    print(f"✓ RMSE:     {test_rmse:.4f}")
    print(f"✓ MAE:      {test_mae:.4f}")
    print(f"✓ Overfit:  {overfit_gap:.4f}")

# Results comparison
# One row per model, in the order the models were trained.
results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("📊 MODEL COMPARISON RESULTS")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)

# Determine best model
# The winner is the row with the highest test-set R².
# NOTE(review): ranking on the hold-out score uses the test data for model
# selection; consider ranking on the cross-validation score instead.
best_model_name = results_df.loc[results_df['Test R²'].idxmax(), 'Model']
print(f"\n🏆 Best Model: {best_model_name}")

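# KeyboardInterrupt:  (captured notebook output, not code -- presumably the cell run was interrupted before it finished)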