# Phase 3: Regression Models - Movie Revenue Prediction

## Overview
Compare baseline and advanced regression models for revenue prediction:
- Linear Regression, Ridge, Lasso (Baseline)
- Random Forest, XGBoost, LightGBM (Advanced)
- Hyperparameter optimization with GridSearchCV/RandomizedSearchCV
- Evaluation: RMSE, MAE, R², Cross-validation


In [13]:
from pathlib import Path
import pandas as pd
import numpy as np
import pickle

PROJECT_ROOT = Path('/home/asif/AI-Movie-Hit-Predictor')
DATA_DIR = PROJECT_ROOT / 'project_components' / 'data'

# Load engineered features
df = pd.read_csv(DATA_DIR / '04_engineered_features.csv')
print(f'Dataset loaded: {df.shape[0]} rows × {df.shape[1]} columns')


Dataset loaded: 11506 rows × 106 columns


In [14]:
# Import ML Libraries
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb

print('All ML libraries imported successfully!')


All ML libraries imported successfully!


## Training Baseline Models (Linear Regression, Ridge, Lasso)
Compare simple linear models with various regularization approaches.


In [15]:
# Prepare data
df_modeling = df[df['revenue'].notna()].copy()
df_modeling['log_revenue'] = np.log1p(df_modeling['revenue'])

# Select features - exclude non-numeric and ID columns
exclude_cols = ['id', 'title', 'revenue', 'budget', 'release_date', 'genres', 'cast', 'crew',
                'original_title', 'genres_list', 'cast_list', 'director_list', 'primary_genre',
                'primary_company', 'overview', 'tagline', 'homepage', 'keywords', 'company_list',
                'original_language', 'status', 'production_companies']

# Get only numeric columns
feature_cols = [col for col in df_modeling.select_dtypes(include=[np.number]).columns 
                if col not in exclude_cols]

X = df_modeling[feature_cols].fillna(df_modeling[feature_cols].median(numeric_only=True))
y = df_modeling['log_revenue']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training set: {X_train.shape[0]} samples')
print(f'Test set: {X_test.shape[0]} samples')
print(f'Features: {X_train.shape[1]}')


Training set: 3842 samples
Test set: 961 samples
Features: 34


In [16]:
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'Model': model_name, 'RMSE': rmse, 'MAE': mae, 'R²': r2}

results = []

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
results.append(evaluate_model(y_test, y_pred_lr, 'Linear Regression'))
print(f'Linear Regression R²: {results[-1]["R²"]:.4f}')

# Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)
results.append(evaluate_model(y_test, y_pred_ridge, 'Ridge'))
print(f'Ridge R²: {results[-1]["R²"]:.4f}')

# Lasso
lasso = Lasso(alpha=0.1, max_iter=10000)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)
results.append(evaluate_model(y_test, y_pred_lasso, 'Lasso'))
print(f'Lasso R²: {results[-1]["R²"]:.4f}')


Linear Regression R²: 1.0000
Ridge R²: 1.0000
Lasso R²: 0.9992


## Training Advanced Models (Random Forest, XGBoost, LightGBM)


In [17]:
# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results.append(evaluate_model(y_test, y_pred_rf, 'Random Forest'))
print(f'Random Forest R²: {results[-1]["R²"]:.4f}')

# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
results.append(evaluate_model(y_test, y_pred_xgb, 'XGBoost'))
print(f'XGBoost R²: {results[-1]["R²"]:.4f}')

# LightGBM
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42, verbosity=-1)
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)
results.append(evaluate_model(y_test, y_pred_lgb, 'LightGBM'))
print(f'LightGBM R²: {results[-1]["R²"]:.4f}')


Random Forest R²: 1.0000
XGBoost R²: 0.9996
LightGBM R²: 0.9999


## Hyperparameter Optimization


In [18]:
# XGBoost Hyperparameter Tuning
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200]
}

print('GridSearchCV for XGBoost...')
xgb_grid = GridSearchCV(xgb.XGBRegressor(random_state=42), xgb_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
print(f'Best params: {xgb_grid.best_params_}')

xgb_optimized = xgb_grid.best_estimator_
y_pred_xgb_opt = xgb_optimized.predict(X_test)
results.append(evaluate_model(y_test, y_pred_xgb_opt, 'XGBoost (Optimized)'))
print(f'XGBoost Optimized R²: {results[-1]["R²"]:.4f}')


GridSearchCV for XGBoost...
Best params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
XGBoost Optimized R²: 0.9998


In [None]:
# Show all results
results_df = pd.DataFrame(results)
print('\nAll Model Results:')
print(results_df.to_string(index=False))

# Create models directory
MODEL_DIR = DATA_DIR / 'models'
MODEL_DIR.mkdir(exist_ok=True)

# Save best model
best_model_path = MODEL_DIR / 'best_regression_model.pkl'
with open(best_model_path, 'wb') as f:
    pickle.dump(xgb_optimized, f)
print(f'\n✓ Best model saved: {best_model_path}')

# Save scaler
scaler_path = MODEL_DIR / 'regression_scaler.pkl'
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f'✓ Scaler saved: {scaler_path}')

# Save feature columns
feature_cols_path = MODEL_DIR / 'regression_feature_columns.pkl'
with open(feature_cols_path, 'wb') as f:
    pickle.dump(feature_cols, f)
print(f'✓ Feature columns saved: {feature_cols_path}')

# Save model results
results_path = DATA_DIR / '05_regression_model_results.csv'
results_df.to_csv(results_path, index=False)
print(f'✓ Model results saved: {results_path}')

# Save predictions
predictions_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred_xgb_opt,
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_xgb_opt)),
    'MAE': mean_absolute_error(y_test, y_pred_xgb_opt),
    'R2': r2_score(y_test, y_pred_xgb_opt)
})
predictions_path = DATA_DIR / '05_regression_predictions.csv'
predictions_df.to_csv(predictions_path, index=False)
print(f'✓ Predictions saved: {predictions_path}')

print(f'\n✓ All regression models, scalers, and results saved successfully!')



All Model Results:
              Model         RMSE          MAE       R²
  Linear Regression 9.828037e-15 7.761626e-15 1.000000
              Ridge 1.405730e-02 1.126106e-02 0.999997
              Lasso 2.319862e-01 2.092521e-01 0.999213
      Random Forest 4.023878e-02 4.532151e-03 0.999976
            XGBoost 1.569382e-01 3.434944e-02 0.999640
           LightGBM 6.727545e-02 1.809754e-02 0.999934
XGBoost (Optimized) 1.303074e-01 2.996609e-02 0.999752

✓ Best model saved to: /home/asif/AI-Movie-Hit-Predictor/project_components/data/models/best_regression_model.pkl
