In [None]:
# Movie Predictor Model
# Anja Gill
# Goal: see whether we can predict a movie's box-office revenue from features such as budget, runtime, release date, genres, cast, director, and production companies.

In [4]:
import tmdbsimple as tmdb
import pandas as pd
import time
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import ast
import joblib
import os 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the TMDb API key from the environment so the secret never lives in the
# notebook or its version history. Set it before launching Jupyter, e.g.
#   export TMDB_API_KEY=<your key>
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as compromised and rotated in the TMDb account settings.
tmdb.API_KEY = os.environ.get('TMDB_API_KEY')
if not tmdb.API_KEY:
    # print() rather than warnings.warn(): warnings are globally silenced in
    # the import cell, so a warning would never be shown.
    print("WARNING: TMDB_API_KEY is not set; calls to the TMDb API will fail.")

# Number of 'discover' result pages to download.
# NOTE: the data-fetching cell assigns PAGES_TO_FETCH again before it loops.
PAGES_TO_FETCH = 10  # Reduced pages for quick test generation

# Define the features that the Streamlit app actually collects from the user.
STREAMLIT_INPUT_FEATURES = [
    'budget',
    'runtime',
    'release_year',
    'release_month',
    'release_dayofweek',
]

# --- Data Acquisition and Preprocessing ---

def get_movie_data(movie_id):
    """Fetch one movie from TMDb and flatten it into a single record.

    Args:
        movie_id: TMDb identifier of the movie to look up.

    Returns:
        A dict with the keys 'id', 'title', 'release_date', 'budget',
        'revenue', 'runtime', 'genres', 'cast' (first five billed names),
        'director' (first crew member whose job is 'Director', or None),
        'keywords' and 'production_companies' (first five names).
        Returns None when the budget or revenue is missing or zero, and also
        when any exception is raised while fetching or parsing (best-effort:
        the caller simply skips such movies).
    """
    try:
        movie = tmdb.Movies(movie_id)
        info = movie.info()

        # Filter for quality data points BEFORE requesting credits/keywords,
        # so rejected movies cost one API call instead of three. `not value`
        # rejects both 0 and a missing/None amount.
        if not info.get('budget') or not info.get('revenue'):
            return None

        credits = movie.credits()
        keywords = movie.keywords()

        director = next(
            (person['name'] for person in credits.get('crew', [])
             if person.get('job') == 'Director'),
            None,
        )
        cast = [actor['name'] for actor in credits.get('cast', [])[:5]]
        production_companies = [
            company['name'] for company in info.get('production_companies', [])[:5]
        ]
        genres = [genre['name'] for genre in info.get('genres', [])]
        movie_keywords = [keyword['name'] for keyword in keywords.get('keywords', [])]

        return {
            'id': info['id'],
            'title': info['title'],
            'release_date': info.get('release_date'),
            'budget': info.get('budget'),
            'revenue': info.get('revenue'),
            'runtime': info.get('runtime'),
            'genres': genres,
            'cast': cast,
            'director': director,
            'keywords': movie_keywords,
            'production_companies': production_companies
        }
    except Exception:
        # Deliberate best-effort: a movie that cannot be fetched or parsed is
        # skipped rather than aborting the whole download.
        return None

# --- Data Fetching (Use this section ONLY if you need to regenerate data) ---
# If you already have 'movies_large_dataset.csv', you can skip this cell.


In [None]:
# Download the raw dataset from TMDb, but only when it is not already cached.
# The full download takes many minutes because of API rate limits, so a plain
# "Restart & Run All" reuses the CSV written by an earlier run.
DATASET_PATH = 'movies_large_dataset.csv'
FORCE_REFETCH = False  # set to True to rebuild the CSV from the API
PAGES_TO_FETCH = 100
RESULTS_PER_PAGE = 20  # page size assumed by the progress message below

if os.path.exists(DATASET_PATH) and not FORCE_REFETCH:
    print(f"Found existing '{DATASET_PATH}'; skipping the API download.")
else:
    all_movie_data = []
    print(f"Fetching data for up to {PAGES_TO_FETCH * RESULTS_PER_PAGE} movies...")

    # Note: Due to API limits, fetching data takes time.
    for page in tqdm(range(1, PAGES_TO_FETCH + 1), desc="Fetching Pages"):
        try:
            discover = tmdb.Discover()
            response = discover.movie(page=page, sort_by='popularity.desc')
            page_movie_ids = [movie['id'] for movie in response['results']]

            for movie_id in page_movie_ids:
                data = get_movie_data(movie_id)
                if data:
                    all_movie_data.append(data)
                time.sleep(0.1)  # Respect API rate limits

        except Exception as e:
            # A failed page is reported and skipped; back off before the next one.
            print(f"Error on page {page}: {e}")
            time.sleep(1)

    # Never cache an empty dataset: it would make later runs skip the download
    # and then fail when the cleaning cell reads the file.
    if not all_movie_data:
        raise RuntimeError(
            "No movies were fetched from TMDb; check the API key and network access."
        )

    df = pd.DataFrame(all_movie_data)
    df.to_csv(DATASET_PATH, index=False)
    print(f"\nDataset saved with {len(df)} movies.")

Fetching data for up to 2000 movies...


Fetching Pages:  77%|████████████████████      | 77/100 [16:02<04:53, 12.75s/it]

In [None]:
# Data cleaning
df = pd.read_csv('movies_large_dataset.csv')

# Zeros in the money columns are treated as missing values; rows lacking
# either amount are discarded.
for money_col in ('budget', 'revenue'):
    df[money_col] = df[money_col].replace(0, np.nan)
df = df.dropna(subset=['budget', 'revenue'])

# Calendar features derived from the parsed release date
release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = release_dates
df['release_year'] = release_dates.dt.year
df['release_month'] = release_dates.dt.month
df['release_dayofweek'] = release_dates.dt.dayofweek


def _to_list(cell):
    """Parse a stringified Python literal back into an object; pass anything else through."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# The CSV round trip stores list-valued columns as text; restore the lists.
for list_col in ['genres', 'cast', 'keywords', 'production_companies']:
    df[list_col] = df[list_col].apply(_to_list)

df['profit'] = df['revenue'].sub(df['budget'])

print(f"\nCleaned DataFrame shape: {df.shape}")

In [None]:
# One-hot encode the 10 most frequent genres (one 0/1 column per genre)
top_genres = df['genres'].explode().value_counts().nlargest(10).index

for genre in top_genres:
    df[f'genre_{genre}'] = df['genres'].apply(
        lambda movie_genres: 1 if genre in movie_genres else 0
    )

# Mean (target) encoding: replace each person/company by the average revenue
# of the movies it appears in.
# NOTE(review): the averages are computed over every row of df, i.e. before
# any train/test split, so each row's own revenue feeds its feature. This
# looks like target leakage for the model trained on these columns - confirm.

# Director is a single value per movie, so a plain map is enough.
mapping = df.groupby('director')['revenue'].mean()
df['mean_director_revenue'] = df['director'].map(mapping).fillna(0)

# Cast and production companies are lists: build a per-item average from the
# exploded column, then average those per-item values over each movie's list.
for col in ('cast', 'production_companies'):
    mapping = df.explode(col).groupby(col)['revenue'].mean()

    def _average_item_revenue(items):
        """Mean of the per-item average revenues; 0 for an empty list."""
        if not items:
            return 0
        return np.mean([mapping.get(item, 0) for item in items])

    df[f'mean_{col}_revenue'] = df[col].apply(_average_item_revenue)

In [None]:
# Model Training with Full Feature Set

# Columns that are not model inputs: identifiers, raw text/list columns, the
# target itself and quantities derived from it. errors='ignore' below lets the
# drop succeed even if a listed column is absent from df.
features_to_drop_full = [
    'id', 'title', 'release_date', 'genres', 'cast', 'director',
    'keywords', 'production_companies', 'revenue', 'profit', 'roi'
]

# Target and feature matrix (ALL remaining features, including the
# mean-encoded ones); rows with any missing feature are removed.
y = df['revenue']
X_full = df.drop(columns=features_to_drop_full, errors='ignore').dropna()

# Keep only the targets whose feature rows survived the dropna
y_full = y.loc[X_full.index]

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=11,
)

print(f"\nTraining on {X_train_full.shape[1]} features.")

# Example GBR model for initial metrics
baseline_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 11,
}
gbr_full = GradientBoostingRegressor(**baseline_params)
gbr_full.fit(X_train_full, y_train_full)

y_pred_full = gbr_full.predict(X_test_full)
r2_full = r2_score(y_test_full, y_pred_full)

print(f"**Full Feature Model R² Score (Baseline): {r2_full:.4f}**")

In [None]:
# Hyperparameter Tuning (GridSearch)

print("\n--- Running Grid Search for Best Model Parameters ---")

# 2 x 2 x 2 = 8 candidate settings, each scored by 3-fold cross-validation
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)

base_estimator = GradientBoostingRegressor(random_state=11)
grid_search_full = GridSearchCV(
    base_estimator,
    param_grid,
    scoring='r2',
    cv=3,  # Reduced CV for quicker demonstration
    n_jobs=-1,  # use every available core
    verbose=0,
)
grid_search_full.fit(X_train_full, y_train_full)

# Winning parameter combination and the estimator refitted with it
best_params_full = grid_search_full.best_params_
best_model_full = grid_search_full.best_estimator_

print(f"Best Parameters Found: {best_params_full}")
print(f"Best Cross-validation R² Score: {grid_search_full.best_score_:.4f}")

In [None]:
# Model Evaluation and Feature Importance (Showcasing Results)

# Score the tuned model on the held-out test split
y_pred_tuned_full = best_model_full.predict(X_test_full)
mse_tuned_full = mean_squared_error(y_test_full, y_pred_tuned_full)
rmse_tuned_full = np.sqrt(mse_tuned_full)
r2_tuned_full = r2_score(y_test_full, y_pred_tuned_full)

print("\n**Final Full-Feature Model Evaluation**")
print(f"RMSE: ${rmse_tuned_full:,.2f}")
print(f"R-squared (R²): {r2_tuned_full:.4f}")

# Pair every feature column with its importance, most important first
feature_importances = pd.DataFrame({
    'feature': X_full.columns,
    'importance': best_model_full.feature_importances_
})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

print("\nTop 10 Most Important Features:")
top_features_table = feature_importances.head(10).to_markdown(index=False)
print(top_features_table)

In [None]:
# The full-feature model above uses features (mean_*_revenue) that cannot be
# collected from a live user in the Streamlit app.
# We create a new, simplified model that only uses user-collectable features:
# the five base inputs below plus one 0/1 column per encoded genre.

print("\n--- Creating Simplified Model for Deployment ---")

# Define the features that the Streamlit app actually collects from the user.
STREAMLIT_FEATURES = [
    'budget',
    'runtime',
    'release_year',
    'release_month',
    'release_dayofweek'
]
# Add the one-hot genre columns created during feature engineering
STREAMLIT_FEATURES.extend([col for col in X_full.columns if col.startswith('genre_')])

# Define X and y for the simplified model
X_simple = X_full[STREAMLIT_FEATURES]
y_simple = y_full

# Evaluation (for documentation): fit on the TRAINING split only, so the test
# rows scored below were never seen during fitting. Fitting on all rows and
# then scoring on a subset of them would report an inflated R².
# The hyperparameters are the ones tuned on the full feature set.
gbr_simple_eval = GradientBoostingRegressor(**best_params_full, random_state=11)
gbr_simple_eval.fit(X_train_full[STREAMLIT_FEATURES], y_train_full)

y_pred_simple = gbr_simple_eval.predict(X_test_full[STREAMLIT_FEATURES])
r2_simple = r2_score(y_test_full, y_pred_simple)

print(f"Simplified Model R² Score: {r2_simple:.4f}")

# Deployment model: same configuration, refitted on every available row.
gbr_final_for_app = GradientBoostingRegressor(**best_params_full, random_state=11)
gbr_final_for_app.fit(X_simple, y_simple)

print("--- Final Model Ready for Streamlit ---")

# --- 8. Saving Final Files for Deployment ---

# Save the model trained on the simplified features
joblib.dump(gbr_final_for_app, 'movie_revenue_model.pkl')

# Save only the list of simple feature names (CRUCIAL for app input matching:
# the app must supply these columns in exactly this order)
joblib.dump(STREAMLIT_FEATURES, 'model_features.pkl')

print("\nSUCCESS: Final 'movie_revenue_model.pkl' and 'model_features.pkl' saved locally.")
