In [101]:
import tmdbsimple as tmdb
import pandas as pd
import time
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import ast
import streamlit as st

In [15]:
# Configuration

tmdb.API_KEY = '7eb2f50ca573c609c0bac8e9f804514d'

def get_movie_data(movie_id):
    """Fetches detailed data for a single movie from TMDb."""
    try:
        movie = tmdb.Movies(movie_id)
        info = movie.info()
        credits = movie.credits()
        keywords = movie.keywords()
        
        director = next((person['name'] for person in credits['crew'] if person['job'] == 'Director'), None)
        cast = [actor['name'] for actor in credits['cast'][:5]]
        production_companies = [company['name'] for company in info.get('production_companies', [])[:5]]
        genres = [genre['name'] for genre in info.get('genres', [])]
        movie_keywords = [keyword['name'] for keyword in keywords.get('keywords', [])]
        
# Ensure budget and revenue are present and non-zero for a quality dataset
        
        if info.get('budget', 0) == 0 or info.get('revenue', 0) == 0:
            return None
        return {
            'id': info['id'],
            'title': info['title'],
            'release_date': info.get('release_date'),
            'budget': info.get('budget'),
            'revenue': info.get('revenue'),
            'runtime': info.get('runtime'),
            'genres': genres,
            'cast': cast,
            'director': director,
            'keywords': movie_keywords,
            'production_companies': production_companies
        }
    except Exception as e:
        return None

In [7]:
# Fetching data

all_movie_data = []
PAGES_TO_FETCH = 100 

print(f"Fetching data for {PAGES_TO_FETCH * 20} movies...")

# Use tqdm for a progress bar

for page in tqdm(range(1, PAGES_TO_FETCH + 1), desc="Fetching Pages"):
    try:
        discover = tmdb.Discover()
        response = discover.movie(page=page, sort_by='popularity.desc')
        
        page_movie_ids = [movie['id'] for movie in response['results']]
        
        for movie_id in page_movie_ids:
            data = get_movie_data(movie_id)
            if data:
                all_movie_data.append(data)
            time.sleep(0.1) 
            
    except Exception as e:
        print(f"Error on page {page}: {e}")
        time.sleep(1)

Fetching data for 2000 movies...


Fetching Pages: 100%|█████████████████████████| 100/100 [19:56<00:00, 11.97s/it]


In [8]:
# Finalize Dataset

df = pd.DataFrame(all_movie_data)

print(f"\nSuccessfully fetched and processed data for {len(df)} movies.")
print("--- Sample of your final dataset ---")
print(df.head())

# Save the dataset to a CSV file

df.to_csv('movies_large_dataset.csv', index=False)
print(f"\nDataset saved to 'movies_large_dataset.csv'")


Successfully fetched and processed data for 1126 movies.
--- Sample of your final dataset ---
        id                                           title release_date  \
0  1311031  Demon Slayer: Kimetsu no Yaiba Infinity Castle   2025-07-18   
1   617126                    The Fantastic 4: First Steps   2025-07-22   
2  1186350                                           Marco   2024-12-20   
3  1038392                       The Conjuring: Last Rites   2025-09-03   
4  1054867                        One Battle After Another   2025-09-23   

      budget    revenue  runtime                                  genres  \
0   20000000  643612593      156  [Animation, Action, Fantasy, Thriller]   
1  200000000  521347662      115            [Science Fiction, Adventure]   
2    3467000   11559000      145               [Action, Crime, Thriller]   
3   55000000  458205000      135                                [Horror]   
4  130000000  101651146      162               [Action, Thriller, Crime]  

In [11]:
# Load dataset

df = pd.read_csv('movies_large_dataset.csv')

print("Initial Data Info")
df.info()

# Check for missing values in each column

print("\nMissing Value Counts")
print(df.isnull().sum())

Initial Data Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1126 entries, 0 to 1125
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    1126 non-null   int64 
 1   title                 1126 non-null   object
 2   release_date          1126 non-null   object
 3   budget                1126 non-null   int64 
 4   revenue               1126 non-null   int64 
 5   runtime               1126 non-null   int64 
 6   genres                1126 non-null   object
 7   cast                  1126 non-null   object
 8   director              1126 non-null   object
 9   keywords              1126 non-null   object
 10  production_companies  1126 non-null   object
dtypes: int64(4), object(7)
memory usage: 96.9+ KB

Missing Value Counts
id                      0
title                   0
release_date            0
budget                  0
revenue                 0
runtime               

In [39]:
# Display how many movies have 0 budget or revenue before cleaning

print(f"Movies with zero budget before cleaning: {df[df['budget'] == 0].shape[0]}")
print(f"Movies with zero revenue before cleaning: {df[df['revenue'] == 0].shape[0]}")

# Replace 0s and drop them

df['budget'] = df['budget'].replace(0, np.nan)
df['revenue'] = df['revenue'].replace(0, np.nan)
df.dropna(subset=['budget', 'revenue'], inplace=True)

print(f"\nShape of DataFrame after removing zero budget/revenue movies: {df.shape}")

Movies with zero budget before cleaning: 0
Movies with zero revenue before cleaning: 0

Shape of DataFrame after removing zero budget/revenue movies: (1126, 11)


In [51]:
df = pd.read_csv('movies_large_dataset.csv')

df['profit'] = df['revenue'] - df['budget']
df['roi'] = (df['profit'] / (df['budget'] + 1)) * 100

# Convert release_date and extract date features
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['release_dayofweek'] = df['release_date'].dt.dayofweek

In [55]:
# Define X and y
y = df['revenue']

features_to_drop = [
    'id', 'title', 'release_date', 'genres', 'cast', 'director',
    'keywords', 'production_companies', 'revenue', 'profit', 'roi'
]
X = df.drop(columns=features_to_drop)

print("--- Features (X) ---")
print(X.head())

print("\n--- Target (y) ---")
print(y.head())

--- Features (X) ---
      budget  runtime  release_year  release_month  release_dayofweek
0   20000000      156          2025              7                  4
1  200000000      115          2025              7                  1
2    3467000      145          2024             12                  4
3   55000000      135          2025              9                  2
4  130000000      162          2025              9                  1

--- Target (y) ---
0    643612593
1    521347662
2     11559000
3    458205000
4    101651146
Name: revenue, dtype: int64


In [59]:
# Split and train data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11)

gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=11)
gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")
print(f"R-squared (R²): {r2:.2f}")



Model Evaluation
Root Mean Squared Error (RMSE): $217,548,097.48
R-squared (R²): 0.64


In [75]:
# Define the hyperparameter grid, initiate GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

gbr = GradientBoostingRegressor(random_state=11)
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

# 3. Fit the model
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [76]:
# Print the best parameters and score
print(f"\nBest Parameters Found: {grid_search.best_params_}")
print(f"Best Cross-validation R² Score: {grid_search.best_score_:.4f}")



Best Parameters Found: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Best Cross-validation R² Score: 0.4708


In [79]:
# Evaluate the best model on the test set
print("\nTuned Gradient Boosting Model Evaluation")
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)



Tuned Gradient Boosting Model Evaluation


In [81]:
# Calculate final metrics
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"Root Mean Squared Error (RMSE): ${rmse_tuned:,.2f}")
print(f"R-squared (R²): {r2_tuned:.2f}")

Root Mean Squared Error (RMSE): $210,625,202.42
R-squared (R²): 0.66


In [87]:
# Loaded dataset with categorical columns

df = pd.read_csv('movies_large_dataset.csv')
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['release_dayofweek'] = df['release_date'].dt.dayofweek

# Convert strings back to lists
for col in ['genres', 'cast', 'keywords', 'production_companies']:
    df[col] = df[col].apply(ast.literal_eval)

In [91]:
# Feature engineering

top_genres = df['genres'].explode().value_counts().nlargest(10).index
for genre in top_genres:
    df[f'genre_{genre}'] = df['genres'].apply(lambda x: 1 if genre in x else 0)

for col in ['director', 'cast', 'production_companies']:
    if col != 'director':
        exploded_df = df.explode(col)
        mapping = exploded_df.groupby(col)['revenue'].mean()
        df[f'mean_{col}_revenue'] = df[col].apply(lambda lst: np.mean([mapping.get(item, 0) for item in lst]) if lst else 0)
    else: # Director is not a list
        mapping = df.groupby(col)['revenue'].mean()
        df[f'mean_{col}_revenue'] = df[col].map(mapping).fillna(0)

In [93]:
# Prep data and split

y = df['revenue']

features_to_drop = [
    'id', 'title', 'release_date', 'genres', 'cast', 'director', 
    'keywords', 'production_companies', 'revenue'
]
X = df.drop(columns=features_to_drop)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [97]:
# Use best params from GridSearch and evaluate the model

best_params = {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
gbr_tuned = GradientBoostingRegressor(**best_params, random_state=11)

print("Training model with new categorical features...")
gbr_tuned.fit(X_train, y_train)

y_pred_final = gbr_tuned.predict(X_test)
rmse_final = np.sqrt(mean_squared_error(y_test, y_pred_final))
r2_final = r2_score(y_test, y_pred_final)

print("\nFinal Model Evaluation")
print(f"Root Mean Squared Error (RMSE): ${rmse_final:,.2f}")
print(f"R-squared (R²): {r2_final:.2f}")

Training model with new categorical features...

Final Model Evaluation
Root Mean Squared Error (RMSE): $115,525,562.89
R-squared (R²): 0.90


In [122]:
# Feature importances

feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': gbr_tuned.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features")
print(feature_importances.head(10))


Top 10 Most Important Features
                              feature  importance
16                  mean_cast_revenue    0.833990
15              mean_director_revenue    0.057266
17  mean_production_companies_revenue    0.038983
1                             runtime    0.031848
5                        genre_Action    0.009143
2                        release_year    0.009124
0                              budget    0.008996
3                       release_month    0.002424
11                      genre_Fantasy    0.002080
4                   release_dayofweek    0.001817


In [126]:
import joblib

# Re-save the definitive model and features
joblib.dump(best_model, 'movie_revenue_model.pkl')
joblib.dump(X.columns, 'model_features.pkl')

print("Final model and features have been re-saved successfully!")

Final model and features have been re-saved successfully!
