In [3]:
#install scikit-learn
%pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/17/1c/ccdd103cfcc9435a18819856fbbe0c20b8fa60bfc3343580de4be13f0668/scikit_learn-1.5.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.5.2-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.6.0 from https://files.pythonhosted.org/packages/ea/c2/5ecadc5fcccefaece775feadcd795060adf5c3b29a883bff0e678cfe89af/scipy-1.14.1-cp311-cp311-win_amd64.whl.metadata
  Downloading scipy-1.14.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ---------------------------------------- 60.8/60.8 kB 1.6 MB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Obtaining dependency information for joblib>=1.2.0 from https://files.pythonhosted.o


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['movie_database']

# Load movie and box office data
movies = pd.DataFrame(list(db['movie'].find()))
movie_performance = pd.DataFrame(list(db['movie_performance'].find()))

In [2]:

# feature evaluation
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming movies and movie_performance are loaded as pandas DataFrames
merged_df = pd.merge(movies, movie_performance, left_on='_id', right_on='movie_id', how='inner')

# Apply log transformation to budget and final box office
merged_df['log_budget'] = np.log1p(merged_df['budget'])
merged_df['log_final_box_office'] = np.log1p(merged_df['final_box_office'])

# One-hot encoding of genre and director
merged_df = pd.get_dummies(merged_df, columns=['genre', 'director'])

# One-hot encoding actors (assuming actors is a list of names)
merged_df['actors'] = merged_df['actors'].apply(lambda x: ','.join(x) if isinstance(x, list) else x)
merged_df = pd.get_dummies(merged_df, columns=['actors'], prefix='actor')

# Feature columns, including log-transformed budget, genres, director, and actors
features = ['log_budget', 'imdb_score'] + [col for col in merged_df.columns if col.startswith('genre_')] \
           + [col for col in merged_df.columns if col.startswith('director_')] \
           + [col for col in merged_df.columns if col.startswith('actor_')]

X = merged_df[features]
y = merged_df['log_final_box_office']


In [3]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Use the same scaler to transform the test data
X_test_scaled = scaler.transform(X_test)

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Get feature importances
feature_importances = rf_model.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                                                Feature  Importance
0                                            log_budget    0.555469
1                                            imdb_score    0.084689
10                                         genre_Horror    0.008534
9                                           genre_Drama    0.006892
4748  actor_Philip Seymour Hoffman,Zooey Deschanel,M...    0.004654
...                                                 ...         ...
4515     actor_Natalie Portman,Mila Kunis,Mark Margolis    0.000000
1807                actor_Adam Sandler,RZA,Maude Apatow    0.000000
1366                            director_Rebecca Miller    0.000000
3432  actor_Jennifer Hudson,Mary J. Blige,Vondie Cur...    0.000000
5637     actor_Zachary Gordon,Rachael Harris,Fran Kranz    0.000000

[5656 rows x 2 columns]


In [5]:
selected_features = feature_importance_df[feature_importance_df['Importance'] > 0.00001]['Feature'].tolist()

X_train_selected = X_train[selected_features] 
X_test_selected = X_test[selected_features]    

scaler = StandardScaler()

# Fit the scaler on the selected training data and transform it
X_train_scaled = scaler.fit_transform(X_train_selected)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test_selected)


rf_model_selected = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_selected.fit(X_train_scaled, y_train)


y_pred_selected = rf_model_selected.predict(X_test_scaled)

mse_selected = mean_squared_error(y_test, y_pred_selected)
r2_selected = r2_score(y_test, y_pred_selected)

print(f"Mean Squared Error (Selected Features): {mse_selected}")
print(f"R^2 Score (Selected Features): {r2_selected}")


Mean Squared Error (Selected Features): 0.783258989664048
R^2 Score (Selected Features): 0.7389398243627874


In [6]:
#Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the hyperparameter grid to be tuned
param_grid = {
    'n_estimators': [50, 100, 150], 
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4], 
    'max_features': ['sqrt', 'log2'], 
}

rf = RandomForestRegressor(random_state=42)

# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

grid_search.fit(X_train_selected, y_train)

print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Cross-validation Score: ", grid_search.best_score_)


Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best Cross-validation Score:  -0.830837962290639


In [7]:


# best model
best_rf_model = grid_search.best_estimator_

# evluation
y_pred = best_rf_model.predict(X_test_selected)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (After Hyperparameter Tuning): {mse}")
print(f"R^2 Score (After Hyperparameter Tuning): {r2}")


Mean Squared Error (After Hyperparameter Tuning): 0.7440524139468644
R^2 Score (After Hyperparameter Tuning): 0.7520073737658932


In [9]:
# Calculate absolute error
absolute_error = np.abs(y_test - y_pred)

# Calculate relative error
relative_error = absolute_error / y_test

# Calculate the number of predictions with less than 10% error
accuracy_90 = np.mean(relative_error < 0.10) * 100  # 90% accuracy

print(f"Accuracy with less than 10% error: {accuracy_90:.2f}%")


Accuracy with less than 10% error: 92.70%


In [10]:
# save model
import pickle

# save model
with open('predict_box_office.pkl', 'wb') as f:
    pickle.dump(grid_search.best_estimator_, f)

# save feature columns
with open('feature_columns.pkl', 'wb') as f:
    pickle.dump(selected_features, f)

# save standar
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)