In [72]:
from boxoffice.modeling.cleaned_data import df
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

In [73]:
# Baseline model: do an 80/20 split, fit OLS on the training portion,
# cross validate on that same portion, then score the held-out 20%.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# target variable is opening_wide_revenue
X = df.drop(columns=['opening_wide_revenue'])
y = df['opening_wide_revenue']

# A fixed seed keeps the split, and therefore every number printed below,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

predictor_columns = [
    'running_time',
    'budget',
    'metacritic_before_wide_friday_calculated',
    # 'metacritic_monday_before_wide_friday_calculated',
    'max_trailer_views',
    'top_3_trailer_views',
    'total_trailer_views',
    'sum_cast_box_office',
    'sum_crew_box_office',
    'wikipedia_pre_release_monday_views',
    'in_franchise'
]

# fit with OLS
# Only X_train is narrowed to the predictors; X_test keeps every column of X
# and the predictors are selected again at prediction time.
X_train = X_train[predictor_columns]

# sm.OLS does not add an intercept by itself, so this is a regression through
# the origin (hence the "uncentered" R-squared in the summary). If an
# intercept is wanted, add it here with sm.add_constant AND switch the
# cross-validation estimator below back to fit_intercept=True.
model = sm.OLS(y_train, X_train).fit()

print(model.summary())

# cross validate
# fit_intercept=False makes the cross-validated estimator the same
# no-intercept model as the statsmodels fit above; the default
# (fit_intercept=True) would score a different model.
lm = LinearRegression(fit_intercept=False)

# Scores are negated MSE values (closer to zero is better), one per fold.
scores = cross_val_score(lm, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

print('Cross validation scores:', scores)

# Held-out evaluation on the 20% the model never saw.
y_pred = model.predict(X_test[predictor_columns])

print('Mean squared error:', mean_squared_error(y_test, y_pred))

                                  OLS Regression Results                                 
Dep. Variable:     opening_wide_revenue   R-squared (uncentered):                   0.860
Model:                              OLS   Adj. R-squared (uncentered):              0.858
Method:                   Least Squares   F-statistic:                              405.0
Date:                  Sat, 09 Nov 2024   Prob (F-statistic):                   1.09e-273
Time:                          15:13:51   Log-Likelihood:                         -12140.
No. Observations:                   669   AIC:                                  2.430e+04
Df Residuals:                       659   BIC:                                  2.434e+04
Df Model:                            10                                                  
Covariance Type:              nonrobust                                                  
                                               coef    std err          t      P>|t|      [0.025    

In [74]:
import plotly.express as px
import plotly.graph_objects as go

# Actual vs. predicted opening revenue for the held-out rows.
# The title / rating metadata is taken from X_test rather than the full X:
# pandas aligns the columns of this frame on the union of their indices, so
# using X would add one row per training film with NaN actual/predicted.
results_df = pd.DataFrame({
    'actual': y_test,
    'predicted': y_pred,
    'title': X_test['title'],
    'mpaa_rating': X_test['mpaa_rating']
})

# Sorted by actual revenue so the y=x reference line is drawn left to right.
results_df = results_df.sort_values(by='actual')

fig = px.scatter(
    results_df,
    x='actual',
    y='predicted',
    hover_name='title',
    color='mpaa_rating',
    title='Actual vs. predicted opening_wide_revenue (test set)'
)
# Reference line: points on it are predicted exactly.
fig.add_trace(go.Scatter(x=results_df['actual'], y=results_df['actual'], mode='lines', name='y=x'))
fig.show()

In [75]:
# Log / sqrt transformed model: add the transformed columns, fit a formula
# OLS on log revenue, then evaluate on the original (un-logged) scale.

# NOTE: these columns are written onto the shared `df` imported from
# boxoffice.modeling.cleaned_data. Re-running the cell just recomputes the
# same columns. Several of them are not used by the formula below.
df['log_opening_wide_revenue'] = np.log(df['opening_wide_revenue'])
df['sqrt_opening_wide_revenue'] = np.sqrt(df['opening_wide_revenue'])
df['log_budget'] = np.log(df['budget'])
for column in [
    'budget',
    'wikipedia_pre_release_monday_views',
    'sum_cast_box_office',
    'sum_crew_box_office',
    'max_trailer_views',
    'total_trailer_views',
    'top_3_trailer_views',
]:
    df[f'sqrt_{column}'] = np.sqrt(df[column])

# "0 +" removes the intercept term; "*" expands to all main effects and
# interactions of the four variables it joins.
formula = "log_opening_wide_revenue ~ 0 + metacritic_before_wide_friday_calculated * log_budget * sqrt_wikipedia_pre_release_monday_views * weighted_crew_median_box_office + weighted_cast_median_box_office + in_franchise + top_5_trailer_views"

# The whole frame is split because the formula picks its columns by name.
# A fixed seed keeps the split and the reported metrics reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df, df['log_opening_wide_revenue'], test_size=0.2, random_state=42
)

model = smf.ols(formula=formula, data=X_train).fit()

print(model.summary())

# create an actual vs predicted plot
y_pred = model.predict(X_test)

# Metadata comes from this cell's own X_test, so the frame holds only test
# rows and does not depend on an `X` variable left over from an earlier cell.
results_df = pd.DataFrame({
    'actual': y_test,
    'predicted': y_pred,
    'title': X_test['title'],
    'mpaa_rating': X_test['mpaa_rating']
})

# drop rows with missing values (in the prediction, title or rating)
results_df = results_df.dropna()

# undo the log transformation so the plot and the error are in revenue units
results_df['actual'] = np.exp(results_df['actual'])
results_df['predicted'] = np.exp(results_df['predicted'])

fig = px.scatter(
    results_df,
    x='actual',
    y='predicted',
    hover_name='title',
    color='mpaa_rating',
    title='Actual vs. predicted opening_wide_revenue (log model, test set)'
)
# Reference line: points on it are predicted exactly.
fig.add_trace(go.Scatter(x=results_df['actual'], y=results_df['actual'], mode='lines', name='y=x'))
fig.show()

# print the mean squared error (computed on the back-transformed values)
print('Mean squared error:', mean_squared_error(results_df['actual'], results_df['predicted']))

                               OLS Regression Results                               
Dep. Variable:     log_opening_wide_revenue   R-squared:                       0.654
Model:                                  OLS   Adj. R-squared:                  0.645
Method:                       Least Squares   F-statistic:                     72.32
Date:                      Sat, 09 Nov 2024   Prob (F-statistic):          2.54e-137
Time:                              15:13:51   Log-Likelihood:                -622.15
No. Observations:                       669   AIC:                             1280.
Df Residuals:                           651   BIC:                             1361.
Df Model:                                17                                         
Covariance Type:                  nonrobust                                         
                                                                                                                                  coef    std err     

Mean squared error: 909461112852873.1
