In [1]:
# imports
from boxoffice.modeling.cleaned_data import X_train, y_train, X_test, y_test
from sklearn.metrics import root_mean_squared_error
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# import statsmodels.formula.api as smf
import statsmodels.api as sm

MOVIES_DB_PATH: boxoffice/db/data/data.sqlite
MOVIES_DB_PATH: ../boxoffice/db/data/data.sqlite
movies.csv exists, 1731906929.6094918, 1731906849.873322
Reading from movies.csv
Index([], dtype='object')


In [2]:
X_train['opening_wide_revenue'] = y_train
X_test['opening_wide_revenue'] = y_test
X_train['production_method'] = X_train['production_method'].replace('Stop-Motion Animation', 'Digital Animation')
X_test['production_method'] = X_test['production_method'].replace('Stop-Motion Animation', 'Digital Animation')

  X_test['opening_wide_revenue'] = y_test


In [3]:
formula1 = "opening_wide_revenue ~ 0 + top_5_trailer_views + budget + wikipedia_pre_release_cumulative_views + wikipedia_pre_release_cumulative_views : in_franchise : production_method"
formula2 = "opening_wide_revenue ~ 0 + budget + wikipedia_pre_release_cumulative_views + wikipedia_pre_release_cumulative_views : in_franchise : production_method"
formula3 = "opening_wide_revenue ~ 0 + budget + wikipedia_pre_release_monday_views + wikipedia_pre_release_monday_views : in_franchise : production_method"

formula = formula2

rlm_model = sm.RLM.from_formula(formula, data=X_train).fit()

pred_simple_glm = rlm_model.predict(X_test)

print("RMSE:", root_mean_squared_error(y_test, pred_simple_glm))
print(rlm_model.summary())

train_pred = rlm_model.predict(X_train)

training_data = pd.DataFrame(
    {
        "actual": y_train,
        "predicted": train_pred,
        "title": X_train["title"],
        "production_method": X_train["production_method"],
    }
)

# write to a file
training_data.to_csv("training_data.csv")

fig = px.scatter(training_data, x="actual", y="predicted", hover_name="title", color="production_method")
fig.add_trace(go.Scatter(x=training_data["actual"], y=training_data["actual"], mode="lines", name="y=x"))
fig.update_layout(title="Actual vs Predicted Opening Weekend Revenue", xaxis_title="Actual", yaxis_title="Predicted")

RMSE: 21276183.441054396
                     Robust linear Model Regression Results                     
Dep. Variable:     opening_wide_revenue   No. Observations:                  677
Model:                              RLM   Df Residuals:                      672
Method:                            IRLS   Df Model:                            4
Norm:                            HuberT                                         
Scale Est.:                         mad                                         
Cov Type:                            H1                                         
Date:                  Mon, 25 Nov 2024                                         
Time:                          16:47:17                                         
No. Iterations:                       2                                         
                                                                                                   coef    std err          z      P>|z|      [0.025      0.975]
----

In [4]:
results_df = pd.DataFrame({'title': X_test['title'], 'actual': y_test, 'predicted': pred_simple_glm, 'production_method': X_test['production_method']})

# plot actual vs predicted
fig = px.scatter(results_df, x='actual', y='predicted', hover_name='title', color='production_method')
fig.add_trace(go.Scatter(x=results_df['actual'], y=results_df['actual'], mode='lines', name='y=x'))
fig.update_layout(title='Actual vs Predicted Opening Weekend Revenue', xaxis_title='Actual', yaxis_title='Predicted')

In [5]:
# plot the training residuals
training_data['average_actual_predicted'] = (training_data['actual'] + training_data['predicted']) / 2
training_data['residual'] = training_data['actual'] - training_data['predicted']
# create facet grid with two plots, one with actual vs residual and one with predicted vs residual
fig = px.scatter(training_data, x='average_actual_predicted', y='residual', color='production_method')
fig.add_trace(go.Scatter(x=training_data['actual'], y=[0]*len(training_data), mode='lines', name='y=0'))
fig.update_layout(title='(Actual + Predicted) / 2 vs Residuals', xaxis_title='(Actual + Predicted) / 2', yaxis_title='Residual')
# draw lines at y=x and y=-x dashed
fig.add_trace(go.Scatter(x=training_data['actual'], y=training_data['actual'], mode='lines', name='y=x', line=dict(dash='dash')))
fig.add_trace(go.Scatter(x=training_data['actual'], y=-training_data['actual'], mode='lines', name='y=-x', line=dict(dash='dash')))

In [6]:
# create a dataframe with testing and training combined and then a column for whether it is testing or training
results_df['residual'] = results_df['actual'] - results_df['predicted']

combined_data = pd.concat([training_data, results_df])
combined_data['data_type'] = ['training'] * len(training_data) + ['testing'] * len(results_df)

# residuals as a percent difference of opening weekend
combined_data['percent_residual'] = (combined_data['actual'] - combined_data['predicted']) / (
    (combined_data['actual'] + combined_data['predicted']) / 2
)
fig = px.scatter(combined_data, x='predicted', y='percent_residual', color='data_type', hover_name='title')
fig.add_trace(go.Scatter(x=combined_data['actual'], y=[0]*len(combined_data), mode='lines', name='y=0'))
fig.update_layout(title='Predicted vs Percent Residuals', xaxis_title='Predicted', yaxis_title='Percent Residual')