In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import VotingRegressor
import pickle

In [2]:
# Load the raw movie table; the path is relative to the notebook's working directory.
movies_csv = "rotten_tomatoes_movies.csv"
df = pd.read_csv(movies_csv)

In [3]:
# Keep only rows where both score columns are present.
df = df.dropna(axis=0, how="any", subset=["audienceScore", "tomatoMeter"])

In [4]:
# Multi-hot encode the ', '-separated `genre` column: one indicator column per genre.
# `explode` yields one entry per (movie, genre) pair that keeps the movie's index label,
# which replaces the much slower `.apply(pd.Series).stack()` construction. Rows with a
# missing genre are dropped here, exactly as `stack()` dropped them before.
genre_per_movie = df['genre'].str.split(', ').explode().dropna()
# NOTE(review): drop_first=True removes the first genre column in sorted order. Movies can
# carry several genres at once, so that genre's signal is discarded rather than merely
# made implicit. Kept to preserve the existing feature set -- confirm this is intended.
encoded_genres = pd.get_dummies(genre_per_movie, drop_first=True).groupby(level=0).sum()
# Movies without a genre are absent from `encoded_genres`, so the concat leaves NaN in
# their genre columns.
df = pd.concat([df, encoded_genres], axis=1).drop(columns=['genre'])

In [5]:
# df = df.drop(['genre'], axis=1)

In [6]:
# Remove the two rating columns from the working frame.
df = df.drop(columns=['rating', 'ratingContents'])

In [7]:
# Combine releaseDateTheaters and releaseDateStreaming into one column:
# use the theatrical date when present, otherwise fall back to the streaming date.
theater_dates = pd.to_datetime(df['releaseDateTheaters'])
streaming_dates = pd.to_datetime(df['releaseDateStreaming'])
# Keep the combined column as datetime64. It used to be formatted to '%Y-%m-%d' text here,
# but the step that derives release_year/release_month parses it with pd.to_datetime
# again, so the string round trip was redundant.
df['combinedReleaseDate'] = theater_dates.fillna(streaming_dates)
df = df.drop(columns=['releaseDateTheaters', 'releaseDateStreaming'])

In [8]:
# Remove the distributor and soundMix columns from the working frame.
df = df.drop(columns=['distributor', 'soundMix'])

In [9]:
# Derive release_year and release_month from combinedReleaseDate, then discard the date itself.
release_dates = pd.to_datetime(df['combinedReleaseDate'])
df['release_year'] = release_dates.dt.year
df['release_month'] = release_dates.dt.month
df = df.drop(columns=['combinedReleaseDate'])

In [10]:
#converts box office numbers into more useable numbers
def preprocess_box_office(value):
    """Convert a box-office value such as ``"$1.2M"`` into a float.

    The value is converted to text, surrounding whitespace, ``$`` signs and
    thousands separators (``,``) are removed, and a trailing ``K``/``M``/``B``
    suffix (case-insensitive) scales the number by 1e3/1e6/1e9.

    Parameters
    ----------
    value : object
        Raw cell value; typically a string like ``"$250.0K"`` or a float NaN.

    Returns
    -------
    float
        The parsed amount, or NaN when ``value`` is ``None``, NaN or blank.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if value is None:
        return float('nan')

    text = str(value).strip().replace('$', '').replace(',', '')
    if not text:
        return float('nan')

    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    suffix = text[-1].upper()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    # A float NaN arrives here as the text 'nan', which float() turns back into NaN.
    return float(text)

# Parse every boxOffice entry into a float, element by element.
df['boxOffice'] = df['boxOffice'].map(preprocess_box_office)

In [11]:
# Separate frame for the box-office experiment (the author notes boxOffice seems to have
# the most NaN values): keep only the needed columns and drop incomplete rows.
columns_to_select_box = ['audienceScore', 'tomatoMeter', 'boxOffice', 'id']
df_box_office = df.loc[:, columns_to_select_box].dropna()

In [12]:
# Separate frames for the director-only and writer-only experiments, each restricted
# to rows where all of its selected columns are present.
columns_to_select = ['audienceScore', 'tomatoMeter', 'director', 'id']
columns_to_select_writer = ['audienceScore', 'tomatoMeter', 'writer', 'id']

df_director = df[columns_to_select].dropna()
df_writer = df[columns_to_select_writer].dropna()

In [13]:
# Frame holding the runtime/release-date features plus the targets, for later use.
columns_to_select = ['runtimeMinutes', 'release_year', 'release_month', 'audienceScore', 'tomatoMeter', 'id']
df_other = df[columns_to_select]
# The main frame drops title and boxOffice, then every row that still has a missing value.
df = df.drop(columns=['title', 'boxOffice']).dropna()

In [14]:
#function that calculates the smoothed means of directors and writers, created because this code is reused a couple times
def _smoothed_target_means(df, key, m, targets=('tomatoMeter', 'audienceScore')):
    """Return ``{target: Series}`` of smoothed per-``key`` means of each target column."""
    stats = df.groupby(key)[list(targets)].agg(['mean', 'count'])
    smoothed = {}
    for target in targets:
        count = stats[target, 'count']
        # Weighted blend of the group's own mean and the overall mean of the column.
        smoothed[target] = (
            (count * stats[target, 'mean']) + (m * df[target].mean())
        ) / (count + m)
    return smoothed


def calculate_smoothed_means(df, m):
    """Add smoothed mean-score features for each row's director and writer.

    For every distinct ``director`` (and likewise ``writer``) and each of the
    columns ``tomatoMeter`` and ``audienceScore`` the feature is

        (count * group_mean + m * overall_mean) / (count + m)

    so groups with few rows are pulled towards the overall mean, more strongly
    for larger ``m``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``director``, ``writer``, ``tomatoMeter`` and
        ``audienceScore`` columns.
    m : float
        Smoothing weight given to the overall mean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``director_<score>_smoothed_mean`` and
        ``writer_<score>_smoothed_mean`` columns added (or overwritten if they
        already exist). The input frame is left untouched.

    Notes
    -----
    The statistics are computed from the score columns of every row passed in,
    including each row's own scores. If the result is split into train and test
    sets afterwards, the test rows' scores have already influenced the features.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    for key in ('director', 'writer'):
        for target, per_key_mean in _smoothed_target_means(df, key, m).items():
            df[f'{key}_{target}_smoothed_mean'] = df[key].map(per_key_mean)
    return df

# Attach the director/writer smoothed-mean features to the main frame (smoothing weight 0.1).
df = df.pipe(calculate_smoothed_means, m=0.1)


In [15]:
# Feature matrix: everything except the two targets, the raw name columns, the id and the language.
X = df.drop(columns=['audienceScore', 'tomatoMeter', 'director', 'writer', 'id', 'originalLanguage'])

In [16]:
# features = df.drop(['audienceScore', 'tomatoMeter', 'director', 'writer', 'id'], axis=1)  

# categorical_columns = ['originalLanguage']  
# encoder = OneHotEncoder(drop='first', sparse_output=False)
# encoded_categorical = encoder.fit_transform(df[categorical_columns])

# encoded_feature_names = encoder.get_feature_names_out(input_features=categorical_columns)
# encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoded_feature_names)
# df = pd.concat([df.reset_index(drop=True), encoded_categorical_df], axis=1)
# X = pd.concat([features.reset_index(drop=True), encoded_categorical_df], axis=1)
# X = X.drop(['originalLanguage'], axis=1)
# Two regression targets, in this column order: tomatoMeter, audienceScore.
y = df.loc[:, ['tomatoMeter', 'audienceScore']]


In [17]:
# First model: linear regression on the full feature matrix, 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

base_estimator = RandomForestRegressor(random_state=42)

# multioutput_regressor = MultiOutputRegressor(base_estimator)
# multioutput_regressor.fit(X_train, y_train)
print(len(y_test))
model_1 = LinearRegression().fit(X_train, y_train)
y_pred = model_1.predict(X_test)
# The error is measured on predictions rounded to whole numbers.
y_pred_round = y_pred.round()
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_round)

print("Mean Squared Error:", mse)

4613
Mean Squared Error: 69.78625623238673


In [18]:
# R-squared of the first model, computed on the rounded predictions.
r_squared = r2_score(y_true=y_test, y_pred=y_pred_round)
print("R-squared:", r_squared)

R-squared: 0.8829647226980867


In [19]:
# One single-column frame per target: column 0 of the rounded predictions is used for
# tomatoMeter and column 1 for audienceScore.
predicted_tomatoMeter_df = pd.DataFrame(y_pred_round[:, 0], columns=['Predicted_tomatoMeter'])
predicted_audienceScore_df = pd.DataFrame(y_pred_round[:, 1], columns=['Predicted_audienceScore'])

# Write each frame to its own CSV file without the index column.
predicted_tomatoMeter_df.to_csv(path_or_buf='predicted_tomatoMeter.csv', index=False)
predicted_audienceScore_df.to_csv(path_or_buf='predicted_audienceScore.csv', index=False)

In [20]:
# Create DataFrames for each predicted column
# predicted_tomatoMeter_df = pd.DataFrame({'Predicted_tomatoMeter': y_test[:, 0]})
# predicted_audienceScore_df = pd.DataFrame({'Predicted_audienceScore': y_test[:, 1]})

# Save the DataFrames to separate CSV files
# Write the held-out true scores to CSV without the index column.
y_test.to_csv(path_or_buf='y_test.csv', index=False)
# predicted_audienceScore_df.to_csv('audienceScore.csv', index=False)

In [21]:
# Show the column names of the feature matrix X used by the first model.
print(X.columns)

Index(['runtimeMinutes', 'Adventure', 'Animation', 'Anime', 'Biography',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Entertainment', 'Fantasy',
       'Foreign', 'Gay & lesbian', 'History', 'Holiday', 'Horror',
       'Kids & family', 'Lgbtq+', 'Music', 'Musical', 'Mystery & thriller',
       'Nature', 'News', 'Other', 'Romance', 'Sci-fi', 'Short',
       'Special interest', 'Sports', 'Sports & fitness', 'Stand-up', 'Variety',
       'War', 'Western', 'release_year', 'release_month',
       'director_tomatoMeter_smoothed_mean',
       'director_audienceScore_smoothed_mean',
       'writer_tomatoMeter_smoothed_mean',
       'writer_audienceScore_smoothed_mean'],
      dtype='object')


In [22]:
# X_genre_language = X
# X_genre_language = X.drop(['director_tomatoMeter_smoothed_mean', 
#                      'director_audienceScore_smoothed_mean', 'writer_tomatoMeter_smoothed_mean', 'writer_audienceScore_smoothed_mean', "runtimeMinutes"
#            , 'release_month', 'release_year'], axis=1)
# y_genre_language = y
# #first model trained
# X_train_genre_language, X_test_genre_language, y_train_genre_language, y_test_genre_language = train_test_split(X_genre_language, y_genre_language, test_size=0.2, random_state=42)

# base_estimator = RandomForestRegressor(random_state=42)

# # multioutput_regressor = MultiOutputRegressor(base_estimator)
# # multioutput_regressor.fit(X_train, y_train)

# genre_language = LinearRegression()
# genre_language.fit(X_train_genre_language, y_train_genre_language)
# y_pred_genre_language = genre_language.predict(X_test_genre_language)
# mse_genre_language = mean_squared_error(y_test_genre_language, y_pred_genre_language)

# print("Mean Squared Error:", mse_genre_language)

In [23]:
# r_squared_genre_language = r2_score(y_test_genre_language, y_pred_genre_language)
# print("R-squared:", r_squared_genre_language)

In [24]:
# Smoothed mean scores per director and per writer, computed on the director-only and
# writer-only frames: (count * group_mean + m * overall_mean) / (count + m).
m = 0.1
score_columns = ['tomatoMeter', 'audienceScore']

# Directors: per-director statistics, a lookup dict per score, and the mapped feature columns.
director_stats = df_director.groupby('director')[score_columns].agg(['mean', 'count'])
director_smoothed_map = {}
for column in score_columns:
    group_size = director_stats[column, 'count']
    weighted_sum = (group_size * director_stats[column, 'mean']) + (m * df_director[column].mean())
    director_stats[column, 'smoothed_mean'] = weighted_sum / (group_size + m)
    director_smoothed_map[column] = director_stats[column, 'smoothed_mean'].to_dict()
    df_director[f'director_{column}_smoothed_mean'] = df_director['director'].map(director_smoothed_map[column])

# Writers: identical computation on the writer frame.
writer_stats = df_writer.groupby('writer')[score_columns].agg(['mean', 'count'])
writer_smoothed_map = {}
for column in score_columns:
    group_size = writer_stats[column, 'count']
    weighted_sum = (group_size * writer_stats[column, 'mean']) + (m * df_writer[column].mean())
    writer_stats[column, 'smoothed_mean'] = weighted_sum / (group_size + m)
    writer_smoothed_map[column] = writer_stats[column, 'smoothed_mean'].to_dict()
    df_writer[f'writer_{column}_smoothed_mean'] = df_writer['writer'].map(writer_smoothed_map[column])

In [25]:
# Features (the two smoothed means) and targets (the two scores) for the
# director-only and writer-only models.
X_director = df_director.loc[:, ['director_tomatoMeter_smoothed_mean', 'director_audienceScore_smoothed_mean']]
y_director = df_director.loc[:, ['tomatoMeter', 'audienceScore']]

X_writer = df_writer.loc[:, ['writer_tomatoMeter_smoothed_mean', 'writer_audienceScore_smoothed_mean']]
y_writer = df_writer.loc[:, ['tomatoMeter', 'audienceScore']]

In [26]:
# Train a linear regression on the director smoothed-mean features only (80/20 split, fixed seed).
X_train_director, X_test_director, y_train_director, y_test_director = train_test_split(X_director, y_director, test_size=0.2, random_state=42)

model_director = LinearRegression()
model_director.fit(X_train_director, y_train_director)

y_pred_director = model_director.predict(X_test_director)
mse_director = mean_squared_error(y_test_director, y_pred_director)
# This model sees director features only, so the label names the director model;
# the writer model is trained and reported separately.
print("MSE for director model:", mse_director)

MSE for director and writer model: 205.45602540514565


In [27]:
# R-squared of the director model on its test split.
r_squared_director = r2_score(y_true=y_test_director, y_pred=y_pred_director)
print("R-squared:", r_squared_director)

R-squared: 0.6632966459572744


In [28]:
# Train a linear regression on the writer smoothed-mean features only (80/20 split, fixed seed).
X_train_writer, X_test_writer, y_train_writer, y_test_writer = train_test_split(
    X_writer, y_writer, test_size=0.2, random_state=42
)

model_writer = LinearRegression().fit(X_train_writer, y_train_writer)

y_pred_writer = model_writer.predict(X_test_writer)
mse_writer = mean_squared_error(y_true=y_test_writer, y_pred=y_pred_writer)
print("MSE for writer model:", mse_writer)

MSE for writer model: 78.51693745731163


In [29]:
# R-squared of the writer model on its test split.
r_squared_writer = r2_score(y_true=y_test_writer, y_pred=y_pred_writer)
print("R-squared:", r_squared_writer)

R-squared: 0.8641949258821622


In [30]:
# Model using only runtime, release year and release month.
df_other = df_other.dropna()
X_other = df_other.drop(columns=['tomatoMeter', 'audienceScore', 'id'])
y_other = df_other.loc[:, ['tomatoMeter', 'audienceScore']]

X_train_other, X_test_other, y_train_other, y_test_other = train_test_split(
    X_other, y_other, test_size=0.2, random_state=42
)

model_other = LinearRegression().fit(X_train_other, y_train_other)

y_pred_other = model_other.predict(X_test_other)
mse_other = mean_squared_error(y_true=y_test_other, y_pred=y_pred_other)
print("MSE for other features model:", mse_other)

MSE for other features model: 599.2608379422254


In [31]:
# R-squared of the other-features model on its test split.
r_squared_other = r2_score(y_true=y_test_other, y_pred=y_pred_other)
print("R-squared:", r_squared_other)

R-squared: 0.026360070158132964


In [32]:
# Model using boxOffice as the only feature.
X_box_office = df_box_office.drop(columns=['tomatoMeter', 'audienceScore', 'id'])
y_box_office = df_box_office.loc[:, ['tomatoMeter', 'audienceScore']]

X_train_box_office, X_test_box_office, y_train_box_office, y_test_box_office = train_test_split(
    X_box_office, y_box_office, test_size=0.2, random_state=42
)

model_box_office = LinearRegression().fit(X_train_box_office, y_train_box_office)

y_pred_box_office = model_box_office.predict(X_test_box_office)
mse_box_office = mean_squared_error(y_true=y_test_box_office, y_pred=y_pred_box_office)
print("MSE for box office model:", mse_box_office)

MSE for box office model: 573.5385992075446


In [33]:
# R-squared of the box-office model on its test split.
r_squared_box_office = r2_score(y_true=y_test_box_office, y_pred=y_pred_box_office)
print("R-squared:", r_squared_box_office)

R-squared: 0.004321882246269504


In [34]:
# Experiment: give every distinct director string and writer string an integer id
# (1, 2, 3, ... in order of first appearance) to see whether that trains better.
director_names = df['director'].unique()
director_mapping = pd.DataFrame({
    'director': director_names,
    'director_id': range(1, len(director_names) + 1),
})

writer_names = df['writer'].unique()
writer_mapping = pd.DataFrame({
    'writer': writer_names,
    'writer_id': range(1, len(writer_names) + 1),
})

In [35]:
# Attach director_id and writer_id to each row via left joins on the name columns.
df = (
    df
    .merge(director_mapping, on='director', how='left')
    .merge(writer_mapping, on='writer', how='left')
)

In [36]:
# Train the id model: the smoothed-mean columns are excluded so the director/writer
# information enters only through director_id and writer_id.
X_id = df.drop(columns=[
    'tomatoMeter', 'audienceScore', 'director', 'writer', 'originalLanguage',
    'director_tomatoMeter_smoothed_mean', 'director_audienceScore_smoothed_mean',
    'writer_tomatoMeter_smoothed_mean', 'writer_audienceScore_smoothed_mean',
    'id',
])
y_id = df.loc[:, ['tomatoMeter', 'audienceScore']]

X_train_id, X_test_id, y_train_id, y_test_id = train_test_split(
    X_id, y_id, test_size=0.2, random_state=42
)

model_id = LinearRegression().fit(X_train_id, y_train_id)

y_pred_id = model_id.predict(X_test_id)
mse_id = mean_squared_error(y_true=y_test_id, y_pred=y_pred_id)
print("MSE for id model:", mse_id)

MSE for id model: 511.17687133252684


In [37]:
# R-squared of the id model on its test split.
r_squared_id = r2_score(y_true=y_test_id, y_pred=y_pred_id)
print("R-squared:", r_squared_id)

R-squared: 0.16684670623871078


In [38]:
# Create a "long" dataframe: a row that lists several directors or writers is split
# into one row per (director, writer) combination.
# Split on commas with optional surrounding whitespace so every name is stored trimmed.
# A bare ',' split would keep any space that follows a comma inside the name, and the
# same person could then be grouped under two different spellings. The pattern is longer
# than one character, so pandas treats it as a regular expression.
name_separator = r'\s*,\s*'
df['director'] = df['director'].str.strip().str.split(name_separator)
df['writer'] = df['writer'].str.strip().str.split(name_separator)

df_long = df.explode('director').explode('writer').reset_index(drop=True)
df_long.head()

Unnamed: 0,id,audienceScore,tomatoMeter,runtimeMinutes,originalLanguage,director,writer,Adventure,Animation,Anime,...,War,Western,release_year,release_month,director_tomatoMeter_smoothed_mean,director_audienceScore_smoothed_mean,writer_tomatoMeter_smoothed_mean,writer_audienceScore_smoothed_mean,director_id,writer_id
0,adrift_2018,65.0,69.0,120.0,English,Baltasar Kormákur,Aaron Kandell,1.0,0.0,0.0,...,0.0,0.0,2018.0,6.0,63.609385,62.912446,68.439039,64.684587,1,1
1,adrift_2018,65.0,69.0,120.0,English,Baltasar Kormákur,Jordan Kandell,1.0,0.0,0.0,...,0.0,0.0,2018.0,6.0,63.609385,62.912446,68.439039,64.684587,1,1
2,adrift_2018,65.0,69.0,120.0,English,Baltasar Kormákur,David Branson Smith,1.0,0.0,0.0,...,0.0,0.0,2018.0,6.0,63.609385,62.912446,68.439039,64.684587,1,1
3,1035316-born_to_kill,74.0,83.0,92.0,English,Robert Wise,Eve Greene,0.0,0.0,0.0,...,0.0,0.0,1947.0,4.0,75.045786,69.190237,81.166311,72.866405,2,2
4,1035316-born_to_kill,74.0,83.0,92.0,English,Robert Wise,Richard Macaulay,0.0,0.0,0.0,...,0.0,0.0,1947.0,4.0,75.045786,69.190237,81.166311,72.866405,2,2


In [39]:
# Recompute the smoothed means per individual director and writer on the long frame,
# reusing calculate_smoothed_means with the same smoothing weight.
df_long = df_long.pipe(calculate_smoothed_means, m=0.1)

In [40]:
# Train a linear regression on the long frame (80/20 split, fixed seed).
X_long = df_long.drop(columns=['tomatoMeter', 'audienceScore', 'director', 'writer', 'originalLanguage', 'id'])
y_long = df_long.loc[:, ['tomatoMeter', 'audienceScore']]

X_train_long, X_test_long, y_train_long, y_test_long = train_test_split(
    X_long, y_long, test_size=0.2, random_state=42
)

model_long = LinearRegression().fit(X_train_long, y_train_long)

y_pred_long = model_long.predict(X_test_long)
mse_long = mean_squared_error(y_true=y_test_long, y_pred=y_pred_long)
print("MSE for long model:", mse_long)


MSE for long model: 137.79273019941078


In [41]:
# R-squared of the long model on its test split.
r_squared_long = r2_score(y_true=y_test_long, y_pred=y_pred_long)
print("R-squared:", r_squared_long)

R-squared: 0.7714472531191752


In [42]:
# Train on the long frame with the smoothed-mean columns excluded, so the director/writer
# information enters only through director_id and writer_id.
X_id_long = df_long.drop(columns=[
    'tomatoMeter', 'audienceScore', 'director', 'writer', 'originalLanguage',
    'director_tomatoMeter_smoothed_mean', 'director_audienceScore_smoothed_mean',
    'writer_tomatoMeter_smoothed_mean', 'writer_audienceScore_smoothed_mean',
    'id',
])
y_id_long = df_long.loc[:, ['tomatoMeter', 'audienceScore']]

X_train_id_long, X_test_id_long, y_train_id_long, y_test_id_long = train_test_split(
    X_id_long, y_id_long, test_size=0.2, random_state=42
)

model_id_long = LinearRegression().fit(X_train_id_long, y_train_id_long)

y_pred_id_long = model_id_long.predict(X_test_id_long)
mse_id_long = mean_squared_error(y_true=y_test_id_long, y_pred=y_pred_id_long)
print("MSE for id_long model:", mse_id_long)


MSE for id_long model: 508.7705896729193


In [43]:
# R-squared of the id_long model on its test split.
r_squared_id_long = r2_score(y_true=y_test_id_long, y_pred=y_pred_id_long)
print("R-squared:", r_squared_id_long)

R-squared: 0.18044478359261018
