In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf

In [2]:
# Load cleaned_movies into DataFrame
movies_df = pd.read_csv('movies_final.csv')

# Drop Unnamed column
movies_df = movies_df.loc[:,~movies_df.columns.str.contains('^Unnamed')]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7407,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.8,42000.0,Chris Sanders,Michael Green,Harrison Ford,Canada,111105497.0,20th Century Studios,100.0
7408,The Eight Hundred,Not Rated,Action,2020,"August 28, 2020 (United States)",6.8,3700.0,Hu Guan,Hu Guan,Zhi-zhong Huang,China,461421559.0,Beijing Diqi Yinxiang Entertainment,149.0
7409,The Quarry,R,Crime,2020,"April 17, 2020 (Mexico)",5.4,2400.0,Scott Teems,Scott Teems,Shea Whigham,United States,3661.0,Prowess Pictures,98.0
7410,Tulsa,PG-13,Comedy,2020,"June 3, 2020 (United States)",5.0,294.0,Scott Pryor,Scott Pryor,Scott Pryor,United States,413378.0,Pryor Entertainment,120.0


In [3]:
# Determine if genre needs binning
genres = movies_df['genre'].value_counts()

# Bin genres with less than 5 movies into 'Other'
replace_genre = list(genres[genres < 5].index)

# Replace in DataFrame
for genre in replace_genre:
    movies_df['genre'] = movies_df['genre'].replace(genre,'Other')

In [4]:
# Remove directors with only 1 film
movies_df = movies_df[movies_df['director'].map(movies_df['director'].value_counts()) != 1]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7399,Tenet,PG-13,Action,2020,"September 3, 2020 (United States)",7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,363656624.0,Warner Bros.,150.0
7400,Wonder Woman 1984,PG-13,Action,2020,"December 25, 2020 (United States)",5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,166534027.0,Atlas Entertainment,151.0
7402,The Invisible Man,R,Drama,2020,"February 28, 2020 (United States)",7.1,186000.0,Leigh Whannell,Leigh Whannell,Elisabeth Moss,Canada,143151000.0,Universal Pictures,124.0
7403,Onward,PG,Animation,2020,"March 6, 2020 (United States)",7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,141950608.0,Walt Disney Pictures,102.0


In [5]:
# Remove writers with only 1 film
movies_df = movies_df[movies_df['writer'].map(movies_df['writer'].value_counts()) != 1]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
5,Friday the 13th,R,Horror,1980,"May 9, 1980 (United States)",6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,39754601.0,Paramount Pictures,95.0
6,The Blues Brothers,R,Action,1980,"June 20, 1980 (United States)",7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,115229890.0,Universal Pictures,133.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7399,Tenet,PG-13,Action,2020,"September 3, 2020 (United States)",7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,363656624.0,Warner Bros.,150.0
7400,Wonder Woman 1984,PG-13,Action,2020,"December 25, 2020 (United States)",5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,166534027.0,Atlas Entertainment,151.0
7402,The Invisible Man,R,Drama,2020,"February 28, 2020 (United States)",7.1,186000.0,Leigh Whannell,Leigh Whannell,Elisabeth Moss,Canada,143151000.0,Universal Pictures,124.0
7403,Onward,PG,Animation,2020,"March 6, 2020 (United States)",7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,141950608.0,Walt Disney Pictures,102.0


In [6]:
# Remove stars with only 1 film
movies_df = movies_df[movies_df['star'].map(movies_df['star'].value_counts()) != 1]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
5,Friday the 13th,R,Horror,1980,"May 9, 1980 (United States)",6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,39754601.0,Paramount Pictures,95.0
6,The Blues Brothers,R,Action,1980,"June 20, 1980 (United States)",7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,115229890.0,Universal Pictures,133.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7351,Overcomer,PG,Drama,2019,"August 23, 2019 (United States)",6.6,4100.0,Alex Kendrick,Alex Kendrick,Alex Kendrick,United States,38026103.0,Affirm Films,119.0
7352,A Madea Family Funeral,PG-13,Comedy,2019,"March 1, 2019 (United States)",4.5,4000.0,Tyler Perry,Tyler Perry,Tyler Perry,United States,74747725.0,The Tyler Perry Company,109.0
7395,The Other Side of Heaven 2: Fire of Faith,PG-13,Adventure,2019,"June 28, 2019 (United States)",5.6,336.0,Mitch Davis,Mitch Davis,Christopher Gorham,United States,1807216.0,Kolipoki Pictures,117.0
7403,Onward,PG,Animation,2020,"March 6, 2020 (United States)",7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,141950608.0,Walt Disney Pictures,102.0


In [7]:
# Remove companies with only 1 film
movies_df = movies_df[movies_df['company'].map(movies_df['company'].value_counts()) != 1]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
5,Friday the 13th,R,Horror,1980,"May 9, 1980 (United States)",6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,39754601.0,Paramount Pictures,95.0
6,The Blues Brothers,R,Action,1980,"June 20, 1980 (United States)",7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,115229890.0,Universal Pictures,133.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7326,Don't Let Go,R,Drama,2019,"August 30, 2019 (United States)",6.4,8000.0,Jacob Estes,Jacob Estes,David Oyelowo,United States,5288011.0,Blumhouse Productions,103.0
7342,Missing Link,PG,Animation,2019,"April 12, 2019 (United States)",6.7,23000.0,Chris Butler,Chris Butler,Hugh Jackman,Canada,26565710.0,Annapurna Pictures,93.0
7352,A Madea Family Funeral,PG-13,Comedy,2019,"March 1, 2019 (United States)",4.5,4000.0,Tyler Perry,Tyler Perry,Tyler Perry,United States,74747725.0,The Tyler Perry Company,109.0
7403,Onward,PG,Animation,2020,"March 6, 2020 (United States)",7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,141950608.0,Walt Disney Pictures,102.0


In [8]:
# Remove scores with only 1 value
movies_df = movies_df[movies_df['score'].map(movies_df['score'].value_counts()) != 1]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
5,Friday the 13th,R,Horror,1980,"May 9, 1980 (United States)",6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,39754601.0,Paramount Pictures,95.0
6,The Blues Brothers,R,Action,1980,"June 20, 1980 (United States)",7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,115229890.0,Universal Pictures,133.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7326,Don't Let Go,R,Drama,2019,"August 30, 2019 (United States)",6.4,8000.0,Jacob Estes,Jacob Estes,David Oyelowo,United States,5288011.0,Blumhouse Productions,103.0
7342,Missing Link,PG,Animation,2019,"April 12, 2019 (United States)",6.7,23000.0,Chris Butler,Chris Butler,Hugh Jackman,Canada,26565710.0,Annapurna Pictures,93.0
7352,A Madea Family Funeral,PG-13,Comedy,2019,"March 1, 2019 (United States)",4.5,4000.0,Tyler Perry,Tyler Perry,Tyler Perry,United States,74747725.0,The Tyler Perry Company,109.0
7403,Onward,PG,Animation,2020,"March 6, 2020 (United States)",7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,141950608.0,Walt Disney Pictures,102.0


In [9]:
# Generate our categorical variable list
movies_cat = movies_df.dtypes[movies_df.dtypes == "object"].index.tolist()

# Remove name from categorical list
movies_cat.pop(0)
movies_cat

['rating',
 'genre',
 'released',
 'director',
 'writer',
 'star',
 'country',
 'company']

In [10]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(movies_df[movies_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(movies_cat)
encode_df.head()



Unnamed: 0,rating_G,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_Unrated,genre_Action,genre_Adventure,genre_Animation,...,company_Walt Disney Productions,company_Warner Bros.,company_Warner Bros. Family Entertainment,company_Warner Independent Pictures (WIP),company_Weintraub Entertainment Group,company_Why Not Productions,company_Worldview Entertainment,company_Yellow Bird,company_Zentropa Entertainments,company_Zoetrope Studios
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Merge one-hot encoded features and drop the originals
movies_df = movies_df.merge(encode_df,left_index=True, right_index=True)
movies_df = movies_df.drop(movies_cat,1)
movies_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,year,score,votes,gross,runtime,rating_G,rating_NC-17,rating_Not Rated,rating_PG,...,company_Walt Disney Productions,company_Warner Bros.,company_Warner Bros. Family Entertainment,company_Warner Independent Pictures (WIP),company_Weintraub Entertainment Group,company_Why Not Productions,company_Worldview Entertainment,company_Yellow Bird,company_Zentropa Entertainments,company_Zoetrope Studios
0,The Shining,1980,8.4,927000.0,46998772.0,146.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Airplane!,1980,7.7,221000.0,83453539.0,88.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Caddyshack,1980,7.3,108000.0,39846344.0,98.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Friday the 13th,1980,6.4,123000.0,39754601.0,95.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,The Blues Brothers,1980,7.9,188000.0,115229890.0,133.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Create target and features data
X = movies_df.drop(columns=['score','name'])
y = movies_df['score']

In [13]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Create a random forest regression model
rf_model = RandomForestRegressor(n_estimators=128, random_state=42)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model (r2 score)
y_pred = rf_model.predict(X_test_scaled)
rf_model.score(X_test_scaled, y_test)

0.3977529206822399

In [15]:
# Determine RMSE (root mean square error)
mse = mean_squared_error(y_test, y_pred, squared=False)
rmse = sqrt(mse)
rmse

0.8081597838093846

In [16]:
# Determine the MAE (mean absolute error)
mae = mean_absolute_error(y_test,y_pred)
mae

0.44963745915032693