In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Load cleaned_movies into DataFrame
movies_df = pd.read_csv('movies_final.csv')
movies_df.head()

Unnamed: 0.1,Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0


Preprocessing

In [3]:
# Drop Unnamed column
movies_df = movies_df.loc[:,~movies_df.columns.str.contains('^Unnamed')]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7407,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.8,42000.0,Chris Sanders,Michael Green,Harrison Ford,Canada,111105497.0,20th Century Studios,100.0
7408,The Eight Hundred,Not Rated,Action,2020,"August 28, 2020 (United States)",6.8,3700.0,Hu Guan,Hu Guan,Zhi-zhong Huang,China,461421559.0,Beijing Diqi Yinxiang Entertainment,149.0
7409,The Quarry,R,Crime,2020,"April 17, 2020 (Mexico)",5.4,2400.0,Scott Teems,Scott Teems,Shea Whigham,United States,3661.0,Prowess Pictures,98.0
7410,Tulsa,PG-13,Comedy,2020,"June 3, 2020 (United States)",5.0,294.0,Scott Pryor,Scott Pryor,Scott Pryor,United States,413378.0,Pryor Entertainment,120.0


In [4]:
# Drop name, relase date, and gross revenue columns
movies_df = movies_df.drop(columns=['name','released','gross'], axis=1)
movies_df.head()

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,Orion Pictures,98.0


In [5]:
# Determine if year needs binning
years = movies_df['year'].value_counts()
years

2009    200
2002    200
2014    200
2013    200
2005    200
2018    199
2006    199
2007    199
2008    199
2011    199
2012    199
2016    199
2017    199
2004    199
1994    198
1996    198
2010    197
2001    197
2003    197
1995    196
1997    196
2015    196
1991    195
1993    195
1998    194
2000    193
1999    193
1988    191
1992    189
1990    189
1989    186
1987    186
1986    182
2019    180
1985    177
1984    155
1983    127
1982    118
1981    103
1980     80
2020     13
Name: year, dtype: int64

In [6]:
# Determine if genre needs binning
genres = movies_df['genre'].value_counts()
genres

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Western         3
Music           1
Sport           1
Musical         1
Name: genre, dtype: int64

In [7]:
# Bin genres with less than 5 movies into 'Other'
replace_genre = list(genres[genres < 5].index)

# Replace in DataFrame
for genre in replace_genre:
    movies_df['genre'] = movies_df['genre'].replace(genre,'Other')

# Check to see if binning was successful
movies_df['genre'].value_counts()

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Other           6
Name: genre, dtype: int64

In [8]:
# Determine which directors need to be removed (directors with only 1 movie)
directors = movies_df['director'].value_counts()
directors

Woody Allen         38
Clint Eastwood      31
Steven Spielberg    27
Directors           25
Ron Howard          24
                    ..
Daniel Myrick        1
Tony Cinciripini     1
Rodney Gibbons       1
Nicole Garcia        1
Kevin Shulman        1
Name: director, Length: 2816, dtype: int64

In [9]:
# Remove directors with only 1 film
directors_df = movies_df[movies_df['director'].map(movies_df['director'].value_counts()) != 1]
directors_df

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7402,R,Drama,2020,7.1,186000.0,Leigh Whannell,Leigh Whannell,Elisabeth Moss,Canada,Universal Pictures,124.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [10]:
# Check to make sure proper directors were removed
directors_df['director'].value_counts()

Woody Allen         38
Clint Eastwood      31
Steven Spielberg    27
Directors           25
Ron Howard          24
                    ..
Tom Hanks            2
David Hogan          2
Steve Buscemi        2
Matthew Bright       2
Adam Robitel         2
Name: director, Length: 1353, dtype: int64

In [11]:
# Determine which stars need to be removed
stars = directors_df['star'].value_counts()
stars

Tom Hanks                39
Nicolas Cage             39
Robert De Niro           38
Denzel Washington        37
Tom Cruise               33
                         ..
Clarence Williams III     1
Josh Hamilton             1
Hal Scardino              1
Heather Matarazzo         1
Elisabeth Moss            1
Name: star, Length: 2058, dtype: int64

In [12]:
# Remove stars with only 1 film
stars_df = directors_df[directors_df['star'].map(directors_df['star'].value_counts()) != 1]
stars_df

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...
7395,PG-13,Adventure,2019,5.6,336.0,Mitch Davis,Mitch Davis,Christopher Gorham,United States,Kolipoki Pictures,117.0
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [13]:
# Check to make sure proper stars were removed
stars_df['star'].value_counts()

Tom Hanks            39
Nicolas Cage         39
Robert De Niro       38
Denzel Washington    37
Bruce Willis         33
                     ..
Lili Taylor           2
Fanny Ardant          2
Billy Zane            2
Vincent Cassel        2
Claire Foy            2
Name: star, Length: 834, dtype: int64

In [14]:
# Determine which writers need to be removed
writers = stars_df['writer'].value_counts()
writers

Woody Allen         35
John Hughes         22
Luc Besson          20
Stephen King        17
David Mamet         14
                    ..
Eleazar Lipsky       1
Paul Auster          1
Oliver Parker        1
Ian McKellen         1
W. Bruce Cameron     1
Name: writer, Length: 2808, dtype: int64

In [15]:
# Remove writers with only 1 film
writers_df = stars_df[stars_df['writer'].map(stars_df['writer'].value_counts()) != 1]
# Check to make sure proper writers were removed
writers_df['writer'].value_counts()

Woody Allen           35
John Hughes           22
Luc Besson            20
Stephen King          17
David Mamet           14
                      ..
Carol Heikkinen        2
Roger Avary            2
Patrick J. Clifton     2
Robbie Fox             2
Chad St. John          2
Name: writer, Length: 883, dtype: int64

In [16]:
# Remove companies with only 1 film
companies_df = writers_df[writers_df['company'].map(writers_df['company'].value_counts()) != 1]
# Check to make sure proper companies were removed
companies_df['company'].value_counts()

Universal Pictures              204
Warner Bros.                    179
Columbia Pictures               177
Paramount Pictures              168
Twentieth Century Fox           128
                               ... 
Tribeca Productions               2
40 Acres & A Mule Filmworks       2
Village Roadshow Pictures         2
Hawn / Sylbert Movie Company      2
Triumph Films                     2
Name: company, Length: 252, dtype: int64

In [17]:
# Determine value_counts() for scores
companies_df['score'].value_counts()

6.4    131
6.6    125
6.2    117
6.7    102
7.0    102
6.5    101
7.3     95
6.3     94
6.1     90
6.8     90
6.9     88
7.2     84
7.1     80
5.6     66
6.0     61
5.7     60
5.9     58
5.8     58
7.5     58
7.4     57
7.7     55
7.6     52
5.5     50
7.8     46
8.1     33
5.3     31
5.4     28
7.9     28
8.0     25
5.2     23
5.1     21
5.0     18
4.9     17
4.5     14
8.2     11
4.7     10
4.8     10
8.3     10
8.4      8
8.5      8
4.6      7
4.4      6
3.8      5
3.7      3
4.1      3
8.6      3
4.0      3
8.7      3
8.9      2
3.9      2
8.8      2
4.3      2
9.3      1
2.2      1
4.2      1
3.6      1
9.0      1
2.5      1
3.5      1
Name: score, dtype: int64

In [18]:
# Remove scores with only 1 value
scores_df = companies_df[companies_df['score'].map(companies_df['score'].value_counts()) != 1]
# Check to make sure proper scores were removed
scores_df['score'].value_counts()

6.4    131
6.6    125
6.2    117
6.7    102
7.0    102
6.5    101
7.3     95
6.3     94
6.1     90
6.8     90
6.9     88
7.2     84
7.1     80
5.6     66
6.0     61
5.7     60
5.9     58
5.8     58
7.5     58
7.4     57
7.7     55
7.6     52
5.5     50
7.8     46
8.1     33
5.3     31
5.4     28
7.9     28
8.0     25
5.2     23
5.1     21
5.0     18
4.9     17
4.5     14
8.2     11
4.7     10
4.8     10
8.3     10
8.4      8
8.5      8
4.6      7
4.4      6
3.8      5
3.7      3
8.7      3
4.1      3
4.0      3
8.6      3
3.9      2
8.9      2
8.8      2
4.3      2
Name: score, dtype: int64

In [19]:
# Create final DataFrame for ML model
processed_movies_df = scores_df
processed_movies_df

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
5,R,Horror,1980,6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,Paramount Pictures,95.0
6,R,Action,1980,7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,Universal Pictures,133.0
8,PG,Action,1980,6.8,101000.0,Richard Lester,Jerry Siegel,Gene Hackman,United States,Dovemead Films,127.0
...,...,...,...,...,...,...,...,...,...,...,...
7352,PG-13,Comedy,2019,4.5,4000.0,Tyler Perry,Tyler Perry,Tyler Perry,United States,The Tyler Perry Company,109.0
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


Machine Learning Model

In [20]:
# Split the data into features and target
# Create our features
X = pd.get_dummies(processed_movies_df, columns=['rating','genre','year','director','writer',
                                                'star','country','company']).drop('score',axis=1)

# Create our target
y = processed_movies_df['score']

In [21]:
# View X
X

Unnamed: 0,votes,runtime,rating_G,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_Unrated,genre_Action,...,company_Walt Disney Productions,company_Warner Bros.,company_Warner Bros. Family Entertainment,company_Warner Independent Pictures (WIP),company_Weintraub Entertainment Group,company_Why Not Productions,company_Worldview Entertainment,company_Yellow Bird,company_Zentropa Entertainments,company_Zoetrope Studios
0,927000.0,146.0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,221000.0,88.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,123000.0,95.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,188000.0,133.0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
8,101000.0,127.0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7352,4000.0,109.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7399,387000.0,150.0,0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
7400,217000.0,151.0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7403,120000.0,102.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Check balance of target values
y.value_counts()

6.4    131
6.6    125
6.2    117
6.7    102
7.0    102
6.5    101
7.3     95
6.3     94
6.1     90
6.8     90
6.9     88
7.2     84
7.1     80
5.6     66
6.0     61
5.7     60
5.9     58
5.8     58
7.5     58
7.4     57
7.7     55
7.6     52
5.5     50
7.8     46
8.1     33
5.3     31
5.4     28
7.9     28
8.0     25
5.2     23
5.1     21
5.0     18
4.9     17
4.5     14
8.2     11
4.7     10
4.8     10
8.3     10
8.4      8
8.5      8
4.6      7
4.4      6
3.8      5
3.7      3
8.7      3
4.1      3
4.0      3
8.6      3
3.9      2
8.9      2
8.8      2
4.3      2
Name: score, dtype: int64

In [23]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [24]:
# Train the data into Linear Regression algorithm
# Create an instance
lr = LinearRegression()
# Fit into training data
lr.fit(X_train,y_train)

LinearRegression()

In [25]:
# Retrieve the intercept
print(lr.intercept_)

4.6488787589809455


In [26]:
# Retrieve the coefficient(s)
print(lr.coef_)

[ 1.63564427e-06  1.68218868e-02 -9.58496747e-01 ...  9.93708367e-02
  8.66622120e-02  1.48193755e-01]


In [27]:
# How many coefficients?
print(len(lr.coef_))

2707


In [28]:
# Make predictions
y_pred = lr.predict(X_test)

In [29]:
# Determine RMSE (root mean square error)
mse = mean_squared_error(y_test, y_pred, squared=False)
rmse = sqrt(mse)
rmse

1.2679086214213404

In [30]:
# Determine the MAE (mean absolute error)
mae = mean_absolute_error(y_test,y_pred)
mae

1.254859906699954

In [31]:
# Determine the r2_score
r2_score = r2_score(y_test,y_pred)
r2_score

-2.5538311338304123