In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Load cleaned_movies into DataFrame
movies_df = pd.read_csv('movies_final.csv')
movies_df.head()

Unnamed: 0.1,Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0


Preprocessing

In [3]:
# Drop Unnamed column
movies_df = movies_df.loc[:,~movies_df.columns.str.contains('^Unnamed')]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7407,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.8,42000.0,Chris Sanders,Michael Green,Harrison Ford,Canada,111105497.0,20th Century Studios,100.0
7408,The Eight Hundred,Not Rated,Action,2020,"August 28, 2020 (United States)",6.8,3700.0,Hu Guan,Hu Guan,Zhi-zhong Huang,China,461421559.0,Beijing Diqi Yinxiang Entertainment,149.0
7409,The Quarry,R,Crime,2020,"April 17, 2020 (Mexico)",5.4,2400.0,Scott Teems,Scott Teems,Shea Whigham,United States,3661.0,Prowess Pictures,98.0
7410,Tulsa,PG-13,Comedy,2020,"June 3, 2020 (United States)",5.0,294.0,Scott Pryor,Scott Pryor,Scott Pryor,United States,413378.0,Pryor Entertainment,120.0


In [4]:
# Drop name, relase date, writer and gross revenue columns
movies_df = movies_df.drop(columns=['name','released','gross','writer'], axis=1)
movies_df.head()

Unnamed: 0,rating,genre,year,score,votes,director,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Chevy Chase,United States,Orion Pictures,98.0


In [5]:
# Determine if year needs binning
years = movies_df['year'].value_counts()
years

2009    200
2002    200
2014    200
2013    200
2005    200
2018    199
2006    199
2007    199
2008    199
2011    199
2012    199
2016    199
2017    199
2004    199
1994    198
1996    198
2010    197
2001    197
2003    197
1995    196
1997    196
2015    196
1991    195
1993    195
1998    194
2000    193
1999    193
1988    191
1992    189
1990    189
1989    186
1987    186
1986    182
2019    180
1985    177
1984    155
1983    127
1982    118
1981    103
1980     80
2020     13
Name: year, dtype: int64

In [6]:
# Determine if genre needs binning
genres = movies_df['genre'].value_counts()
genres

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Western         3
Music           1
Sport           1
Musical         1
Name: genre, dtype: int64

In [7]:
# Bin genres with less than 5 movies into 'Other'
replace_genre = list(genres[genres < 5].index)

# Replace in DataFrame
for genre in replace_genre:
    movies_df['genre'] = movies_df['genre'].replace(genre,'Other')

# Check to see if binning was successful
movies_df['genre'].value_counts()

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Other           6
Name: genre, dtype: int64

In [8]:
# Determine which directors need to be removed (directors with only 1 movie)
directors = movies_df['director'].value_counts()
directors

Woody Allen         38
Clint Eastwood      31
Steven Spielberg    27
Directors           25
Ron Howard          24
                    ..
Daniel Myrick        1
Tony Cinciripini     1
Rodney Gibbons       1
Nicole Garcia        1
Kevin Shulman        1
Name: director, Length: 2816, dtype: int64

In [9]:
# Remove directors with only 1 film
movies_df = movies_df[movies_df['director'].map(movies_df['director'].value_counts()) != 1]
movies_df

Unnamed: 0,rating,genre,year,score,votes,director,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Chevy Chase,United States,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7402,R,Drama,2020,7.1,186000.0,Leigh Whannell,Elisabeth Moss,Canada,Universal Pictures,124.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [10]:
# Check to make sure proper directors were removed
movies_df['director'].value_counts()

Woody Allen         38
Clint Eastwood      31
Steven Spielberg    27
Directors           25
Ron Howard          24
                    ..
Tom Hanks            2
David Hogan          2
Steve Buscemi        2
Matthew Bright       2
Adam Robitel         2
Name: director, Length: 1353, dtype: int64

In [11]:
# Determine which stars need to be removed
stars = movies_df['star'].value_counts()
stars

Tom Hanks                39
Nicolas Cage             39
Robert De Niro           38
Denzel Washington        37
Tom Cruise               33
                         ..
Clarence Williams III     1
Josh Hamilton             1
Hal Scardino              1
Heather Matarazzo         1
Elisabeth Moss            1
Name: star, Length: 2058, dtype: int64

In [12]:
# Remove stars with only 1 film
movies_df = movies_df[movies_df['star'].map(movies_df['star'].value_counts()) != 1]
movies_df

Unnamed: 0,rating,genre,year,score,votes,director,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Chevy Chase,United States,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...
7395,PG-13,Adventure,2019,5.6,336.0,Mitch Davis,Christopher Gorham,United States,Kolipoki Pictures,117.0
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [13]:
# Check to make sure proper stars were removed
movies_df['star'].value_counts()

Tom Hanks            39
Nicolas Cage         39
Robert De Niro       38
Denzel Washington    37
Bruce Willis         33
                     ..
Lili Taylor           2
Fanny Ardant          2
Billy Zane            2
Vincent Cassel        2
Claire Foy            2
Name: star, Length: 834, dtype: int64

In [14]:
# Remove companies with only 1 film
movies_df = movies_df[movies_df['company'].map(movies_df['company'].value_counts()) != 1]
# Check to make sure proper companies were removed
movies_df['company'].value_counts()

Universal Pictures                   297
Columbia Pictures                    274
Warner Bros.                         270
Paramount Pictures                   254
Twentieth Century Fox                192
                                    ... 
Nimbus Film Productions                2
Intermedia Films                       2
X-Filme Creative Pool                  2
Initial Entertainment Group (IEG)      2
Entertainment One                      2
Name: company, Length: 395, dtype: int64

In [15]:
# Determine value_counts() for scores
movies_df['score'].value_counts()

6.4    217
6.6    199
6.2    197
6.3    179
6.5    178
      ... 
9.3      1
3.1      1
3.6      1
3.4      1
3.2      1
Name: score, Length: 65, dtype: int64

In [16]:
# Remove scores with only 1 value
movies_df = movies_df[movies_df['score'].map(movies_df['score'].value_counts()) != 1]
# Check to make sure proper scores were removed
movies_df['score'].value_counts()

6.4    217
6.6    199
6.2    197
6.3    179
6.5    178
6.7    173
6.1    160
7.0    158
7.3    158
6.8    155
7.2    151
6.9    150
7.1    126
5.8    123
6.0    117
5.9    116
5.6    107
5.7    106
7.5    102
7.4     90
5.5     85
7.7     83
5.4     77
7.6     75
7.8     61
5.3     54
8.1     45
5.2     45
5.1     43
8.0     42
4.9     35
7.9     33
5.0     31
4.7     20
8.2     17
4.5     17
4.8     16
8.3     12
4.6     12
8.5     11
4.4     11
3.8     10
8.4      9
4.1      7
4.3      7
4.2      6
8.6      5
8.7      4
4.0      4
8.8      4
3.9      3
3.7      3
8.9      3
2.5      3
2.2      2
3.0      2
Name: score, dtype: int64

Machine Learning Model

In [17]:
# Split the data into features and target
# Create our features
X = pd.get_dummies(movies_df, columns=['rating','genre','year','director',
                                                'star','country','company']).drop('score',axis=1)

# Create our target
y = movies_df['score']

In [18]:
# View X
X

Unnamed: 0,votes,runtime,rating_G,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_Unrated,genre_Action,...,company_Weintraub Entertainment Group,company_Why Not Productions,company_Wild Bunch,company_Working Title Films,company_Worldview Entertainment,company_X-Filme Creative Pool,company_Yellow Bird,company_Zenith Entertainment,company_Zentropa Entertainments,company_Zoetrope Studios
0,927000.0,146.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,65000.0,104.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200000.0,124.0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,221000.0,88.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,108000.0,98.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7352,4000.0,109.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7399,387000.0,150.0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7400,217000.0,151.0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7403,120000.0,102.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Check balance of target values
y.value_counts()

6.4    217
6.6    199
6.2    197
6.3    179
6.5    178
6.7    173
6.1    160
7.0    158
7.3    158
6.8    155
7.2    151
6.9    150
7.1    126
5.8    123
6.0    117
5.9    116
5.6    107
5.7    106
7.5    102
7.4     90
5.5     85
7.7     83
5.4     77
7.6     75
7.8     61
5.3     54
8.1     45
5.2     45
5.1     43
8.0     42
4.9     35
7.9     33
5.0     31
4.7     20
8.2     17
4.5     17
4.8     16
8.3     12
4.6     12
8.5     11
4.4     11
3.8     10
8.4      9
4.1      7
4.3      7
4.2      6
8.6      5
8.7      4
4.0      4
8.8      4
3.9      3
3.7      3
8.9      3
2.5      3
2.2      2
3.0      2
Name: score, dtype: int64

In [20]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [21]:
# Train the data into Linear Regression algorithm
# Create an instance
lr = LinearRegression()
# Fit into training data
lr.fit(X_train,y_train)

LinearRegression()

In [22]:
# Retrieve the intercept
print(lr.intercept_)

-362326.88866901497


In [23]:
# Retrieve the coefficient(s)
print(lr.coef_)

[ 1.68976795e-06  8.33837818e-03  3.30491533e+05 ... -3.68096422e+01
 -6.78059746e+02 -3.59877722e+01]


In [24]:
# How many coefficients?
print(len(lr.coef_))

2446


In [25]:
# Make predictions
y_pred = lr.predict(X_test)

In [26]:
# Determine RMSE (root mean square error)
mse = mean_squared_error(y_test, y_pred, squared=False)
rmse = sqrt(mse)
rmse

33.81161913307532

In [27]:
# Determine the MAE (mean absolute error)
mae = mean_absolute_error(y_test,y_pred)
mae

289.7495175292129

In [28]:
# Determine the r2_score
r2_score = r2_score(y_test,y_pred)
r2_score

-1799405.3339829624