In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error

In [2]:
# Load cleaned_movies dataset into a data frame.
movies_df = pd.read_csv('movies_final.csv') 
movies_df.head()

Unnamed: 0.1,Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0


In [3]:
# Determine dtypes
movies_df.dtypes

Unnamed: 0      int64
name           object
rating         object
genre          object
year            int64
released       object
score         float64
votes         float64
director       object
writer         object
star           object
country        object
gross         float64
company        object
runtime       float64
dtype: object

In [4]:
# Determine if year needs binning
release_year = movies_df['year'].value_counts()
release_year

2009    200
2002    200
2014    200
2013    200
2005    200
2018    199
2006    199
2007    199
2008    199
2011    199
2012    199
2016    199
2017    199
2004    199
1994    198
1996    198
2010    197
2001    197
2003    197
1995    196
1997    196
2015    196
1991    195
1993    195
1998    194
2000    193
1999    193
1988    191
1992    189
1990    189
1989    186
1987    186
1986    182
2019    180
1985    177
1984    155
1983    127
1982    118
1981    103
1980     80
2020     13
Name: year, dtype: int64

In [5]:
# Determine if genre needs binning
genres = movies_df['genre'].value_counts()
genres

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Western         3
Music           1
Sport           1
Musical         1
Name: genre, dtype: int64

In [7]:
# Bin genres with less than 5 movies into 'Other'
replace_genre = list(genres[genres < 5].index)

# Replace in DataFrame
for genre in replace_genre:
    movies_df['genre'] = movies_df['genre'].replace(genre,'Other')
    
# Check to make sure binning was successful
movies_df['genre'].value_counts()

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Other           6
Name: genre, dtype: int64

In [7]:
# Determine number of directors
# bottom 1000 only have 1
# bottom 1500 starts at 2
# bottom 2000 starts at 3
# bottom 2500 starts at 5
directors = movies_df['director'].value_counts()
director_count = pd.DataFrame(directors)
director_count.tail(3000)

Unnamed: 0,director
Woody Allen,38
Clint Eastwood,31
Steven Spielberg,27
Directors,25
Ron Howard,24
...,...
Daniel Myrick,1
Tony Cinciripini,1
Rodney Gibbons,1
Nicole Garcia,1


In [17]:
# Get movies
movies_director = movies_df['director'].value_counts() == 1
delete_directors = []
df1 = movies_df[movies_df['director'].map(movies_df['director'].value_counts()) != 1]
df1

Unnamed: 0.1,Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7399,7643,Tenet,PG-13,Action,2020,"September 3, 2020 (United States)",7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,363656624.0,Warner Bros.,150.0
7400,7644,Wonder Woman 1984,PG-13,Action,2020,"December 25, 2020 (United States)",5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,166534027.0,Atlas Entertainment,151.0
7402,7646,The Invisible Man,R,Drama,2020,"February 28, 2020 (United States)",7.1,186000.0,Leigh Whannell,Leigh Whannell,Elisabeth Moss,Canada,143151000.0,Universal Pictures,124.0
7403,7647,Onward,PG,Animation,2020,"March 6, 2020 (United States)",7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,141950608.0,Walt Disney Pictures,102.0


In [18]:
df1['director'].value_counts()

Woody Allen         38
Clint Eastwood      31
Steven Spielberg    27
Directors           25
Ron Howard          24
                    ..
Tom Hanks            2
David Hogan          2
Steve Buscemi        2
Matthew Bright       2
Adam Robitel         2
Name: director, Length: 1353, dtype: int64

In [64]:
# Determine number of starring actors/actresses
# bottom 1500 only have 1
# bottom 2000 starts at 3
# bottom 2250 starts at 4
# bottom 2500 starts at 10
stars = movies['star'].value_counts()
star_count = pd.DataFrame(stars)
star_count.tail(2250)

Unnamed: 0,star
Jonah Hill,4
Jim Varney,4
Taraji P. Henson,4
Bruce Campbell,4
Christopher Reid,4
...,...
Teri Hatcher,1
Pamela Anderson,1
Mike Judge,1
Kim Bodnia,1


In [65]:
# Split the data into features and target
# Create our features
X = pd.get_dummies(movies, columns=['rating', 'genre', 'year', 'director', 
                'writer', 'star', 'country', 'company']).drop('score', axis=1)


# Create our target
y = movies['score']

In [66]:
#
X.describe()

Unnamed: 0.1,Unnamed: 0,votes,gross,runtime,rating_Approved,rating_G,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,...,company_Zoetrope Studios,company_Zucker Brothers Productions,company_Zupnik-Curtis Enterprises,company_double A Films,company_erbp,company_i am OTHER,company_i5 Films,company_iDeal Partners Film Fund,company_micro_scope,company_thefyzz
count,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,...,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0,7412.0
mean,3878.402995,90763.57,78993550.0,107.446438,0.000135,0.020507,0.003103,0.034808,0.164598,0.28184,...,0.000944,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135,0.000135
std,2190.832442,165346.6,166216400.0,18.517471,0.011615,0.141737,0.055623,0.183307,0.370842,0.449926,...,0.030719,0.011615,0.011615,0.011615,0.011615,0.011615,0.011615,0.011615,0.011615,0.011615
min,0.0,105.0,309.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2004.75,10000.0,4633888.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3908.5,34000.0,20403520.0,104.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5772.25,96000.0,76442310.0,116.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7659.0,2400000.0,2847246000.0,366.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [67]:
# Check balance of target values
y.value_counts()

6.6    348
6.4    344
6.2    339
6.5    326
6.7    318
      ... 
2.1      2
9.3      1
2.3      1
2.6      1
9.0      1
Name: score, Length: 72, dtype: int64

In [68]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.