In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import tensorflow as tf

In [2]:
# Load the movies dataset from 'movies_final.csv' into a DataFrame
movies_df = pd.read_csv('movies_final.csv')
# Drop every column whose name starts with 'Unnamed'
# (presumably an index column written out by an earlier to_csv -- confirm)
movies_df = movies_df.loc[:, ~movies_df.columns.str.startswith('Unnamed')]
# Only the last expression of a cell is displayed, so show the frame once here
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7407,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.8,42000.0,Chris Sanders,Michael Green,Harrison Ford,Canada,111105497.0,20th Century Studios,100.0
7408,The Eight Hundred,Not Rated,Action,2020,"August 28, 2020 (United States)",6.8,3700.0,Hu Guan,Hu Guan,Zhi-zhong Huang,China,461421559.0,Beijing Diqi Yinxiang Entertainment,149.0
7409,The Quarry,R,Crime,2020,"April 17, 2020 (Mexico)",5.4,2400.0,Scott Teems,Scott Teems,Shea Whigham,United States,3661.0,Prowess Pictures,98.0
7410,Tulsa,PG-13,Comedy,2020,"June 3, 2020 (United States)",5.0,294.0,Scott Pryor,Scott Pryor,Scott Pryor,United States,413378.0,Pryor Entertainment,120.0


In [3]:
# Drop the non-beneficial columns: 'name', 'released', 'gross'.
# `columns=` already targets the column axis, so no extra `axis` argument is passed.
movies_df = movies_df.drop(columns=['name', 'released', 'gross'])
movies_df.head()

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,Orion Pictures,98.0


In [4]:
# Count how many movies fall under each genre to decide whether binning is needed
genres = movies_df.loc[:, 'genre'].value_counts()
genres

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Western         3
Music           1
Sport           1
Musical         1
Name: genre, dtype: int64

In [5]:
# Bin genres with fewer than MIN_GENRE_COUNT movies into 'Other'
MIN_GENRE_COUNT = 10
replace_genre = genres.index[genres < MIN_GENRE_COUNT].tolist()

# Replace all rare genres in one vectorized pass instead of one pass per genre
movies_df['genre'] = movies_df['genre'].replace(replace_genre, 'Other')

# Check to see if binning was successful
movies_df['genre'].value_counts()

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Other          22
Mystery        20
Thriller       12
Family         10
Name: genre, dtype: int64

In [6]:
# Drop rows whose director appears exactly once in the current frame
films_per_director = movies_df['director'].value_counts()
keep_director_rows = movies_df['director'].map(films_per_director).ne(1)
movies_df = movies_df.loc[keep_director_rows]
movies_df

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7402,R,Drama,2020,7.1,186000.0,Leigh Whannell,Leigh Whannell,Elisabeth Moss,Canada,Universal Pictures,124.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [7]:
# Drop rows whose star appears exactly once in the current frame
films_per_star = movies_df['star'].value_counts()
keep_star_rows = movies_df['star'].map(films_per_star).ne(1)
movies_df = movies_df.loc[keep_star_rows]
movies_df

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
1,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,Columbia Pictures,104.0
2,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,Lucasfilm,124.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
4,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...
7395,PG-13,Adventure,2019,5.6,336.0,Mitch Davis,Mitch Davis,Christopher Gorham,United States,Kolipoki Pictures,117.0
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [8]:
# Drop rows whose writer appears exactly once in the current frame
films_per_writer = movies_df['writer'].value_counts()
keep_writer_rows = movies_df['writer'].map(films_per_writer).ne(1)
movies_df = movies_df.loc[keep_writer_rows]
movies_df

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
5,R,Horror,1980,6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,Paramount Pictures,95.0
6,R,Action,1980,7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,Universal Pictures,133.0
8,PG,Action,1980,6.8,101000.0,Richard Lester,Jerry Siegel,Gene Hackman,United States,Dovemead Films,127.0
...,...,...,...,...,...,...,...,...,...,...,...
7395,PG-13,Adventure,2019,5.6,336.0,Mitch Davis,Mitch Davis,Christopher Gorham,United States,Kolipoki Pictures,117.0
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [9]:
# Drop rows whose company appears exactly once in the current frame
films_per_company = movies_df['company'].value_counts()
keep_company_rows = movies_df['company'].map(films_per_company).ne(1)
movies_df = movies_df.loc[keep_company_rows]
movies_df

Unnamed: 0,rating,genre,year,score,votes,director,writer,star,country,company,runtime
0,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,Warner Bros.,146.0
3,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,Paramount Pictures,88.0
5,R,Horror,1980,6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,Paramount Pictures,95.0
6,R,Action,1980,7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,Universal Pictures,133.0
8,PG,Action,1980,6.8,101000.0,Richard Lester,Jerry Siegel,Gene Hackman,United States,Dovemead Films,127.0
...,...,...,...,...,...,...,...,...,...,...,...
7352,PG-13,Comedy,2019,4.5,4000.0,Tyler Perry,Tyler Perry,Tyler Perry,United States,The Tyler Perry Company,109.0
7399,PG-13,Action,2020,7.4,387000.0,Christopher Nolan,Christopher Nolan,John David Washington,United States,Warner Bros.,150.0
7400,PG-13,Action,2020,5.4,217000.0,Patty Jenkins,Patty Jenkins,Gal Gadot,United States,Atlas Entertainment,151.0
7403,PG,Animation,2020,7.4,120000.0,Dan Scanlon,Dan Scanlon,Tom Holland,United States,Walt Disney Pictures,102.0


In [None]:
# Generate our categorical variable lists