In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense

Preprocessing

In [2]:
# Load cleaned_movies into DataFrame
movies_df = pd.read_csv('movies_final.csv')
movies_df.head()
# Drop Unnamed column
movies_df = movies_df.loc[:,~movies_df.columns.str.contains('^Unnamed')]
movies_df

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7407,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.8,42000.0,Chris Sanders,Michael Green,Harrison Ford,Canada,111105497.0,20th Century Studios,100.0
7408,The Eight Hundred,Not Rated,Action,2020,"August 28, 2020 (United States)",6.8,3700.0,Hu Guan,Hu Guan,Zhi-zhong Huang,China,461421559.0,Beijing Diqi Yinxiang Entertainment,149.0
7409,The Quarry,R,Crime,2020,"April 17, 2020 (Mexico)",5.4,2400.0,Scott Teems,Scott Teems,Shea Whigham,United States,3661.0,Prowess Pictures,98.0
7410,Tulsa,PG-13,Comedy,2020,"June 3, 2020 (United States)",5.0,294.0,Scott Pryor,Scott Pryor,Scott Pryor,United States,413378.0,Pryor Entertainment,120.0


In [3]:
# Drop the non-beneficial columns: 'released'
movies_df = movies_df.drop(columns=['released'], axis=1)
movies_df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,gross,company,runtime
0,The Shining,R,Drama,1980,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,39846344.0,Orion Pictures,98.0


In [4]:
# Determine if genre needs binning
genres = movies_df['genre'].value_counts()
genres

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Mystery        20
Thriller       12
Family         10
Romance         8
Sci-Fi          8
Western         3
Music           1
Sport           1
Musical         1
Name: genre, dtype: int64

In [5]:
# Bin genres with less than 10 movies into 'Other'
replace_genre = list(genres[genres < 10].index)

# Replace in DataFrame
for genre in replace_genre:
    movies_df['genre'] = movies_df['genre'].replace(genre,'Other')

# Check to see if binning was successful
movies_df['genre'].value_counts()

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Other          22
Mystery        20
Thriller       12
Family         10
Name: genre, dtype: int64

In [6]:
# Remove directors with only 1 film
#movies_df = movies_df[movies_df['director'].map(movies_df['director'].value_counts()) != 1]
#movies_df

In [7]:
# Remove stars with only 1 film
#movies_df = movies_df[movies_df['star'].map(movies_df['star'].value_counts()) != 1]
#movies_df

In [8]:
# Remove writers with only 1 film
#movies_df = movies_df[movies_df['writer'].map(movies_df['writer'].value_counts()) != 1]
#movies_df

In [9]:
# Remove companies with only 1 film
#movies_df = movies_df[movies_df['company'].map(movies_df['company'].value_counts()) != 1]
#movies_df

In [10]:
# Bin genres with less than 5 movies into 'Other'
genres = movies_df['genre'].value_counts()
replace_genre = list(genres[genres < 5].index)

# Replace in DataFrame
for genre in replace_genre:
    movies_df['genre'] = movies_df['genre'].replace(genre,'Other')

# Check to see if binning was successful
movies_df['genre'].value_counts()

Comedy       2182
Action       1666
Drama        1439
Crime         536
Biography     429
Adventure     419
Animation     331
Horror        304
Fantasy        42
Other          22
Mystery        20
Thriller       12
Family         10
Name: genre, dtype: int64

In [11]:
# Bin directors with less than 1 movie into 'Other'
directors = movies_df['director'].value_counts()
replace_director = list(directors[directors == 1].index)

# Replace in DataFrame
for director in replace_director:
    movies_df['director'] = movies_df['director'].replace(director,'Other')

# Check to see if binning was successful
movies_df['director'].value_counts()

Other               1463
Woody Allen           38
Clint Eastwood        31
Steven Spielberg      27
Directors             25
                    ... 
Tom Hanks              2
David Hogan            2
Steve Buscemi          2
Matthew Bright         2
Adam Robitel           2
Name: director, Length: 1354, dtype: int64

In [12]:
# Bin stars with less than 1 movie into 'Other'
stars = movies_df['star'].value_counts()
replace_star = list(stars[stars == 1].index)

# Replace in DataFrame
for star in replace_star:
    movies_df['star'] = movies_df['star'].replace(star,'Other')

# Check to see if binning was successful
movies_df['star'].value_counts()

Other                1634
Nicolas Cage           43
Tom Hanks              41
Robert De Niro         41
Denzel Washington      37
                     ... 
Shaquille O'Neal        2
Scott Mechlowicz        2
Kelsey Grammer          2
Joseph Fiennes          2
Martha Higareda         2
Name: star, Length: 1019, dtype: int64

In [13]:
# Bin companies with less than 1 movie into 'Other'
companies = movies_df['company'].value_counts()
replace_company = list(companies[companies == 1].index)

# Replace in DataFrame
for company in replace_company:
    movies_df['company'] = movies_df['company'].replace(company,'Other')

# Check to see if binning was successful
movies_df['company'].value_counts()

Other                  1577
Universal Pictures      376
Warner Bros.            332
Columbia Pictures       332
Paramount Pictures      319
                       ... 
Tribeca Productions       2
Jim Henson Pictures       2
Dogstar Films             2
Mutual Film Company       2
Neverending Media         2
Name: company, Length: 666, dtype: int64

In [14]:
# Generate out categorical variable lists
movies_cat = movies_df.dtypes[movies_df.dtypes=='object'].index.tolist()
movies_cat.pop(0)
movies_cat

['rating', 'genre', 'director', 'writer', 'star', 'country', 'company']

In [15]:
# Create a OneHotEncoder instance
ohe = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_df = pd.DataFrame(ohe.fit_transform(movies_df[movies_cat]))

# Add the encoded variable names to the dataframe
encoded_df.columns = ohe.get_feature_names(movies_cat)
encoded_df



Unnamed: 0,rating_Approved,rating_G,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_TV-14,rating_TV-MA,rating_TV-PG,...,company_WingNut Films,company_Working Dog,company_Working Title Films,company_Worldview Entertainment,company_X-Filme Creative Pool,company_Yash Raj Films,company_Yellow Bird,company_Zenith Entertainment,company_Zentropa Entertainments,company_Zoetrope Studios
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7407,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7408,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7409,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7410,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Merge one-hot encoded features and drop the originals
movies_df = movies_df.merge(encoded_df, left_index = True, right_index=True)
movies_df = movies_df.drop(columns=movies_cat)
movies_df

Unnamed: 0,name,year,score,votes,gross,runtime,rating_Approved,rating_G,rating_NC-17,rating_Not Rated,...,company_WingNut Films,company_Working Dog,company_Working Title Films,company_Worldview Entertainment,company_X-Filme Creative Pool,company_Yash Raj Films,company_Yellow Bird,company_Zenith Entertainment,company_Zentropa Entertainments,company_Zoetrope Studios
0,The Shining,1980,8.4,927000.0,46998772.0,146.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Blue Lagoon,1980,5.8,65000.0,58853106.0,104.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Star Wars: Episode V - The Empire Strikes Back,1980,8.7,1200000.0,538375067.0,124.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Airplane!,1980,7.7,221000.0,83453539.0,88.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Caddyshack,1980,7.3,108000.0,39846344.0,98.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7407,The Call of the Wild,2020,6.8,42000.0,111105497.0,100.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7408,The Eight Hundred,2020,6.8,3700.0,461421559.0,149.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7409,The Quarry,2020,5.4,2400.0,3661.0,98.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7410,Tulsa,2020,5.0,294.0,413378.0,120.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Compile, Train, and Evaluate Neural Network Model

In [19]:
# Split our preprocessed data into our features and target arrays
X = movies_df.drop(['name','score'],1).values
y = movies_df['score'].values

# Split the preprocessed data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

  


In [20]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train[0])
hidden_nodes_1 = 80
hidden_nodes_2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_1, activation='relu', input_dim=input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_2, activation='relu', input_dim=input_features))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 80)                598400    
_________________________________________________________________
dense_1 (Dense)              (None, 30)                2430      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 600,861
Trainable params: 600,861
Non-trainable params: 0
_________________________________________________________________


2022-03-18 18:02:00.549459: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

2022-03-18 18:02:20.571564: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [24]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

58/58 - 0s - loss: -2.9840e+09 - accuracy: 0.0000e+00
Loss: -2984043264.0, Accuracy: 0.0
