In [50]:
import ast
import json
import os
import re

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Load the preprocessed movie table.
# The location can be overridden through the MOVIES_CSV environment variable so
# the notebook is runnable on machines other than the one it was written on;
# when the variable is unset the original path is used unchanged.
MOVIES_CSV = os.environ.get(
    "MOVIES_CSV",
    "C:\\Users\\ashri\\CPSC 8740\\preprocessed_movies.csv",
)
movies = pd.read_csv(MOVIES_CSV)

In [4]:
# Preview the first rows of the loaded table as a sanity check.
movies.head()

Unnamed: 0,Title,IMDb Rating,Year,Certificates,Director,Star Cast,Duration (minutes),Star Cast Lower,Star Cast List,Star Cast List Clean,...,Horror,Music,Musical,Mystery,Reality-TV,Romance,Sci-Fi,Sport,Thriller,Western
0,rebel moon - part two: the scargiver,0.232143,0.990741,PG-13,zack snyder,"Zack Snyder, Kurt Johnstad, Shay Hatten",0.280443,"zack snyder, kurt johnstad, shay hatten","['zack snyder', 'kurt johnstad', 'shay hatten']","['zack snyder', 'kurt johnstad', 'shay hatten']",...,0,0,0,0,0,0,0,0,0,0
1,borderlands,0.571429,0.990741,R,eli roth,"Eli Roth, Joe Crombie",0.25941,"eli roth, joe crombie","['eli roth', 'joe crombie']","['eli roth', 'joe crombie']",...,0,0,0,0,0,0,0,0,0,0
2,jurassic park,0.767857,0.703704,PG-13,steven spielberg,"Michael Crichton, David Koepp",0.298893,"michael crichton, david koepp","['michael crichton', 'david koepp']","['michael crichton', 'david koepp']",...,0,0,0,0,0,0,0,0,0,0
3,the fifth element,0.660714,0.740741,PG-13,luc besson,"Luc Besson, Robert Mark Kamen",0.295203,"luc besson, robert mark kamen","['luc besson', 'robert mark kamen']","['luc besson', 'robert mark kamen']",...,0,0,0,0,0,0,0,0,0,0
4,inside out,0.75,0.907407,PG,pete docter,"Pete Docter, Ronnie Del Carmen, Meg Le, Fauve",0.180812,"pete docter, ronnie del carmen, meg le, fauve","['pete docter', 'ronnie del carmen', 'meg le',...","['pete docter', 'ronnie del carmen', 'meg le',...",...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Director -> integer id.
# The fitted encoder is written to disk so the same name-to-id mapping can be
# reloaded later.
label_encoder = LabelEncoder()
director_ids = label_encoder.fit_transform(movies['Director'])
movies['Director_Encoded'] = director_ids
joblib.dump(label_encoder, 'director_encoder.pkl')

['director_encoder.pkl']

In [6]:
# Vocabulary size check for 'Title': join every title into one string,
# lowercase it, split it into word tokens and count the distinct ones.
# The count is used to choose the tokenizer's vocabulary limit.
all_titles = ' '.join(movies['Title'].astype(str))
word_pattern = re.compile(r'\b\w+\b')
words = word_pattern.findall(all_titles.lower())
unique_words = set(words)
num_unique_words = len(unique_words)
print(f"Number of Unique words in 'Title' {num_unique_words}")

Number of Unique words in 'Title' 4216


In [7]:
# Tokenize and pad the titles.
# The titles contain 4216 distinct words, so the vocabulary is capped at 5000.
# `num_words` only limits how many *known* words are kept; it does not handle
# words that were never seen during fitting. Without `oov_token` the Tokenizer
# silently drops such words, so an explicit out-of-vocabulary token is reserved
# to keep unseen title words represented at prediction time.
movies = movies.dropna(subset=['Title'])
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(movies['Title'])
sequences = tokenizer.texts_to_sequences(movies['Title'])
# Pad / truncate every title to exactly 20 token ids.
padded_sequences = pad_sequences(sequences, maxlen=20)
# Persist the fitted tokenizer so prediction code uses the same word index.
joblib.dump(tokenizer, 'title_tokenizer.pkl')

['title_tokenizer.pkl']

In [8]:
# Multi-hot encoding for star cast.
#
# 'Star Cast List Clean' is read back from the CSV as the *string* form of a
# list (e.g. "['zack snyder', 'kurt johnstad']"). Iterating over those strings
# directly would build a vocabulary of single characters and turn the
# membership test into a substring check, so every cell is parsed into a real
# list of names first.
def _parse_cast(value):
    """Return the cast of one movie as a list of names.

    Strings are parsed as Python list literals; real lists/tuples/sets are
    passed through; anything else (e.g. a missing value) yields an empty list.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return []


star_cast_lists = [_parse_cast(value) for value in movies['Star Cast List Clean']]

# Sorted so that the column order of the encoding is deterministic.
all_actors = sorted(set().union(*star_cast_lists))
actor_to_index = {actor: index for index, actor in enumerate(all_actors)}

# One row per movie, one column per actor. int8 keeps the matrix compact,
# which matters once rows are repeated per training example.
star_cast_encoded = np.zeros((len(star_cast_lists), len(all_actors)), dtype=np.int8)
for row, actors in enumerate(star_cast_lists):
    if actors:
        star_cast_encoded[row, [actor_to_index[actor] for actor in actors]] = 1

In [9]:
# Collect the per-movie feature arrays that feed the neural net.

# Titles: the padded token-id sequences.
title_data = padded_sequences

# Director: the integer-encoded director column.
director_data = movies['Director_Encoded'].values

# Numerical features: rating, duration and year.
numerical_columns = ['IMDb Rating', 'Duration (minutes)', 'Year']
numerical_data = movies[numerical_columns].values

# Genre indicators: selected by column position (columns 11 up to, not including, 32).
genre_column_slice = slice(11, 32)
genre_data = movies.iloc[:, genre_column_slice].values

In [10]:
# Display the genre matrix (one row per movie) to check the column slice.
genre_data

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Print the smallest and largest IMDb rating present in the data.
imdb_ratings = movies['IMDb Rating']
print(imdb_ratings.min())
print(imdb_ratings.max())

0.1428571428571429
0.9107142857142858


In [12]:
# Simulate user-movie interactions.
# There is no real interaction data, so the IMDb rating is used as a proxy for
# popularity: a movie whose rating is at least `rating_threshold` is "liked" by
# each user independently with probability 0.9; all other movies are never liked.
num_users = 100
num_movies = len(movies)
rating_threshold = 0.6

# Fixed seed so that Restart & Run All reproduces the same interaction matrix.
interaction_rng = np.random.default_rng(42)

# Vectorised draw instead of a Python loop over every (user, movie) pair.
well_rated = movies['IMDb Rating'].to_numpy() >= rating_threshold   # (num_movies,)
random_draws = interaction_rng.random((num_users, num_movies))      # uniform in [0, 1)
user_movie_matrix = (well_rated[np.newaxis, :] & (random_draws > 0.1)).astype(float)

In [13]:
# Display the simulated interaction matrix (rows: users, columns: movies; 1 = liked).
user_movie_matrix

array([[0., 0., 1., ..., 1., 0., 1.],
       [0., 0., 1., ..., 1., 0., 1.],
       [0., 0., 1., ..., 1., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 1., 0., 1.],
       [0., 0., 1., ..., 1., 0., 1.]])

In [47]:
# ---------------------------------------------------------------------------
# Model graph: user/movie id embeddings combined with movie content features.
# ---------------------------------------------------------------------------

# Title dimensions are taken from the fitted preprocessing objects instead of
# being repeated as literals, so they cannot drift from the tokenizer/padding.
title_vocab_size = tokenizer.num_words
title_length = title_data.shape[1]

# --- Inputs ---
user_input = layers.Input(shape=(1,), name='user_input')
movie_input = layers.Input(shape=(1,), name='movie_input')
numerical_input = layers.Input(shape=(numerical_data.shape[1],), name='numerical_input')
director_input = layers.Input(shape=(1,), name='director_input')
genre_input = layers.Input(shape=(genre_data.shape[1],), name='genre_input')
star_cast_input = layers.Input(shape=(star_cast_encoded.shape[1],), name='star_cast_input')
title_input = layers.Input(shape=(title_length,), name='title_input')

# --- User / movie id embeddings (L2-regularised) ---
user_embedding = layers.Embedding(
    input_dim=num_users, output_dim=16,
    embeddings_regularizer=keras.regularizers.l2(0.001),
)(user_input)
movie_embedding = layers.Embedding(
    input_dim=num_movies, output_dim=16,
    embeddings_regularizer=keras.regularizers.l2(0.001),
)(movie_input)
user_vecs = layers.Flatten()(user_embedding)
movie_vecs = layers.Flatten()(movie_embedding)

# NOTE(review): this dot-product branch is built but never connected to
# `output_layer`, so it currently has no effect on the model. It is kept so the
# names stay defined; decide whether it should be fed into `combined_vecs`.
dot_product = layers.Dot(axes=1)([user_vecs, movie_vecs])
dot_product_reshaped = layers.Reshape((1,))(dot_product)

# --- Title embedding ---
# The sequence length is already fixed by `title_input`, so the deprecated
# `input_length` argument is not passed.
title_embedding_layer = layers.Embedding(input_dim=title_vocab_size, output_dim=16)(title_input)
title_vecs = layers.Flatten()(title_embedding_layer)

# --- Director embedding ---
num_directors = len(label_encoder.classes_)
director_embedding_layer = layers.Embedding(input_dim=num_directors, output_dim=8)(director_input)
director_vecs = layers.Flatten()(director_embedding_layer)

# --- Movie content tower ---
movie_features_concat = layers.concatenate([
    numerical_input,
    director_vecs,
    genre_input,
    star_cast_input,
    title_vecs
])
movie_features_dense = layers.Dense(
    128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)
)(movie_features_concat)
movie_features_dropout = layers.Dropout(0.5)(movie_features_dense)
movie_features_final = layers.Dense(
    64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)
)(movie_features_dropout)

# --- Combine id embeddings with content features and predict P(like) ---
combined_vecs = layers.concatenate([user_vecs, movie_vecs, movie_features_final])
combined_dense = layers.Dense(
    64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)
)(combined_vecs)
combined_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(1, activation='sigmoid')(combined_dropout)

In [48]:
# Assemble the functional model from its seven inputs and the sigmoid output.
# Arrays passed to fit() must follow the order of this list.
model_inputs = [
    user_input,
    movie_input,
    numerical_input,
    director_input,
    genre_input,
    star_cast_input,
    title_input,
]
model = keras.Model(inputs=model_inputs, outputs=output_layer)

In [49]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [51]:
# Stop once the validation loss has not improved for 3 epochs and roll the
# model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [54]:
# Build the training examples as (user, movie, label) triples.
#
# Every simulated "like" is a positive example. Using positives alone would
# give the classifier a constant target of 1, so an equal number of non-liked
# (user, movie) pairs is sampled as negatives.
pair_rng = np.random.default_rng(42)  # fixed seed => reproducible sampling and shuffle

positive_pairs = np.argwhere(user_movie_matrix == 1)   # rows of [user_id, movie_id]
negative_pairs = np.argwhere(user_movie_matrix != 1)
num_negatives = min(len(positive_pairs), len(negative_pairs))
negative_pairs = negative_pairs[pair_rng.permutation(len(negative_pairs))[:num_negatives]]

pairs = np.concatenate([positive_pairs, negative_pairs])
pair_labels = np.concatenate([
    np.ones(len(positive_pairs), dtype=int),
    np.zeros(len(negative_pairs), dtype=int),
])

# Shuffle once up front: fit(validation_split=...) holds out the *last* part of
# the arrays as given, which would otherwise be a single block of users/labels.
order = pair_rng.permutation(len(pairs))
user_ids = pairs[order, 0]
movie_ids = pairs[order, 1]
likes = pair_labels[order]

# Look up each example's movie features by the movie's row position.
train_numerical = numerical_data[movie_ids]
train_director = director_data[movie_ids]
train_genre = genre_data[movie_ids]
train_star_cast = star_cast_encoded[movie_ids]
train_title = title_data[movie_ids]

In [55]:
# Train the model. Inputs are listed in the same order as the model's inputs:
# user id, movie id, numerical features, director id, genres, star cast, title.
train_inputs = [
    np.array(user_ids),
    np.array(movie_ids),
    train_numerical,
    train_director,
    train_genre,
    train_star_cast,
    train_title,
]
train_labels = np.array(likes)

# 50 epochs is an upper bound; the early-stopping callback ends training sooner
# when the validation loss stops improving. 20% of the data is held out for
# validation.
history = model.fit(
    train_inputs,
    train_labels,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping],
    batch_size=64,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


In [57]:
# Persist the trained model together with every preprocessing artifact that is
# needed to rebuild the model inputs at prediction time.
model.save('movie_recommendation_model.h5')

# Fitted preprocessors: director label encoder and title tokenizer.
joblib.dump(label_encoder, 'director_encoder.pkl')
joblib.dump(tokenizer, 'title_tokenizer.pkl')

# Star-cast vocabulary: the name -> column mapping and the ordered name list
# that define the columns of the multi-hot star-cast input.
with open('actor_to_index.json', 'w') as f:
    json.dump(actor_to_index, f)
np.save('all_actors.npy', np.array(all_actors))

# Genre column names, in the order used for the genre input.
genre_columns = movies.iloc[:, 11:32].columns.tolist()
with open('genre_columns.json', 'w') as f:
    json.dump(genre_columns, f)