In [34]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [35]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english')

## Collect Data from TMDB

In [36]:
import requests
import json

from google.colab import userdata
tmdb_token = userdata.get('tmdb_auth')

# print(tmdb_token)

url = "https://api.themoviedb.org/3/movie/popular?language=en-US&page=1"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer " + tmdb_token
}
response = requests.get(url, headers=headers)
data = response.json() #convert response to dictionary
# print(data)
# print(json.dumps(data, indent=4)) #convert dictionary to JSON-formatted string

In [37]:
import pandas as pd

# construct movies dataframe
movies_list = data['results'] #extracts list of movies
movies_df = pd.DataFrame(movies_list)

# filter relevant columns
relevant_columns = ['id', 'title', 'overview', 'genre_ids', 'vote_average', 'popularity', 'release_date']
movies_df = movies_df[relevant_columns]

movies_df.head()

Unnamed: 0,id,title,overview,genre_ids,vote_average,popularity,release_date
0,1029575,The Family Plan,"Dan Morgan is many things: a devoted husband, ...","[28, 35]",7.397,3089.182,2023-12-14
1,695721,The Hunger Games: The Ballad of Songbirds & Sn...,64 years before he becomes the tyrannical pres...,"[18, 878, 28]",7.236,1965.673,2023-11-15
2,848326,Rebel Moon - Part One: A Child of Fire,When a peaceful colony on the edge of the gala...,[878],6.456,1846.529,2023-12-15
3,891699,Silent Night,A tormented father witnesses his young son die...,"[28, 80]",5.862,1239.235,2023-11-30
4,572802,Aquaman and the Lost Kingdom,"Black Manta, still driven by the need to aveng...","[28, 12, 14]",6.489,1369.764,2023-12-20


In [38]:
# movies_df.tail()

In [39]:
# Create a dictionary mapping movie ID to index
id_to_index = pd.Series(movies_df.index, index=movies_df['id']).to_dict()
print(id_to_index)

{1029575: 0, 695721: 1, 848326: 2, 891699: 3, 572802: 4, 1071215: 5, 1131755: 6, 1143183: 7, 507089: 8, 844416: 9, 897087: 10, 299054: 11, 670292: 12, 1035982: 13, 872585: 14, 975902: 15, 787699: 16, 807172: 17, 1034062: 18, 1189927: 19}


## Data Preprocessing


### Text Cleansing

In [40]:
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [41]:
def clean(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) #  remove unwanted characters
    text = re.sub(r"http\S+", "",text) # remove URLs
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #remove punctuation marks
    text = [word.lower() for word in text.split() if word.lower() not in sw] #tokenize / remove stopwords
    text = [lemmatizer.lemmatize(word) for word in text] #convert word to its base form
    text = " ".join(text)
    return text

In [42]:
movies_df['overview'] = movies_df['overview'].apply(lambda x: clean(x))
movies_df.head()

Unnamed: 0,id,title,overview,genre_ids,vote_average,popularity,release_date
0,1029575,The Family Plan,"dan morgan many thing devoted husband, loving ...","[28, 35]",7.397,3089.182,2023-12-14
1,695721,The Hunger Games: The Ballad of Songbirds & Sn...,"year becomes tyrannical president panem, corio...","[18, 878, 28]",7.236,1965.673,2023-11-15
2,848326,Rebel Moon - Part One: A Child of Fire,peaceful colony edge galaxy find threatened ar...,[878],6.456,1846.529,2023-12-15
3,891699,Silent Night,tormented father witness young son die caught ...,"[28, 80]",5.862,1239.235,2023-11-30
4,572802,Aquaman and the Lost Kingdom,"black manta, still driven need avenge father d...","[28, 12, 14]",6.489,1369.764,2023-12-20


### MinMaxScaler
For the numerical columns like average votes, I used the min max scaler to scale values into the specified range of 0 and 1.

In [43]:
from sklearn.preprocessing import MinMaxScaler
def preprocess_minmax(data):
  scaler = MinMaxScaler(feature_range=(0, 1))
  return scaler.fit_transform(data)

movies_df['normalized_vote'] = preprocess_minmax(movies_df[['vote_average']])
movies_df.head()

Unnamed: 0,id,title,overview,genre_ids,vote_average,popularity,release_date,normalized_vote
0,1029575,The Family Plan,"dan morgan many thing devoted husband, loving ...","[28, 35]",7.397,3089.182,2023-12-14,0.771646
1,695721,The Hunger Games: The Ballad of Songbirds & Sn...,"year becomes tyrannical president panem, corio...","[18, 878, 28]",7.236,1965.673,2023-11-15,0.720584
2,848326,Rebel Moon - Part One: A Child of Fire,peaceful colony edge galaxy find threatened ar...,[878],6.456,1846.529,2023-12-15,0.4732
3,891699,Silent Night,tormented father witness young son die caught ...,"[28, 80]",5.862,1239.235,2023-11-30,0.284808
4,572802,Aquaman and the Lost Kingdom,"black manta, still driven need avenge father d...","[28, 12, 14]",6.489,1369.764,2023-12-20,0.483666


### One-hot encoding

For the categorical column like genre, I used one hot encoding to convert the variables into numerical values

In [44]:
from sklearn.preprocessing import MultiLabelBinarizer #useful for transforming categorical data like genre
mlb = MultiLabelBinarizer()

# transform genre IDs into a binary matrix
one_hot_genres = mlb.fit_transform(movies_df['genre_ids'])

# create a DataFrame from the encoded genres
genres_df = pd.DataFrame(one_hot_genres, columns=mlb.classes_)

# Concatenate the new genres DataFrame with the original movies DataFrame column-wise
movies_df = pd.concat([movies_df, genres_df], axis=1)
movies_df.head()

Unnamed: 0,id,title,overview,genre_ids,vote_average,popularity,release_date,normalized_vote,12,14,...,28,35,36,53,80,878,9648,10749,10751,10752
0,1029575,The Family Plan,"dan morgan many thing devoted husband, loving ...","[28, 35]",7.397,3089.182,2023-12-14,0.771646,0,0,...,1,1,0,0,0,0,0,0,0,0
1,695721,The Hunger Games: The Ballad of Songbirds & Sn...,"year becomes tyrannical president panem, corio...","[18, 878, 28]",7.236,1965.673,2023-11-15,0.720584,0,0,...,1,0,0,0,0,1,0,0,0,0
2,848326,Rebel Moon - Part One: A Child of Fire,peaceful colony edge galaxy find threatened ar...,[878],6.456,1846.529,2023-12-15,0.4732,0,0,...,0,0,0,0,0,1,0,0,0,0
3,891699,Silent Night,tormented father witness young son die caught ...,"[28, 80]",5.862,1239.235,2023-11-30,0.284808,0,0,...,1,0,0,0,1,0,0,0,0,0
4,572802,Aquaman and the Lost Kingdom,"black manta, still driven need avenge father d...","[28, 12, 14]",6.489,1369.764,2023-12-20,0.483666,1,1,...,1,0,0,0,0,0,0,0,0,0


In [45]:
# movies_df

### TF-IDF for feature extraction

Converts text data like overview into numerical vectors

In [46]:
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer


# Applying TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
movies_df['overview'] = movies_df['overview'].fillna('') # replace NaN with an empty string
tfidf_matrix = tfidf.fit_transform(movies_df['overview']) # construct the TF-IDF matrix
# print(tfidf_matrix)

## Recommendation function to test combined features

In [47]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# combine one-hot encoded genre with tf-idf matrix from overview feature, and normalized votes
combined_features = np.hstack((tfidf_matrix.toarray(), one_hot_genres, movies_df[['normalized_vote']].to_numpy()))

# Pre calculate Cosine Similarity for convenience
cosine_sim = cosine_similarity(combined_features, combined_features)

# print(combined_features)
# print(cosine_sim)

# get recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim):
    # Check if the movie title exists in the DataFrame
    if title in movies_df['title'].values:
      # Get the index of the movie that matches the title
      idx = movies_df.index[movies_df['title'] == title].tolist()[0]
      # Get the pairwise similarity scores of all movies with that movie
      sim_scores = list(enumerate(cosine_sim[idx]))
      # Sort the movies based on the similarity scores
      sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

      sim_scores = sim_scores[1:11]  # Get top 10 similar movies
      movie_indices = [i[0] for i in sim_scores] # Get the movie indices
      return movies_df['title'].iloc[movie_indices]

    else:
      return "Movie title not found in the dataset."

In [48]:
# Test the recommendation system with a movie title
recommendations = get_recommendations("Aquaman and the Lost Kingdom")
print(recommendations)

12                          The Creator
18    Mortal Kombat Legends: Cage Match
11                          Expend4bles
0                       The Family Plan
9               Rumble Through the Dark
3                          Silent Night
10                            Freelance
15                              Boudica
19                             The Duel
16                                Wonka
Name: title, dtype: object
