In [None]:
import nltk
# Download the NLTK resources this notebook relies on.
nltk.download('stopwords')  # stop-word lists, read later via stopwords.words('english')
nltk.download('wordnet')  # WordNet data -- presumably what WordNetLemmatizer needs; TODO confirm
nltk.download('omw-1.4')  # Open Multilingual Wordnet -- presumably a WordNet dependency; TODO confirm

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# English stop-word list from NLTK.
# NOTE(review): not referenced again in the visible cells -- the text-cleansing
# section loads the same list into `sw`; confirm whether both names are needed.
stop_words = stopwords.words('english')

## Collect Data from TMDB

In [None]:
import requests
import json

from google.colab import userdata
# Read the TMDB bearer token from the Colab secret named 'tmdb_auth'.
# (Never print the token: cell outputs are saved with the notebook.)
tmdb_token = userdata.get('tmdb_auth')

url = "https://api.themoviedb.org/3/movie/popular?language=en-US&page=1"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer " + tmdb_token
}
# The timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(url, headers=headers, timeout=30)
# Fail here on a 4xx/5xx answer (e.g. an invalid token) instead of later with
# a confusing KeyError when 'results' is read from an error payload.
response.raise_for_status()
data = response.json() #convert response to dictionary
# print(json.dumps(data, indent=4)) #convert dictionary to JSON-formatted string

In [None]:
import pandas as pd

# The API response keeps one dict per movie under the 'results' key
movies_list = data['results']

# Columns kept for the rest of the notebook
relevant_columns = ['id', 'title', 'overview', 'genre_ids', 'vote_average', 'popularity', 'release_date']

# Build the frame from the movie dicts and narrow it to those columns in one step
movies_df = pd.DataFrame(movies_list)[relevant_columns]

movies_df.head()

In [None]:
# movies_df.tail()

In [None]:
# Lookup table: TMDB movie ID -> row label of that movie in movies_df
index_by_id = pd.Series(data=movies_df.index, index=movies_df['id'])
id_to_index = index_by_id.to_dict()
print(id_to_index)

## Data Preprocessing


### Text Cleansing

In [None]:
# English stop-word list; clean() drops every token found in it.
sw = stopwords.words('english')
# Shared lemmatizer instance; clean() calls lemmatizer.lemmatize() on each kept token.
lemmatizer = WordNetLemmatizer()

In [None]:
def clean(text):
    """Normalize a free-text string before it is vectorized.

    Steps, in order: lowercase, drop URLs, replace every run of characters
    other than ASCII letters and ``? . ! , ¿`` with a single space, delete
    those remaining punctuation marks, remove stop words (module-level
    ``sw``) and lemmatize each surviving token (module-level ``lemmatizer``).

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or NaN) is
            treated as empty text instead of raising.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # URLs must be removed first: the character filter below turns ':' and '/'
    # into spaces, after which the URL pattern can no longer match a whole URL.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # remove unwanted characters
    # Only ? . ! , ¿ survive the filter above. Delete all of them so that tokens
    # such as "city," still match the stop-word list and lemmatize correctly.
    text = re.sub(r"[?.!,¿]", "", text)
    tokens = [word for word in text.split() if word not in sw] # tokenize / remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens] # convert word to its base form
    return " ".join(tokens)

In [None]:
# Replace each raw overview with its cleaned form (clean is applied per row)
movies_df['overview'] = movies_df['overview'].apply(clean)
movies_df.head()

### MinMaxScaler
For numerical columns such as the average vote, I used scikit-learn's `MinMaxScaler` to rescale the values into the range [0, 1].

In [None]:
from sklearn.preprocessing import MinMaxScaler
def preprocess_minmax(data):
  """Fit a fresh MinMaxScaler with feature_range (0, 1) on `data` and return the scaled values."""
  return MinMaxScaler(feature_range=(0, 1)).fit_transform(data)

# Double brackets keep the selection a one-column DataFrame for the scaler
vote_column = movies_df[['vote_average']]
movies_df['normalized_vote'] = preprocess_minmax(vote_column)
movies_df.head()

### One-hot encoding

For categorical columns such as genre, I used one-hot encoding to convert the categories into numerical values. Because a movie can belong to several genres, each genre ID becomes its own 0/1 column.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer #useful for transforming categorical data like genre
mlb = MultiLabelBinarizer()

# Transform each movie's list of genre IDs into a row of a binary matrix
# (one column per distinct genre ID seen in the data)
one_hot_genres = mlb.fit_transform(movies_df['genre_ids'])

# Wrap the matrix in a DataFrame labelled by genre ID. Reusing movies_df's index
# matters: concat(axis=1) aligns rows by index label, so a fresh 0..n-1 index
# would misalign rows (and pad with NaN) if movies_df were ever filtered or re-indexed.
genres_df = pd.DataFrame(one_hot_genres, columns=mlb.classes_, index=movies_df.index)

# Concatenate the new genres DataFrame with the original movies DataFrame column-wise
movies_df = pd.concat([movies_df, genres_df], axis=1)
movies_df.head()

In [None]:
# movies_df

### TF-IDF for feature extraction

TF-IDF converts text data, such as the movie overview, into numerical vectors.

In [None]:
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer


# Make sure every overview is a string before vectorizing (NaN -> '')
movies_df['overview'] = movies_df['overview'].fillna('')

# Fit the TF-IDF vectorizer (with its built-in English stop-word filter) on the
# overviews and keep the resulting document-term matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['overview'])
# print(tfidf_matrix)

## Recommendation function to test combined features

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Feature blocks to stack side by side, one row per movie:
# TF-IDF of the overview, one-hot genres, and the normalized vote
overview_features = tfidf_matrix.toarray()
vote_features = movies_df[['normalized_vote']].to_numpy()
combined_features = np.hstack((overview_features, one_hot_genres, vote_features))

# Pairwise cosine similarity between all movies, computed once up front
cosine_sim = cosine_similarity(combined_features, combined_features)

# print(combined_features)
# print(cosine_sim)

# get recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Exact movie title to look up in ``movies_df['title']``. If
            several rows share the title, the first one is used.
        cosine_sim: Square similarity matrix whose rows and columns follow the
            row order of ``movies_df``. Defaults to the module-level
            ``cosine_sim`` as it exists when this function is defined.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A Series of up to ``top_n`` titles ordered from most to least similar,
        or the string "Movie title not found in the dataset." when no movie
        has that exact title.
    """
    title_matches = (movies_df['title'] == title).to_numpy()
    if not title_matches.any():
        return "Movie title not found in the dataset."

    # Row position (not index label) of the first match: both cosine_sim and
    # .iloc below are positional, so this stays correct for any DataFrame index.
    idx = int(title_matches.argmax())

    # Drop the queried movie explicitly instead of assuming it sorts first;
    # on a similarity tie it may not, and it would then be recommended to itself.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    # Sort the movies based on the similarity scores, most similar first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in sim_scores[:top_n]]
    return movies_df['title'].iloc[movie_indices]

In [None]:
# Try the recommender on one title from the fetched list
query_title = "Aquaman and the Lost Kingdom"
recommendations = get_recommendations(query_title)
print(recommendations)