# Version 1: Movie Recommender based on summaries

In [None]:
# Import TensorFlow and hub
import tensorflow as tf
import tensorflow_hub as hub

# Plotting
import matplotlib.pyplot as plt

# some important packages
import os
import re
import numpy as np
import pandas as pd

# scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

## Universal Sentence Encoder
- Now we will use Google's Universal Sentence Encoder which can generate embeddings for any sentence, those embeddings we can use to create a recommendation system for our movies dataset.

In [None]:
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(model_url)

def embed(texts):
    return model(texts)

embed(['This movie was great!'])

## Loading our movies csv into our dataframe.
- In this section, we load the dataset into a pandas dataframe and select the important columns we need for this movie recommender.

In [None]:
df = pd.read_csv("movies_cleaned.csv")
df = df[["title", "genre", "summary", "directors", "actors"]]

summaries = list(df['summary'])

## Generating Embeddings
- Here, we generate embeddings for each summary using the Universal Sentence Encoder

In [None]:
embeddings = embed(summaries)
print('The embedding shape is:', embeddings.shape)

## Visualizing Embeddings
- We use PCA to reduce the embeddings' dimensionality to 2D and plot them for visualization purposes.

In [None]:
pca = PCA(n_components=2)
emb_2d = pca.fit_transform(embeddings)

plt.figure(figsize=(11, 6))
plt.title('Embedding space')
plt.scatter(emb_2d[:, 0], emb_2d[:, 1])
plt.show()

## Nearest Neighbors
- We use the NearestNeighbors algorithm to find the closest movies in the embedding space.

In [None]:
nn = NearestNeighbors(n_neighbors=10)
nn.fit(embeddings)

## Recommend Function
- We define the recommend() function that takes a text input, finds the closest movies based on their embeddings, and returns the titles of the recommended movies.

In [None]:
def recommend(text):
    emb = embed([text])
    neighbors = nn.kneighbors(emb, return_distance=False)[0]
    return df['title'].iloc[neighbors].tolist()

print('Recommended Movies:')
recommend("After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.")

# Version 2: Movie Recommender several parameters.

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

### Loading our dataset and pre-processing the data

In [2]:
df = pd.read_csv("movies_cleaned.csv")
df = df[["title", "genre", "summary", "directors", "actors"]]
df['combined'] = df['title'] + ' ' + df['genre'] + ' ' + df['summary'] + ' ' + df['directors'] + ' ' + df['actors']

### Loading a pre-trained BERT model and tokenizer

In [3]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

### Function to get sentence embeddings

In [4]:
def get_sentence_embeddings(sentences, tokenizer, model, max_length=512):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()
    return embeddings

In [5]:
# Function to generate embeddings for dataset in batches
from tqdm import tqdm

def generate_embeddings(sentences, tokenizer, model, batch_size=32):
    embeddings = []
    num_batches = (len(sentences) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(sentences), batch_size), total=num_batches, desc="Generating embeddings"):
        batch = sentences[i:i+batch_size]
        batch_embeddings = get_sentence_embeddings(batch, tokenizer, model)
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

### Generate embeddings for our dataset:

In [None]:
sentences = df['combined'].tolist()
embeddings = generate_embeddings(sentences, tokenizer, model)

Generating embeddings:  75%|███████▌  | 24/32 [1:13:18<23:16, 174.57s/it]

### nearest neighbors

In [None]:
nn = NearestNeighbors(n_neighbors=10)
nn.fit(embeddings)

### Recommendation function

In [None]:
def recommend(text, tokenizer, model, nn, df):
    emb = get_sentence_embeddings([text], tokenizer, model)
    neighbors = nn.kneighbors(emb, return_distance=False)[0]
    return df['title'].iloc[neighbors].tolist()

In [None]:
input_text = "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."

print('Recommended Movies:')
print(recommend(input_text, tokenizer, model, nn, df))

### Saving our model

In [None]:
model.save_pretrained("movie_recommender_model")
tokenizer.save_pretrained("movie_recommender_tokenizer")

### Load our model

In [None]:
# model = AutoModel.from_pretrained("movie_recommender_model")
# tokenizer = AutoTokenizer.from_pretrained("movie_recommender_tokenizer")