# Version 1: Movie Recommender based on summaries

In [None]:
# Import TensorFlow and hub
import tensorflow as tf
import tensorflow_hub as hub

# Plotting
import matplotlib.pyplot as plt

# some important packages
import os
import re
import numpy as np
import pandas as pd

# scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

## Universal Sentence Encoder
- We now use Google's Universal Sentence Encoder, which can generate an embedding for any sentence. We use those embeddings to build a recommendation system for our movies dataset.

In [None]:
# TF Hub handle of the Universal Sentence Encoder (version 4).
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(model_url)


def embed(texts):
    """Return the encoder's output for a list of input strings.

    Delegates to the module-level ``model`` loaded above.
    """
    encoded = model(texts)
    return encoded


# Smoke test: encode one example sentence (the notebook displays the result).
embed(['This movie was great!'])

## Loading our movies csv into our dataframe.
- In this section, we load the dataset into a pandas dataframe and select the important columns we need for this movie recommender.

In [None]:
# Load the cleaned movie table and keep only the columns the recommender uses.
df = pd.read_csv("movies_cleaned.csv")
df = df[["title", "genre", "summary", "directors", "actors"]]

# pandas reads empty CSV fields as NaN (a float), which is not valid text
# input for the sentence encoder. Missing summaries are replaced with an empty
# string instead of dropping the row, so position i in `summaries` still
# refers to row i of `df` when neighbour indices are looked up later.
summaries = df['summary'].fillna('').astype(str).tolist()

## Generating Embeddings
- Here, we generate embeddings for each summary using the Universal Sentence Encoder

In [None]:
# Encode every summary with a single call to embed(), then report the
# shape of the result.
embeddings = embed(summaries)
print('The embedding shape is:', embeddings.shape)

## Visualizing Embeddings
- We use PCA to reduce the embeddings' dimensionality to 2D and plot them for visualization purposes.

In [None]:
# Project the embeddings onto their first two principal components so they
# can be drawn on a plane.
pca = PCA(n_components=2)
emb_2d = pca.fit_transform(embeddings)

# Scatter plot: one point per embedded summary.
plt.figure(figsize=(11, 6))
plt.title('Embedding space')
plt.scatter(emb_2d[:, 0], emb_2d[:, 1])
plt.show()

## Nearest Neighbors
- We use the NearestNeighbors algorithm to find the closest movies in the embedding space.

In [None]:
# Build a 10-nearest-neighbour index over the summary embeddings. No metric is
# passed, so scikit-learn's default distance is used.
nn = NearestNeighbors(n_neighbors=10)
nn.fit(embeddings)

## Recommend Function
- We define the recommend() function that takes a text input, finds the closest movies based on their embeddings, and returns the titles of the recommended movies.

In [None]:
def recommend(text):
    """Return the titles of the movies whose summaries lie closest to ``text``.

    The text is embedded with ``embed``, looked up in the fitted ``nn`` index,
    and the resulting row positions are mapped to titles in ``df``.
    """
    query_vector = embed([text])
    # kneighbors yields one row of indices per query; there is exactly one query.
    index_rows = nn.kneighbors(query_vector, return_distance=False)
    nearest = index_rows[0]
    titles = df['title'].iloc[nearest]
    return titles.tolist()

# Example query: a free-text plot summary. The call is the last expression of
# the cell, so its return value is what the notebook displays.
print('Recommended Movies:')
recommend("After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.")

# Version 2: Movie Recommender with Multiple Parameters

**1. Select relevant AI methods that could solve the problem. Train, test and validate data models by using supervised and unsupervised methods, neural networks or graphs.**
- We have chosen the BERT (Bidirectional Encoder Representations from Transformers) model, which is a state-of-the-art method for natural language processing. We've used a pre-trained BERT model to generate embeddings for combined features (title, genre, summary, directors, and actors) of the movies.

**2. Select and apply appropriate measures for assessing the quality of your models. Iterate the process to explore possibilities for improving the quality of the models.**
- Although we have not implemented an explicit quality assessment in the code, we have utilized the NearestNeighbors algorithm from the scikit-learn library to find similar movies based on the embeddings. This implicitly evaluates the quality of the embeddings, as the recommendations would not be relevant if the embeddings were of poor quality.

**3. Implement the modules in the intended scenario as an AI prototype of your solution.**
- We have implemented a movie recommender system as an AI prototype solution. The code takes a text input (movie summary), processes it with the pre-trained BERT model, and recommends similar movies using the NearestNeighbors algorithm.

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModel
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

### Loading our dataset and pre-processing the data

In [None]:
# Load the cleaned movie table and keep only the columns the recommender uses.
# .copy() makes `df` an independent frame, so adding a column below does not
# trigger pandas' chained-assignment warning.
df = pd.read_csv("movies_cleaned.csv")
df = df[["title", "genre", "summary", "directors", "actors"]].copy()

# Join all text fields into one string per movie. Plain `+` concatenation turns
# the whole result into NaN as soon as a single field is missing, and NaN is
# not valid tokenizer input; filling with '' keeps every row usable and keeps
# row positions aligned with the embeddings generated from this column.
text_columns = ["title", "genre", "summary", "directors", "actors"]
df['combined'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

In [None]:
# Preview the first ten rows of the dataframe.
df.head(10)

### Loading a pre-trained BERT model and tokenizer

In [None]:
# Load the tokenizer and the bare encoder for the chosen checkpoint.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

### Function to get sentence embeddings

In [None]:
def get_sentence_embeddings(sentences, tokenizer, model, max_length=512):
    """Return one mean-pooled embedding per sentence as a NumPy array.

    Args:
        sentences: List of strings to embed.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        model: Encoder whose output exposes ``last_hidden_state``.
        max_length: Maximum number of tokens kept per sentence.

    Returns:
        Array with one row per input sentence.
    """
    # Send the inputs to wherever the model's weights live, so the function
    # keeps working when the model is reloaded onto a different device.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = {k: v.to(target_device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: every position counts as a real token.
        attention_mask = torch.ones(hidden.shape[:2], device=hidden.device)

    # Average over real tokens only. A plain mean would include padding
    # positions, making a sentence's vector depend on the batch it was in.
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (token_sum / token_count).detach().cpu().numpy()
    return embeddings

In [None]:
# Function to generate embeddings for dataset in batches
from tqdm import tqdm

def generate_embeddings(sentences, tokenizer, model, batch_size=32):
    """Embed ``sentences`` in consecutive batches and stack the results.

    Each slice of ``batch_size`` sentences is passed to
    ``get_sentence_embeddings``; progress is reported through tqdm. The
    per-batch arrays are stacked vertically into one array.
    """
    total = len(sentences)
    # Ceiling division: a final partial batch still counts as one step.
    n_chunks = (total + batch_size - 1) // batch_size
    starts = tqdm(range(0, total, batch_size), total=n_chunks, desc="Generating embeddings")
    chunks = [
        get_sentence_embeddings(sentences[start:start + batch_size], tokenizer, model)
        for start in starts
    ]
    return np.vstack(chunks)

### Generate embeddings for our dataset:

In [None]:
# Embed the combined text of every movie, in dataframe row order.
sentences = df['combined'].tolist()
embeddings = generate_embeddings(sentences, tokenizer, model)

### Nearest neighbors

In [None]:
# Build a 10-nearest-neighbour index over the BERT embeddings. No metric is
# passed on the active line, so scikit-learn's default distance is used.
nn = NearestNeighbors(n_neighbors=10)
# Alternative distance metrics to experiment with (swap in for the line above):
# nn = NearestNeighbors(n_neighbors=10, metric="cosine")
# nn = NearestNeighbors(n_neighbors=10, metric="manhattan")
# nn = NearestNeighbors(n_neighbors=10, metric="minkowski")
nn.fit(embeddings)

### Recommendation function

In [None]:
def recommend(text, tokenizer, model, nn, df):
    """Return the titles of the movies closest to ``text`` in embedding space.

    Args:
        text: Free-text query, e.g. a plot summary.
        tokenizer: Tokenizer passed through to ``get_sentence_embeddings``.
        model: Model passed through to ``get_sentence_embeddings``.
        nn: Fitted nearest-neighbour index.
        df: Dataframe with a ``title`` column, indexed by row position.

    Returns:
        List of titles for the neighbours returned by the index.
    """
    query_vector = get_sentence_embeddings([text], tokenizer, model)
    # kneighbors yields one row of indices per query; there is exactly one query.
    index_rows = nn.kneighbors(query_vector, return_distance=False)
    nearest = index_rows[0]
    titles = df['title'].iloc[nearest]
    return titles.tolist()

In [None]:
# Example query: a free-text plot summary.
input_text = "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."

# Print the titles returned for the query.
print('Recommended Movies:')
print(recommend(input_text, tokenizer, model, nn, df))

### Saving our model

In [None]:
# Write the model and the tokenizer to two separate local directories.
model.save_pretrained("movie_recommender_model")
tokenizer.save_pretrained("movie_recommender_tokenizer")

### Load our model

In [None]:
# Reload the model and tokenizer from the directories written above.
model = AutoModel.from_pretrained("movie_recommender_model")
tokenizer = AutoTokenizer.from_pretrained("movie_recommender_tokenizer")

# A freshly loaded model lives on the CPU. Move it to the same device the
# inputs are sent to, otherwise inference fails on machines with a GPU.
model = model.to(device)

### Splitting the data into training, validation, and testing sets

In [None]:
# Hold out 20% of the rows, then halve that hold-out into validation and test
# sets (80/10/10 overall). The fixed random_state makes the split repeatable.
# NOTE(review): test_df is not used in any cell visible in this notebook.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

### Tokenize the data and create PyTorch datasets

In [None]:
# Rebind `tokenizer` to the fast BERT tokenizer for the base checkpoint.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Tokenize the combined text of the training and validation rows, with
# truncation and padding enabled.
train_encodings = tokenizer(train_df['combined'].tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_df['combined'].tolist(), truncation=True, padding=True)

class MovieDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer output.

    ``encodings`` maps each field name (it must include ``"input_ids"``) to a
    sequence with one entry per example. A tokenizer ``BatchEncoding`` and a
    plain dict are both accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per field."""
        return {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}

    def __len__(self):
        """Return the number of examples."""
        # Key lookup rather than attribute access, so plain dicts work too.
        return len(self.encodings["input_ids"])

# Wrap the tokenized training and validation splits as datasets.
train_dataset = MovieDataset(train_encodings)
valid_dataset = MovieDataset(valid_encodings)

### Fine Tuning the BERT-Model

In [None]:
from transformers import BertForMaskedLM, DataCollatorForLanguageModeling

# The datasets contain only tokenized text and no labels. A sequence
# classification head cannot compute a loss without labels, so Trainer.train()
# would stop with "The model did not return a loss". Masked-language-model
# fine-tuning needs no labels: the collator masks random tokens and uses the
# original tokens as targets, adapting the encoder to the movie text.
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Masks 15% of the tokens in each batch and builds the matching `labels`.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

trainer.train()

# Save the fine-tuned weights and the tokenizer for the recommender to reload.
model.save_pretrained("fine_tuned_movie_recommender_model")
tokenizer.save_pretrained("fine_tuned_movie_recommender_tokenizer")


### Replacing the pre-trained BERT model with the fine-tuned model in our recommendation system

In [None]:
# Load only the encoder from the fine-tuned checkpoint. The recommender reads
# `last_hidden_state`, which the bare encoder returns and a model with a
# classification head does not. Move it to the device used for inference.
model = AutoModel.from_pretrained("fine_tuned_movie_recommender_model").to(device)
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_movie_recommender_tokenizer")

# The existing neighbour index holds vectors from the previous model. Queries
# embedded with the fine-tuned model are only comparable to dataset vectors
# from that same model, so re-embed the dataset and rebuild the index.
embeddings = generate_embeddings(sentences, tokenizer, model)
nn = NearestNeighbors(n_neighbors=10)
nn.fit(embeddings)

In [None]:
# Same example query as before, now answered with whichever `model`,
# `tokenizer` and `nn` are currently bound in the notebook.
input_text = "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."

# Print the titles returned for the query.
print('Recommended Movies:')
print(recommend(input_text, tokenizer, model, nn, df))