In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Create DataFrame
movies = pd.read_csv('movies.csv')


# Combine genres and description to create a single "content" feature
movies['content'] = movies['genres'] + " " + movies['description']  # Fixed typo in 'description'

# Function to get BERT embeddings for the content
def get_bert_embeddings(texts):
    # Tokenize the texts
    inputs = tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the embeddings from the last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling
    return embeddings

# Step 1: Get BERT embeddings for the movie content
embeddings = get_bert_embeddings(movies['content'])

# Step 2: Compute cosine similarity between all movies based on embeddings
cosine_sim = cosine_similarity(embeddings.numpy(), embeddings.numpy())

# Function to get movie recommendations based on movie title
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = movies.index[movies['title'] == title].tolist()  # Fixed typo in 'title'
    if not idx:
        return f"No movie found with title '{title}'"
    idx = idx[0]

    # Get pairwise similarity scores for all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar movies (excluding itself)
    sim_scores = sim_scores[1:6]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 5 most similar movies
    return movies['title'].iloc[movie_indices]

# Example usage
movie_title = "The Lion King"
recommendations = get_recommendations(movie_title)
print(f"Movies recommended for '{movie_title}':")
for rec in recommendations:
  print(rec)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Movies recommended for 'The Lion King':
Spirited Away
Dune
Blade Runner 2049
Mad Max: Fury Road
Forrest Gump
