In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import faiss

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader




In [1]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-macosx_11_0_arm64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
df = pd.read_csv("../data/processed/movies_final.csv")

df["overview"] = df["overview"].fillna("")

# Display dataset structure
print(df.shape)
df.head()

(61123, 17)


Unnamed: 0,movieId,imdbId,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,overview,popularity,genres
0,1,114709,Toy Story,7.971,17152,Released,1995-10-30,394400000,81,False,30000000,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",78.404,"Animation, Adventure, Family, Comedy"
1,2,113497,Jumanji,7.239,9833,Released,1995-12-15,262821940,104,False,65000000,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,13.444,"Adventure, Fantasy, Family"
2,3,113228,Grumpier Old Men,6.476,347,Released,1995-12-22,71500000,101,False,25000000,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,14.815,"Romance, Comedy"
3,4,114885,Waiting to Exhale,6.183,142,Released,1995-12-22,81452156,127,False,16000000,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",14.451,"Comedy, Drama, Romance"
4,5,113041,Father of the Bride Part II,6.228,659,Released,1995-12-08,76594107,106,False,0,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,14.537,"Comedy, Family"


In [6]:
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(df["overview"])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=df["movieId"])

print("✅ TF-IDF embeddings generated successfully!")

✅ TF-IDF embeddings generated successfully!


In [8]:
np.save("../data/processed/tfidf_embeddings.npy", tfidf_matrix.toarray())

print("✅ TF-IDF embeddings saved at data/processed/tfidf_embeddings.npy")

✅ TF-IDF embeddings saved at data/processed/tfidf_embeddings.npy


In [4]:
tfidf_embeddings = np.load("../data/processed/tfidf_embeddings.npy")
cosine_sim = cosine_similarity(tfidf_embeddings)



In [13]:
def get_similar_movies(movie_title, n=5):
    if movie_title not in df["title"].values:
        return "❌ Movie not found in dataset."
    movie_index = df[df["title"] == movie_title].index[0]

    sim_scores = list(enumerate(cosine_sim[movie_index]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_movies = sim_scores[1:n+1]

    similar_movie_titles = [df.iloc[i[0]]["title"] for i in top_movies]

    return similar_movie_titles

In [14]:
print("🎬 Movies similar to 'The Matrix':")
print(get_similar_movies("The Matrix", n=5))


🎬 Movies similar to 'The Matrix':
['The Vanishing Prairie', 'Pulse', 'Who Am I', 'The Web', 'Algorithm']


In [29]:
text_encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Combine genres and overview for better embeddings
df["combined_text"] = df["genres"].fillna("") + " " + df["overview"].fillna("")

# Generate movie embeddings
movie_text_embeddings = text_encoder.encode(df["combined_text"].tolist(), batch_size=32, show_progress_bar=True)

# Save embeddings for later use
np.save("movie_text_embeddings.npy", movie_text_embeddings)

print("✅ Generated movie context embeddings (text-based features).")

Batches:   0%|          | 0/1911 [00:00<?, ?it/s]

✅ Generated movie context embeddings (text-based features).


In [18]:
np.save("../data/processed/transformer_embeddings.npy", transformer_embeddings)

print("✅ Transformer embeddings saved at data/processed/transformer_embeddings.npy")

✅ Transformer embeddings saved at data/processed/transformer_embeddings.npy


In [5]:
transformer_embeddings = np.load("../data/processed/transformer_embeddings.npy")
cosine_sim_transformer = cosine_similarity(transformer_embeddings)

print("✅ Transformer-based similarity matrix computed!")


✅ Transformer-based similarity matrix computed!


In [20]:
def get_similar_movies_transformer(movie_title, n=5):
    # Ensure movie title exists
    if movie_title not in df["title"].values:
        return " Movie not found in dataset."

    # Get the movie index
    movie_index = df[df["title"] == movie_title].index[0]

    # Get similarity scores for the movie
    sim_scores = list(enumerate(cosine_sim_transformer[movie_index]))

    # Sort movies based on similarity score (descending)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_movies = sim_scores[1:n+1]
    similar_movie_titles = [df.iloc[i[0]]["title"] for i in top_movies]

    return similar_movie_titles


In [30]:
print("Movies similar to 'Spiderman 2' (Transformer-based):")
print(get_similar_movies_transformer("Spiderman 2", n=5))


Movies similar to 'Spiderman 2' (Transformer-based):
❌ Movie not found in dataset.


In [6]:

df["combined_text"] = df["genres"] + " " + df["overview"]
df[["title", "combined_text"]].head()


Unnamed: 0,title,combined_text
0,Toy Story,"Animation, Adventure, Family, Comedy Led by Wo..."
1,Jumanji,"Adventure, Fantasy, Family When siblings Judy ..."
2,Grumpier Old Men,"Romance, Comedy A family wedding reignites the..."
3,Waiting to Exhale,"Comedy, Drama, Romance Cheated on, mistreated ..."
4,Father of the Bride Part II,"Comedy, Family Just when George Banks has reco..."


In [None]:
cosine_sim_transformer = cosine_similarity(transformer_embeddings)

# Encode genres numerically
genre_encoder = LabelEncoder()
df["genre_encoded"] = genre_encoder.fit_transform(df["genres"])

# Compute genre similarity
genre_similarity = cosine_similarity(df["genre_encoded"].values.reshape(-1, 1))

# Compute year similarity
df["release_year"] = pd.to_datetime(df["release_date"], errors="coerce").dt.year.fillna(0).astype(int)
year_similarity = 1 / (1 + abs(df["release_year"].values.reshape(-1, 1) - df["release_year"].values.reshape(1, -1)))

# Final Hybrid Similarity Calculation
final_similarity = 0.7 * cosine_sim_transformer + 0.2 * genre_similarity + 0.1 * year_similarity

print("✅ Final Hybrid Similarity Matrix Computed!")

In [6]:
index = faiss.IndexFlatL2(transformer_embeddings.shape[1])
index.add(transformer_embeddings)

print("✅ FAISS index built successfully!")


✅ FAISS index built successfully!


In [9]:
def get_faiss_recommendations(movie_title, n=5):
    if movie_title not in df["title"].values:
        return "❌ Movie not found in dataset."

    # Get movie index
    movie_index = df[df["title"] == movie_title].index[0]

    # Search for nearest neighbors
    distances, indices = index.search(transformer_embeddings[movie_index].reshape(1, -1), n+1)

    # Retrieve movie titles (skip first one as it's the query itself)
    recommended_titles = [df.iloc[i]["title"] for i in indices[0][1:]]

    return recommended_titles


In [11]:
print("🎬 Movies similar to 'Interstellar' (FAISS-based):")
print(get_faiss_recommendations("Interstellar", n=5))

print("🎬 Movies similar to 'Superbad' (FAISS-based):")
print(get_faiss_recommendations("Superbad", n=5))

print("🎬 Movies similar to 'The Notebook' (FAISS-based):")
print(get_faiss_recommendations("The Notebook", n=5))


🎬 Movies similar to 'Interstellar' (FAISS-based):
['Prometheus', 'The Beyond', 'The Farthest', 'Passage to Mars', 'Voyager: To the Final Frontier']
🎬 Movies similar to 'Superbad' (FAISS-based):
['Always', 'Wobble Palace', 'Blue Denim', 'Project X', 'Bernard and Huey']
🎬 Movies similar to 'The Notebook' (FAISS-based):
['The Longest Ride', 'Street Level', 'Lovely, Still', 'Doomed Love', 'A Majority of One']


In [12]:
# Improve text representation by adding genres & title
df["combined_text"] = df["title"] + " " + df["genres"] + " " + df["overview"]

# Check an example
df[["title", "combined_text"]].head()


Unnamed: 0,title,combined_text
0,Toy Story,"Toy Story Animation, Adventure, Family, Comedy..."
1,Jumanji,"Jumanji Adventure, Fantasy, Family When siblin..."
2,Grumpier Old Men,"Grumpier Old Men Romance, Comedy A family wedd..."
3,Waiting to Exhale,"Waiting to Exhale Comedy, Drama, Romance Cheat..."
4,Father of the Bride Part II,"Father of the Bride Part II Comedy, Family Jus..."


In [13]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate new embeddings for combined text (title + genre + overview)
new_embeddings = model.encode(df["combined_text"].tolist(), batch_size=32, show_progress_bar=True)

print("✅ Improved Transformer embeddings generated successfully!")

Batches:   0%|          | 0/1911 [00:00<?, ?it/s]

✅ Improved Transformer embeddings generated successfully!


In [14]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate new embeddings for combined text (title + genre + overview)
new_embeddings = model.encode(df["combined_text"].tolist(), batch_size=32, show_progress_bar=True)

print("✅ Improved Transformer embeddings generated successfully!")

Batches:   0%|          | 0/1911 [00:00<?, ?it/s]

✅ Improved Transformer embeddings generated successfully!


In [15]:
print("🎬 Movies similar to 'Interstellar' (Improved FAISS-based):")
print(get_faiss_recommendations("Interstellar", n=5))

print("🎬 Movies similar to 'Superbad' (Improved FAISS-based):")
print(get_faiss_recommendations("Superbad", n=5))

print("🎬 Movies similar to 'The Notebook' (Improved FAISS-based):")
print(get_faiss_recommendations("The Notebook", n=5))


🎬 Movies similar to 'Interstellar' (Improved FAISS-based):
['Prometheus', 'The Beyond', 'The Farthest', 'Passage to Mars', 'Voyager: To the Final Frontier']
🎬 Movies similar to 'Superbad' (Improved FAISS-based):
['Always', 'Wobble Palace', 'Blue Denim', 'Project X', 'Bernard and Huey']
🎬 Movies similar to 'The Notebook' (Improved FAISS-based):
['The Longest Ride', 'Street Level', 'Lovely, Still', 'Doomed Love', 'A Majority of One']


In [22]:
num_synthetic_users = 5000
synthetic_users = np.arange(1, num_synthetic_users + 1)

# Assign movies based on vote_count (higher vote_count = more watches)
df_sorted = df.sort_values(by="vote_count", ascending=False)  # Sort movies by popularity

# Generate watch history
max_movies_per_user = 10  # Each user watches up to 10 movies
user_watch_history = {}

for user in synthetic_users:
    # Select movies, favoring popular ones
    num_watched = np.random.randint(1, max_movies_per_user + 1)
    watched_movies = np.random.choice(df_sorted["movieId"].head(1000), size=num_watched, replace=False).tolist()
    user_watch_history[user] = watched_movies

print(f"✅ Assigned watch history to {num_synthetic_users} users based on popularity.")

✅ Assigned watch history to 5000 users based on popularity.


In [None]:
## class TransformerRecSys(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=256, num_heads=4, num_layers=2, dropout=0.2, max_seq_length=10):
        super(TransformerRecSys, self).__init__()

        # User and Movie Embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        # Positional Encoding for Watch History
        self.position_encoding = nn.Parameter(torch.randn(1, max_seq_length, embedding_dim))

        # Transformer Encoder
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dropout=dropout),
            num_layers=num_layers
        )

        # Fully Connected Layers for Rating Prediction
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)  # Output: rating or ranking score
        )

    def forward(self, user_ids, movie_ids, watch_history):
        """
        user_ids: Tensor of shape (batch_size,)
        movie_ids: Tensor of shape (batch_size,)
        watch_history: Tensor of shape (batch_size, max_seq_length)
        """

        # Get Embeddings
        user_embed = self.user_embedding(user_ids)  # Shape: (batch_size, embedding_dim)
        movie_embed = self.movie_embedding(movie_ids)  # Shape: (batch_size, embedding_dim)

        # Get Watch History Embeddings
        watch_history_embed = self.movie_embedding(watch_history)  # Shape: (batch_size, max_seq_length, embedding_dim)

        # Add Positional Encoding
        watch_history_embed = watch_history_embed + self.position_encoding[:, :watch_history.shape[1], :]

        # Apply Transformer Encoder
        transformed_history = self.transformer(watch_history_embed)  # Shape: (batch_size, max_seq_length, embedding_dim)

        # Pooling: Take the Mean of Transformer Output
        history_representation = transformed_history.mean(dim=1)  # Shape: (batch_size, embedding_dim)

        # Combine User, Movie, and History Representations
        combined_representation = user_embed + movie_embed + history_representation  # Shape: (batch_size, embedding_dim)

        # Pass Through Fully Connected Layers to Predict Rating
        output = self.fc(combined_representation)  # Shape: (batch_size, 1)

        return output


In [23]:
# Create user-movie interactions using real ratings (vote_average)
user_movie_interactions = []

for user, movies in user_watch_history.items():
    for movie in movies:
        # Get movie's vote_average
        movie_info = df[df["movieId"] == movie]
        if len(movie_info) == 0:
            continue  # Skip if movie ID is missing

        vote_avg = movie_info["vote_average"].values[0]

        user_movie_interactions.append({"userId": user, "movieId": movie, "rating": round(vote_avg, 1)})

# Convert to DataFrame
ratings_df = pd.DataFrame(user_movie_interactions)

print("✅ Synthetic dataset created using real `vote_average`.")


✅ Synthetic dataset created using real `vote_average`.


In [26]:
ratings_df.info

<bound method DataFrame.info of        userId  movieId  rating
0           1   185585     6.1
1           1    56145     6.9
2           1    93363     6.3
3           1     2194     7.8
4           1     5952     8.4
...       ...      ...     ...
27394    5000   166528     7.5
27395    5000   162602     6.4
27396    5000     3285     6.5
27397    5000   113348     5.9
27398    5000     3000     8.3

[27399 rows x 3 columns]>

In [30]:
import torch.nn as nn
import torch.nn.functional as F

class TransformerRecSys(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=256, num_heads=4, num_layers=2, dropout=0.2, max_seq_length=10):
        super(TransformerRecSys, self).__init__()

        # User and Movie Embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        # Positional Encoding for Watch History
        self.position_encoding = nn.Parameter(torch.randn(1, max_seq_length, embedding_dim))

        # Transformer Encoder for Watch History
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dropout=dropout),
            num_layers=num_layers
        )

        # Fully Connected Layers for Numerical Features
        self.num_features_fc = nn.Linear(3, embedding_dim)  # vote_average, vote_count, popularity

        # Fully Connected Layer for Text Features (Text Embeddings)
        self.text_embedding_fc = nn.Linear(384, embedding_dim)  # Convert text embedding to same dimension

        # Fully Connected Layers for Rating Prediction
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 3, 128),  # Combine user, movie, history, numerical + text features
            nn.ReLU(),
            nn.Linear(128, 1)  # Output: rating prediction
        )

    def forward(self, user_ids, movie_ids, watch_history, num_features, text_embedding):
        """
        user_ids: Tensor of shape (batch_size,)
        movie_ids: Tensor of shape (batch_size,)
        watch_history: Tensor of shape (batch_size, max_seq_length)
        num_features: Tensor of shape (batch_size, 3) - vote_average, vote_count, popularity
        text_embedding: Tensor of shape (batch_size, 384) - movie context embeddings
        """

        # Get Embeddings
        user_embed = self.user_embedding(user_ids)  # Shape: (batch_size, embedding_dim)
        movie_embed = self.movie_embedding(movie_ids)  # Shape: (batch_size, embedding_dim)

        # Get Watch History Embeddings
        watch_history_embed = self.movie_embedding(watch_history)  # Shape: (batch_size, max_seq_length, embedding_dim)

        # Add Positional Encoding
        watch_history_embed = watch_history_embed + self.position_encoding[:, :watch_history.shape[1], :]

        # Apply Transformer Encoder
        transformed_history = self.transformer(watch_history_embed)  # Shape: (batch_size, max_seq_length, embedding_dim)

        # Pooling: Take the Mean of Transformer Output
        history_representation = transformed_history.mean(dim=1)  # Shape: (batch_size, embedding_dim)

        # Process numerical features
        num_features_embed = self.num_features_fc(num_features)  # Shape: (batch_size, embedding_dim)

        # Process text embeddings
        text_features_embed = self.text_embedding_fc(text_embedding)  # Shape: (batch_size, embedding_dim)

        # Combine User, Movie, History, Numerical, and Text Representations
        combined_representation = (
            user_embed + movie_embed + history_representation + num_features_embed + text_features_embed
        )  # Shape: (batch_size, embedding_dim)

        # Pass Through Fully Connected Layers to Predict Rating
        output = self.fc(combined_representation)  # Shape: (batch_size, 1)

        return output


In [31]:
from torch.utils.data import Dataset, DataLoader
import torch

class MovieDataset(Dataset):
    def __init__(self, df, movie_text_embeddings, max_seq_length=10):
        self.df = df
        self.movie_text_embeddings = movie_text_embeddings
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Convert user & movie IDs to tensors
        user_id = torch.tensor(row["userId"], dtype=torch.long)
        movie_id = torch.tensor(row["movieId"], dtype=torch.long)

        # Watch history (Pad sequences if needed)
        watch_history = torch.tensor(
            user_watch_history.get(row["userId"], [0] * self.max_seq_length)[:self.max_seq_length],
            dtype=torch.long
        )

        # Numerical features: vote_average, vote_count, popularity
        num_features = torch.tensor(
            [row["vote_average"], row["vote_count"], row["popularity"]],
            dtype=torch.float
        )

        # Text embedding for movie (from precomputed embeddings)
        text_embedding = torch.tensor(self.movie_text_embeddings[idx], dtype=torch.float)

        # Rating (target)
        rating = torch.tensor(row["rating"], dtype=torch.float)

        return user_id, movie_id, watch_history, num_features, text_embedding, rating

# Load precomputed text embeddings
movie_text_embeddings = np.load("movie_text_embeddings.npy")

# Create dataset & dataloader
train_dataset = MovieDataset(ratings_df, movie_text_embeddings)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

print("✅ DataLoader now includes movie text embeddings + numerical metadata!")


✅ DataLoader now includes movie text embeddings + numerical metadata!
