In [1]:
import pandas as pd
import numpy as np

In [2]:
embeddings = np.load('embeddings.npy')
df_metadata = pd.read_csv('cleaned_metadata.csv')

In [3]:
print(embeddings.shape)

(32262, 384)


In [4]:
sentences = []
for row in df_metadata.itertuples(index=True):
    if row.overview is not None:
        sentences.append(str(row.overview))

In [5]:
import faiss

faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(384)
index.add(embeddings)

In [6]:
def similar_movies(title, df_metadata, index):
    df_index = df_metadata[df_metadata['original_title'] == title].index
    if len(df_index) == 0: 
        return []
    real_index = 0
    if len(df_index) > 1: 
        popularity = 0
        
        for i in df_index:
            row = df_metadata.iloc[i]
            if row["popularity"] > popularity:
                popularity = row["popularity"]
                real_index = i
        
    print(len(df_index))
    print(real_index)
    query = embeddings[real_index].reshape(1,-1).copy()
    
    faiss.normalize_L2(query)
    D, I = index.search(query, 10)
    return I[0]


In [7]:
similar_movies("The weirdo Knight", df_metadata, index)

[]

In [8]:
df_ratings = pd.read_csv('cleaned_ratings.csv')

In [9]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [10]:
user_counts = df_ratings['userId'].value_counts()


In [11]:
df_ratings = df_ratings[df_ratings['userId'].isin(user_counts[user_counts >= 5].index)]
df_ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [12]:
def mapping_movies():
    movie2idx = {}
    i = 0
    for movie_id in df_ratings["movieId"].unique():  # go through unique movie IDs
        movie2idx[movie_id] = i
        i+=1

    return movie2idx

def mapping_users():
    user2idx = {}
    i = 0
    for user_id in df_ratings["userId"].unique():  # go through unique movie IDs
        user2idx[user_id] = i
        i+=1

    return user2idx

movie2idx = mapping_movies()
user2idx = mapping_users()



In [13]:
data = []

length = 8000
for i in range(length):
    row = df_ratings.iloc[i]
    user_data = user2idx[int(row["userId"])]
    movie_data = movie2idx[int(row["movieId"])]
    rating_data = float(row["rating"])

    data.append([user_data, movie_data, rating_data])

print(df_ratings["userId"].nunique(), len(user2idx))
print(df_ratings["movieId"].nunique(), len(movie2idx))



256107 256107
45109 45109


In [1]:
print(df_ratings["userId"].nunique())
print(df_ratings["movieId"].nunique())

print(len(df_ratings))

NameError: name 'df_ratings' is not defined

In [None]:
from torch.utils.data import Dataset, DataLoader

class MoviesDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):

        user_data = self.data[idx][0]
        movie_data = self.data[idx][1]
        rating_data = self.data[idx][2]

        user_id_tensor = torch.tensor(user_data, dtype=torch.long)
        movie_id_tensor = torch.tensor(movie_data, dtype=torch.long)
        rating_tensor = torch.tensor(rating_data, dtype=torch.float32)

        return(user_id_tensor, movie_id_tensor, rating_tensor)


        

In [None]:
dataloader = DataLoader(data, batch_size=150, shuffle=True)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MovieRecommender(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=128):
            super(MyRecommender, self).__init__()
            self.user_embedding = nn.Embedding(num_users, embedding_dim)
            self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
            self.mlp = nn.Sequential(
                nn.Linear(256, 128),  # shrink 256-dim input → 128
                nn.ReLU(),            # add nonlinearity
                nn.Linear(128, 64),   # shrink again
                nn.ReLU(),            
                nn.Linear(64, 1)      # final output: one number (predicted rating)
            )

    def forward(self, user_ids, movie_ids):
        user_vecs = self.user_embedding(user_ids)
        movie_vecs = self.movie_embedding(movie_ids)

        combined_vec = torch.cat((user_vecs, movie_vecs), dim=1)

        return self.mlp(combined_vec).squeeze(-1)



        


In [None]:
#training loop

num_epochs = 5
train_losses, val_losses = [], []

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MovieRecommender(8000,8000)
model.to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    running_loss = 0.0

    for batch in dataloader:
        user_ids, movie_ids, ratings = batch
        user_ids = user_ids.to(device)
        movie_ids = movie_ids.to(device)
        ratings = ratings.to(device)

        predictions = model(user_ids, movie_ids)
        loss = criterion(predictions, ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()  # accumulate
    
    avg_loss = running_loss / len(dataloader)  # mean per batch
    train_losses.append(avg_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")



