# Data

In [18]:
import pandas as pd

In [19]:
ratings = pd.read_csv('ml-25m/ratings.csv')
movies = pd.read_csv('ml-25m/movies.csv')
tags = pd.read_csv('ml-25m/tags.csv')

In [20]:
df = ratings.merge(movies, on='movieId', how='left')
df = df.merge(tags.drop(columns='timestamp'), on=['userId', 'movieId'], how='inner')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 834731 entries, 0 to 834730
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     834731 non-null  int64  
 1   movieId    834731 non-null  int64  
 2   rating     834731 non-null  float64
 3   timestamp  834731 non-null  int64  
 4   title      834731 non-null  object 
 5   genres     834731 non-null  object 
 6   tag        834730 non-null  object 
dtypes: float64(1), int64(3), object(3)
memory usage: 44.6+ MB


In [21]:
#for testing purposes
df = df.sample(100000).reset_index()

In [22]:
metadata = pd.read_csv('ml-meta/movies_metadata.csv')
links_df = pd.read_csv('ml-meta/links.csv')

  metadata = pd.read_csv('ml-meta/movies_metadata.csv')


In [23]:
metadata = metadata[['id', 'overview']]

In [24]:
links_df = links_df.dropna(subset=['tmdbId'])
links_df['tmdbId'] = links_df['tmdbId'].astype('int')

In [25]:
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')

metadata = metadata.dropna(subset=['id'])
metadata['id'] = metadata['id'].astype('int')

In [26]:

overviews = metadata.merge(links_df[['movieId', 'tmdbId']], left_on='id', right_on='tmdbId', how='inner')
overviews = overviews.dropna(subset=['overview'])
overviews.drop(columns=['tmdbId', 'id'], inplace=True)
overviews.head()

Unnamed: 0,overview,movieId
0,"Led by Woody, Andy's toys live happily in his ...",1
1,When siblings Judy and Peter discover an encha...,2
2,A family wedding reignites the ancient feud be...,3
3,"Cheated on, mistreated and stepped on, the wom...",4
4,Just when George Banks has recovered from his ...,5


# Models

In [9]:
import ollama

In [10]:

#model_llm = 'llama2'
#model_embed = 'nomic-embed-text'

##To generate, you can use
##response = ollama.generate(model=model, prompt=prompt)
##embedding ollama.embeddings(model=model_embed, prompt=prompt)

In [28]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [29]:
berttokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2', cache_dir = '../src/embedcache')
bertmodel = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2', cache_dir = '../src/embedcache')

In [30]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [31]:
def embed_func(batch, tokenizer, model):
    # Tokenize sentences
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return [list(sentence_embeddings[0].numpy())]

# Embeddings

In [35]:
from tqdm import tqdm
embeddings = []

progress_bar = tqdm(total=len(overviews), desc='Processing')

for idx, row in overviews.iterrows():
    overview = row['overview']
    embedding = embed_func(overview, berttokenizer, bertmodel)[0]
    embeddings.append(embedding)
    progress_bar.update(1)

overviews['vector'] = [row for row in embeddings]

Processing:   6%|▋         | 2840/44571 [01:07<16:36, 41.90it/s]
Processing: 100%|█████████▉| 44569/44571 [19:43<00:00, 56.00it/s]

Processing: 100%|██████████| 44571/44571 [20:00<00:00, 56.00it/s]

In [36]:
import lancedb

uri = '../ragdb'
ragdb = lancedb.connect(uri)
ragdb.drop_table("Movies")
tbl = ragdb.create_table('Movies', data=overviews[['movieId', 'vector']])

In [75]:
#test
query = "I would like to see more drama and romance movies"
max_suggestions = 10
vector = embed_func(query, berttokenizer, bertmodel)[0]

retrieved_records = tbl.search(vector).limit(max_suggestions).to_pandas()

In [78]:
tbl.search(vector).limit(max_suggestions).to_pandas()['movieId'].to_list()

[133375, 162336, 168888, 3131, 6368, 166546, 53651, 135071, 96851, 81667]

In [38]:
retrieved_records

Unnamed: 0,movieId,vector,_distance
0,133375,"[-0.0692963, -0.046866897, -0.12688683, 0.0058...",0.95614
1,162336,"[0.018908279, 0.007368817, -0.004198158, 0.045...",1.027969
2,168888,"[-0.0013160746, -0.00991325, -0.021595787, 0.0...",1.045778
3,3131,"[-0.029244866, -0.095703006, 0.041687638, 0.07...",1.064164
4,6368,"[0.029268965, -0.03936354, 0.002903857, 0.0079...",1.068176
5,166546,"[-0.015793266, -0.03366205, -0.053026374, 0.05...",1.068879
6,53651,"[-0.02170171, -0.0036294863, -0.017068068, 0.0...",1.071693
7,135071,"[0.062416494, -0.02689028, 0.025281953, 0.0437...",1.073855
8,96851,"[-0.033347163, -0.040253256, 0.0376712, 0.0561...",1.0758
9,81667,"[0.0028647617, 0.051629107, 0.028867915, 0.020...",1.080458


In [43]:
movies[movies['movieId'] == 133375]

Unnamed: 0,movieId,title,genres
29192,133375,Hush (2012),Thriller


# Prompts

In [81]:
def full_history_prompt(watched_movies):
    expertice = 'Pertend you are an expert recommender system. I will give you a user and their watch history of loved movies. Based on the watch history, recommend new movies to the user.\n'
    history = f"User watch history:\n{','.join(watched_movies)}"
    prompt = expertice + history
    return prompt

# Testing

In [68]:
df = df[df['userId'].map(df['userId'].value_counts()) >= 11]

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45772 entries, 1 to 99999
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   index      45772 non-null  int64  
 1   userId     45772 non-null  int64  
 2   movieId    45772 non-null  int64  
 3   rating     45772 non-null  float64
 4   timestamp  45772 non-null  int64  
 5   title      45772 non-null  object 
 6   genres     45772 non-null  object 
 7   tag        45772 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 3.1+ MB


In [70]:
len(df['userId'].unique())


1116

In [79]:
def recall(test_movie_ids, retrieved_ids):
    n = len(test_movie_ids)
    correct = 0
    for idx in retrieved_ids:
        if idx in test_movie_ids:
            correct += 1
    return correct / n

In [86]:
recalls = []
pb = tqdm(total=len(df['userId'].unique()), desc='Processing')
for user_id in df['userId'].unique():
    user_df = df[df['userId'] == user_id]
    user_df.sort_values(by='timestamp')
    test_movie_ids = user_df.head(10)['movieId'].to_list()
    prompt_movies = user_df.head(-10)['title'].to_list()
    prompt = full_history_prompt(prompt_movies)
    vector = embed_func(prompt, berttokenizer, bertmodel)[0]
    retrieved_ids = tbl.search(vector).limit(10).to_pandas()['movieId'].to_list()
    rec = recall(test_movie_ids, retrieved_ids)
    recalls.append(rec)
    pb.update(1)

Processing:   6%|▌         | 63/1116 [00:17<04:45,  3.69it/s]
Processing: 100%|█████████▉| 1114/1116 [01:39<00:00, 20.02it/s]

In [90]:
print(f"total history recall: {sum(recalls)/len(recalls)}")

total history recall: 0.0008064516129032257
