# Loading data

In [79]:
import pandas as pd

MovieLens data

In [80]:
ratings = pd.read_csv('ml-25m/ratings.csv')
movies = pd.read_csv('ml-25m/movies.csv')

In [81]:
df_watches = ratings.merge(movies, on='movieId', how='inner')
df_watches = df_watches[df_watches['rating'] > 3]
df_watches.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15630129 entries, 0 to 25000094
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
 4   title      object 
 5   genres     object 
dtypes: float64(1), int64(3), object(2)
memory usage: 834.7+ MB


Overviews data

In [82]:
metadata = pd.read_csv('ml-meta/movies_metadata.csv')
links_df = pd.read_csv('ml-meta/links.csv')

  metadata = pd.read_csv('ml-meta/movies_metadata.csv')


In [83]:
#drop unnecessary columns
metadata = metadata[['id', 'original_title','overview']]

In [84]:
#get equivalent ids
links_df = links_df.dropna(subset=['tmdbId'])
links_df['tmdbId'] = links_df['tmdbId'].astype('int')
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')
metadata = metadata.dropna(subset=['id'])
metadata['id'] = metadata['id'].astype('int')

In [85]:
#put them all together
overviews = metadata.merge(links_df[['movieId', 'tmdbId']], left_on='id', right_on='tmdbId', how='inner')
overviews = overviews.dropna(subset=['overview'])
overviews.drop(columns=['tmdbId', 'id'], inplace=True)
overviews = overviews[['movieId', 'original_title', 'overview']]
overviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44571 entries, 0 to 45524
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   movieId         44571 non-null  int64 
 1   original_title  44571 non-null  object
 2   overview        44571 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


Prepare for processing

In [86]:
common_movies = list(set(overviews['movieId'].unique()) & set(df_watches['movieId'].unique()))
df_watches = df_watches[df_watches['movieId'].isin(common_movies)]

In [87]:
df_watches = df_watches[df_watches['userId'].map(df_watches['userId'].value_counts()) >= 110]
common_movies_smaller = list(set(overviews['movieId'].unique()) & set(df_watches['movieId'].unique()))
overviews = overviews[overviews['movieId'].isin(common_movies_smaller)]
df_watches = df_watches[df_watches['movieId'].isin(common_movies_smaller)]

In [88]:
len(df_watches['userId'].unique())

39806

In [89]:
selected_users = df_watches['userId'].drop_duplicates().sample(1000).unique()
df_watches = df_watches[df_watches['userId'].isin(selected_users)]
common_movies_even_smaller = list(set(overviews['movieId'].unique()) & set(df_watches['movieId'].unique()))
overviews = overviews[overviews['movieId'].isin(common_movies_even_smaller)]
df_watches = df_watches[df_watches['movieId'].isin(common_movies_even_smaller)]

In [90]:
df_watches.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263302 entries, 24 to 24999963
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     263302 non-null  int64  
 1   movieId    263302 non-null  int64  
 2   rating     263302 non-null  float64
 3   timestamp  263302 non-null  int64  
 4   title      263302 non-null  object 
 5   genres     263302 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 14.1+ MB


In [91]:
overviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12945 entries, 0 to 45497
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   movieId         12945 non-null  int64 
 1   original_title  12945 non-null  object
 2   overview        12945 non-null  object
dtypes: int64(1), object(2)
memory usage: 404.5+ KB


In [92]:
overviews.to_pickle('data/overviews.pkl')
df_watches.to_pickle('data/watches.pkl')

# Constructing prompts

In [93]:
overviews = pd.read_pickle('data/overviews.pkl')
df_watches = pd.read_pickle('data/watches.pkl')

In [95]:
users_df = pd.DataFrame(columns=['userId', 'test_movie_ids', 'prompt'])

In [96]:
def make_prompt(prompt_movie_ids, add_overview = False):
    expertice = "You are given a user's movie watch history. Each of the history items below contains the name of a movie that a person enjoyed watching"
    if add_overview:
        expertice += " followed by a description of it"
    expertice += ".\n"
    tmp = ",\n"
    watched_movies = []
    for m_id in prompt_movie_ids:
        name = overviews[overviews['movieId'] == m_id]['original_title'].tolist()[0]
        overview = overviews[overviews['movieId'] == m_id]['overview'].tolist()[0]
        for_prompt = "Title: " + name
        if add_overview:
            for_prompt += "\nDescription: " + overview
        for_prompt += '\n'
        watched_movies.append(for_prompt)
    history = f"User watch history:\n{tmp.join(watched_movies)}"
    prompt = expertice + history
    return prompt

In [97]:
users_df['userId'] = df_watches['userId'].unique()

In [98]:
test_movie_ids = []
prompts_no_overview = []
prompts_with_overview = []
for user_id in users_df['userId'].unique():
    u_df = df_watches[df_watches['userId'] == user_id].sort_values('timestamp', ascending=False)
    t_m_ids = u_df.head(10)['movieId'].to_list()
    test_movie_ids.append(t_m_ids)
    prompt_movie_ids = u_df.head(-10).head(10)['movieId'].to_list()
    prompt_no_oveview = make_prompt(prompt_movie_ids)
    prompt_with_overview = make_prompt(prompt_movie_ids, add_overview=True)
    prompts_no_overview.append(prompt_no_oveview)
    prompts_with_overview.append(prompt_with_overview)

users_df['test_movie_ids'] = test_movie_ids
users_df['prompt_no_overview'] = prompts_no_overview
users_df['prompt_with_overview'] = prompts_with_overview

In [99]:
users_df.head()

Unnamed: 0,userId,test_movie_ids,prompt,prompt_no_overview,prompt_with_overview
0,43,"[48774, 2144, 5500, 538, 1275, 2137, 1297, 310...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
1,736,"[6978, 101525, 7072, 4102, 38304, 80463, 56367...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
2,1307,"[140146, 94864, 69757, 174727, 7991, 1256, 521...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
3,1475,"[356, 441, 2054, 2329, 8636, 68954, 79132, 618...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
4,1541,"[8798, 7320, 73319, 82167, 88179, 7460, 1271, ...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...


In [100]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   userId                1000 non-null   int64 
 1   test_movie_ids        1000 non-null   object
 2   prompt                0 non-null      object
 3   prompt_no_overview    1000 non-null   object
 4   prompt_with_overview  1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [101]:
users_df.to_pickle('prompts/user_df1.pkl')

# Loading models

In [None]:
from proper_embedding.open_source_llm_embedding import OpenSourceLLMEmbeddings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd

In [None]:
model_llama_path = "../llama2_model/llama2_model"
model_llama = AutoModelForCausalLM.from_pretrained(model_llama_path).to('cuda')
tokenizer_llama = AutoTokenizer.from_pretrained(model_llama_path)
model_llama = OpenSourceLLMEmbeddings(
                model=model_llama, 
                tokenizer=tokenizer_llama,
                batch_size=1,
                context_length=2048)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

berttokenizer = AutoTokenizer.from_pretrained('bertmodel')
bertmodel = AutoModel.from_pretrained('bertmodel')

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def embed_func(batch, tokenizer, model):
    # Tokenize sentences
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return [list(sentence_embeddings[0].numpy())]

# Embeddings for overviews

In [None]:
overviews = pd.read_pickle('data/overviews.pkl')

In [66]:
overviews['title_and_overview'] = overviews['original_title'] + ":\n" + overviews['overview']

In [67]:
overviews.head()

Unnamed: 0,movieId,original_title,overview,title_and_overview
0,1,Toy Story,"Led by Woody, Andy's toys live happily in his ...","Toy Story:\nLed by Woody, Andy's toys live hap..."
1,2,Jumanji,When siblings Judy and Peter discover an encha...,Jumanji:\nWhen siblings Judy and Peter discove...
2,3,Grumpier Old Men,A family wedding reignites the ancient feud be...,Grumpier Old Men:\nA family wedding reignites ...
3,4,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","Waiting to Exhale:\nCheated on, mistreated and..."
4,5,Father of the Bride Part II,Just when George Banks has recovered from his ...,Father of the Bride Part II:\nJust when George...


Using Llama2

In [None]:
embeddings = []
for idx, row in overviews.iterrows():
    emb = model_llama.embed_query(row['overview_and_title'])
    embeddings.append(emb)
overviews['embedding_llama'] = embeddings

overviews.to_pickle('embeddings/overviews_embedding_llama.pkl')

Using BERT

In [None]:
embeddings = []
for idx, row in overviews.iterrows():
    emb = embed_func(row['overview_and_title'], berttokenizer, bertmodel)[0]
    embeddings.append(emb)
overviews['embedding_bert'] = embeddings

overviews.to_pickle('embeddings/overviews_embedding_bert.pkl')

# Embeddings for prompts

In [None]:
users_df = pd.read_pickle('prompts/user_df1.pkl')

In [68]:
users_df.head()

Unnamed: 0,userId,test_movie_ids,prompt,prompt_no_overview,prompt_with_overview
0,3,"[1732, 4262, 81834, 106489, 88125, 79091, 8787...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
1,4,"[1136, 5952, 139644, 156706, 53956, 127098, 14...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
2,12,"[44555, 48516, 55820, 41712, 69122, 68237, 715...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
3,13,"[87306, 44665, 34162, 52973, 3210, 54503, 6313...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...
4,19,"[123, 296, 31878, 152081, 106163, 3328, 90057,...",,You are given a user's movie watch history. Ea...,You are given a user's movie watch history. Ea...


Using Llama

In [None]:
embeddings = []
for idx, row in users_df.iterrows():
    emb = model_llama.embed_query(row['prompt_with_overview'])
    embeddings.append(emb)
users_df['embedding_wo_llama'] = embeddings

users_df.to_pickle('embeddings/prompts_wo_embedding_llama.pkl')

In [None]:
embeddings = []
for idx, row in users_df.iterrows():
    emb = model_llama.embed_query(row['prompt_no_overview'])
    embeddings.append(emb)
users_df['embedding_no_llama'] = embeddings

users_df.to_pickle('embeddings/prompts_no_embedding_llama.pkl')

Using BERT

In [None]:
embeddings = []
for idx, row in users_df.iterrows():
    emb = embed_func(row['prompt_with_overview'], berttokenizer, bertmodel)[0]
    embeddings.append(emb)
users_df['embedding_wo_bert'] = embeddings

users_df.to_pickle('embeddings/prompts_wo_embedding_bert.pkl')

In [None]:
embeddings = []
for idx, row in users_df.iterrows():
    emb = embed_func(row['prompt_no_overview'], berttokenizer, bertmodel)[0]
    embeddings.append(emb)
users_df['embedding_no_bert'] = embeddings

users_df.to_pickle('embeddings/prompts_no_embedding_bert.pkl')

# Evaluating

Loading BERT embeddings

In [46]:
prompt_embeddings_bert = pd.read_pickle('embeddings/prompts_no_embedding_bert.pkl')

In [47]:
overview_embeddings_bert = pd.read_pickle('embeddings/overviews_embedding_bert.pkl')

In [49]:
prompt_embeddings_bert.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   userId                1000 non-null   int64 
 1   test_movie_ids        1000 non-null   object
 2   prompt                0 non-null      object
 3   prompt_no_overview    1000 non-null   object
 4   prompt_with_overview  1000 non-null   object
 5   embedding_wo_bert     1000 non-null   object
 6   embedding_no_bert     1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


Setting KNN models

In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

In [56]:
def precision(test_movie_ids, retrieved_ids):
    n = max(len(test_movie_ids), len(retrieved_ids))
    correct = 0
    for idx in retrieved_ids:
        if idx in test_movie_ids:
            correct += 1
    return correct / n

Evaluating BERT embeddings

In [50]:
knn_bert = NearestNeighbors(n_neighbors=10)
knn_bert.fit(overview_embeddings_bert['embedding_bert'].tolist())

In [57]:
s_10 = 0
s_20 = 0
s_50 = 0
for user_id, row in prompt_embeddings_bert.iterrows():
    neighbors_indexes_10 = knn_bert.kneighbors([row['embedding_wo_bert']], n_neighbors=10, return_distance=False)[0]
    neighbors_indexes_20 = knn_bert.kneighbors([row['embedding_wo_bert']], n_neighbors=20, return_distance=False)[0]
    neighbors_indexes_50 = knn_bert.kneighbors([row['embedding_wo_bert']], n_neighbors=50, return_distance=False)[0]
    neighbors_movie_ids_10 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_10].tolist()
    neighbors_movie_ids_20 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_20].tolist()
    neighbors_movie_ids_50 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_50].tolist()
    prec_10 = precision(row['test_movie_ids'], neighbors_movie_ids_10)
    prec_20 = precision(row['test_movie_ids'], neighbors_movie_ids_20)
    prec_50 = precision(row['test_movie_ids'], neighbors_movie_ids_50)
    print(prec_10)
    s_10 += prec_10
    s_20 += prec_20
    s_50 += prec_50

0.0
0.2
0.1
0.1
0.2
0.1
0.0
0.1
0.2
0.2
0.2
0.0
0.2
0.1
0.2
0.2
0.2
0.1
0.1
0.1
0.2
0.1
0.1
0.1
0.0
0.1
0.1
0.0
0.0
0.0
0.2
0.2
0.3
0.1
0.1
0.1
0.2
0.1
0.1
0.1
0.0
0.2
0.0
0.1
0.3
0.3
0.2
0.2
0.1
0.1
0.0
0.4
0.0
0.1
0.2
0.1
0.2
0.0
0.0
0.2
0.1
0.0
0.3
0.0
0.1
0.3
0.0
0.2
0.3
0.2
0.2
0.1
0.1
0.2
0.0
0.1
0.2
0.0
0.2
0.0
0.3
0.0
0.3
0.1
0.1
0.1
0.4
0.2
0.2
0.2
0.2
0.2
0.2
0.1
0.3
0.3
0.1
0.2
0.2
0.1
0.1
0.1
0.2
0.0
0.1
0.3
0.1
0.0
0.3
0.1
0.1
0.3
0.1
0.2
0.1
0.2
0.0
0.3
0.2
0.2
0.2
0.3
0.0
0.0
0.2
0.2
0.2
0.2
0.2
0.2
0.2
0.2
0.2
0.4
0.1
0.2
0.2
0.1
0.2
0.2
0.2
0.2
0.2
0.2
0.3
0.2
0.0
0.2
0.2
0.2
0.2
0.3
0.1
0.4
0.2
0.0
0.2
0.2
0.1
0.1
0.1
0.1
0.3
0.2
0.2
0.0
0.0
0.2
0.2
0.3
0.1
0.2
0.2
0.1
0.0
0.1
0.1
0.0
0.3
0.1
0.3
0.2
0.4
0.2
0.2
0.2
0.1
0.1
0.2
0.1
0.4
0.1
0.0
0.1
0.1
0.1
0.1
0.3
0.1
0.3
0.1
0.1
0.1
0.2
0.2
0.1
0.1
0.2
0.1
0.1
0.1
0.3
0.4
0.2
0.0
0.2
0.1
0.1
0.0
0.2
0.1
0.2
0.0
0.1
0.1
0.0
0.1
0.1
0.1
0.3
0.0
0.3
0.1
0.1
0.1
0.2
0.3
0.2
0.0
0.2
0.1
0.2
0.2
0.1
0.2
0.2
0.2
0.1
0.1
0.2


In [61]:
print(2/((1/(s_10/1000)) + (1/(s_10/1000))))
print(2/((1/(s_20/1000)) + (1/(s_20/1000 * 2))))
print(2/((1/(s_50/1000)) + (1/(s_50/1000 * 5))))

0.15589999999999893
0.12619999999999915
0.08093333333333319


In [62]:
s_10 = 0
s_20 = 0
s_50 = 0
for user_id, row in prompt_embeddings_bert.iterrows():
    neighbors_indexes_10 = knn_bert.kneighbors([row['embedding_no_bert']], n_neighbors=10, return_distance=False)[0]
    neighbors_indexes_20 = knn_bert.kneighbors([row['embedding_no_bert']], n_neighbors=20, return_distance=False)[0]
    neighbors_indexes_50 = knn_bert.kneighbors([row['embedding_no_bert']], n_neighbors=50, return_distance=False)[0]
    neighbors_movie_ids_10 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_10].tolist()
    neighbors_movie_ids_20 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_20].tolist()
    neighbors_movie_ids_50 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_50].tolist()
    prec_10 = precision(row['test_movie_ids'], neighbors_movie_ids_10)
    prec_20 = precision(row['test_movie_ids'], neighbors_movie_ids_20)
    prec_50 = precision(row['test_movie_ids'], neighbors_movie_ids_50)
    s_10 += prec_10
    s_20 += prec_20
    s_50 += prec_50

In [63]:
print(2/((1/(s_10/1000)) + (1/(s_10/1000))))
print(2/((1/(s_20/1000)) + (1/(s_20/1000 * 2))))
print(2/((1/(s_50/1000)) + (1/(s_50/1000 * 5))))

0.01069999999999998
0.010933333333333314
0.011899999999999926


Evaluating Llama2 embeddings

In [74]:
overview_embeddings_llama = pd.read_pickle('embeddings/overviews_embedding_llama.pkl')

In [75]:
overview_embeddings_llama.head()

Unnamed: 0,movieId,original_title,overview,title_and_overview,embedding_llama
0,1,Toy Story,"Led by Woody, Andy's toys live happily in his ...","Toy Story:\nLed by Woody, Andy's toys live hap...","[-0.11505313217639923, -1.5514801740646362, -0..."
1,2,Jumanji,When siblings Judy and Peter discover an encha...,Jumanji:\nWhen siblings Judy and Peter discove...,"[0.7274162769317627, -0.7512491345405579, 0.70..."
2,3,Grumpier Old Men,A family wedding reignites the ancient feud be...,Grumpier Old Men:\nA family wedding reignites ...,"[1.0406371355056763, -1.5488317012786865, -0.2..."
3,4,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","Waiting to Exhale:\nCheated on, mistreated and...","[1.781296730041504, -0.5551576614379883, -0.98..."
4,5,Father of the Bride Part II,Just when George Banks has recovered from his ...,Father of the Bride Part II:\nJust when George...,"[1.4452979564666748, -0.8208339214324951, 0.83..."


In [76]:
prompt_embeddings_llama = pd.read_pickle('embeddings/prompts_no_embedding_llama.pkl')

In [77]:
prompt_embeddings_llama.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   userId                1000 non-null   int64 
 1   test_movie_ids        1000 non-null   object
 2   prompt                0 non-null      object
 3   prompt_no_overview    1000 non-null   object
 4   prompt_with_overview  1000 non-null   object
 5   embedding_wo_llama    1000 non-null   object
 6   embedding_no_llama    1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [78]:
len(overview_embeddings_llama['embedding_llama'][0])

4096

In [83]:
knn_llama = NearestNeighbors(n_neighbors=10)
knn_llama.fit(overview_embeddings_llama['embedding_llama'].tolist())

In [84]:
s_10 = 0
s_20 = 0
s_50 = 0
for user_id, row in prompt_embeddings_llama.iterrows():
    neighbors_indexes_10 = knn_llama.kneighbors([row['embedding_wo_llama']], n_neighbors=10, return_distance=False)[0]
    neighbors_indexes_20 = knn_llama.kneighbors([row['embedding_wo_llama']], n_neighbors=20, return_distance=False)[0]
    neighbors_indexes_50 = knn_llama.kneighbors([row['embedding_wo_llama']], n_neighbors=50, return_distance=False)[0]
    neighbors_movie_ids_10 = overview_embeddings_llama['movieId'].iloc[neighbors_indexes_10].tolist()
    neighbors_movie_ids_20 = overview_embeddings_llama['movieId'].iloc[neighbors_indexes_20].tolist()
    neighbors_movie_ids_50 = overview_embeddings_llama['movieId'].iloc[neighbors_indexes_50].tolist()
    prec_10 = precision(row['test_movie_ids'], neighbors_movie_ids_10)
    prec_20 = precision(row['test_movie_ids'], neighbors_movie_ids_20)
    prec_50 = precision(row['test_movie_ids'], neighbors_movie_ids_50)
    s_10 += prec_10
    s_20 += prec_20
    s_50 += prec_50
print(2/((1/(s_10/1000)) + (1/(s_10/1000))))
print(2/((1/(s_20/1000)) + (1/(s_20/1000 * 2))))
print(2/((1/(s_50/1000)) + (1/(s_50/1000 * 5))))

0.04890000000000034
0.044133333333333545
0.03343333333333296


In [85]:
s_10 = 0
s_20 = 0
s_50 = 0
for user_id, row in prompt_embeddings_llama.iterrows():
    neighbors_indexes_10 = knn_llama.kneighbors([row['embedding_no_llama']], n_neighbors=10, return_distance=False)[0]
    neighbors_indexes_20 = knn_llama.kneighbors([row['embedding_no_llama']], n_neighbors=20, return_distance=False)[0]
    neighbors_indexes_50 = knn_llama.kneighbors([row['embedding_no_llama']], n_neighbors=50, return_distance=False)[0]
    neighbors_movie_ids_10 = overview_embeddings_llama['movieId'].iloc[neighbors_indexes_10].tolist()
    neighbors_movie_ids_20 = overview_embeddings_llama['movieId'].iloc[neighbors_indexes_20].tolist()
    neighbors_movie_ids_50 = overview_embeddings_llama['movieId'].iloc[neighbors_indexes_50].tolist()
    prec_10 = precision(row['test_movie_ids'], neighbors_movie_ids_10)
    prec_20 = precision(row['test_movie_ids'], neighbors_movie_ids_20)
    prec_50 = precision(row['test_movie_ids'], neighbors_movie_ids_50)
    s_10 += prec_10
    s_20 += prec_20
    s_50 += prec_50
print(2/((1/(s_10/1000)) + (1/(s_10/1000))))
print(2/((1/(s_20/1000)) + (1/(s_20/1000 * 2))))
print(2/((1/(s_50/1000)) + (1/(s_50/1000 * 5))))

0.0020000000000000005
0.0022666666666666677
0.0020000000000000013


Evaluating generated prompt embeddings

In [87]:
prompt_gen_bert = pd.read_pickle('embeddings/prompts_generated_embeddings.pkl')

In [88]:
prompt_gen_bert.head()

Unnamed: 0,userId,test_movie_ids,prompt,responses,embedding_bert
0,43,"[48774, 2144, 5500, 538, 1275, 2137, 1297, 310...",,"Based on the movies watched by the user, it is...","[-0.060261574, -0.12186954, -0.061504867, -0.0..."
1,736,"[6978, 101525, 7072, 4102, 38304, 80463, 56367...",,"Based on the movies watched by this user, it i...","[0.013392865, -0.13235573, -0.06622448, -0.037..."
2,1307,"[140146, 94864, 69757, 174727, 7991, 1256, 521...",,Based on the movies listed in the user's watch...,"[-0.07227496, -0.07036866, -0.043054596, 0.018..."
3,1475,"[356, 441, 2054, 2329, 8636, 68954, 79132, 618...",,Based on the movies listed in the user's watch...,"[0.005810787, -0.07620952, -0.002406525, 0.011..."
4,1541,"[8798, 7320, 73319, 82167, 88179, 7460, 1271, ...",,Based on the movies listed in the user's watch...,"[-0.030125745, -0.09987682, -0.014032189, 0.01..."


In [89]:
s_10 = 0
s_20 = 0
s_50 = 0
for user_id, row in prompt_gen_bert.iterrows():
    neighbors_indexes_10 = knn_bert.kneighbors([row['embedding_bert']], n_neighbors=10, return_distance=False)[0]
    neighbors_indexes_20 = knn_bert.kneighbors([row['embedding_bert']], n_neighbors=20, return_distance=False)[0]
    neighbors_indexes_50 = knn_bert.kneighbors([row['embedding_bert']], n_neighbors=50, return_distance=False)[0]
    neighbors_movie_ids_10 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_10].tolist()
    neighbors_movie_ids_20 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_20].tolist()
    neighbors_movie_ids_50 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_50].tolist()
    prec_10 = precision(row['test_movie_ids'], neighbors_movie_ids_10)
    prec_20 = precision(row['test_movie_ids'], neighbors_movie_ids_20)
    prec_50 = precision(row['test_movie_ids'], neighbors_movie_ids_50)
    s_10 += prec_10
    s_20 += prec_20
    s_50 += prec_50

In [90]:
print(2/((1/(s_10/1000)) + (1/(s_10/1000))))
print(2/((1/(s_20/1000)) + (1/(s_20/1000 * 2))))
print(2/((1/(s_50/1000)) + (1/(s_50/1000 * 5))))

0.028500000000000074
0.029600000000000157
0.02323333333333309


In [91]:
embedding_summaries_gen = pd.read_pickle('embeddings/prompts_ls_embedding.pkl')

In [92]:
embedding_summaries_gen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   userId                        1000 non-null   int64 
 1   test_movie_ids                1000 non-null   object
 2   prompt                        0 non-null      object
 3   responses                     1000 non-null   object
 4   embedding_bert                1000 non-null   object
 5   embedding_bert_list_summary   1000 non-null   object
 6   embedding_bert_list_summary2  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [93]:
s_10 = 0
s_20 = 0
s_50 = 0
for user_id, row in embedding_summaries_gen.iterrows():
    neighbors_indexes_10 = knn_bert.kneighbors([row['embedding_bert_list_summary']], n_neighbors=10, return_distance=False)[0]
    neighbors_indexes_20 = knn_bert.kneighbors([row['embedding_bert_list_summary']], n_neighbors=20, return_distance=False)[0]
    neighbors_indexes_50 = knn_bert.kneighbors([row['embedding_bert_list_summary']], n_neighbors=50, return_distance=False)[0]
    neighbors_movie_ids_10 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_10].tolist()
    neighbors_movie_ids_20 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_20].tolist()
    neighbors_movie_ids_50 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_50].tolist()
    prec_10 = precision(row['test_movie_ids'], neighbors_movie_ids_10)
    prec_20 = precision(row['test_movie_ids'], neighbors_movie_ids_20)
    prec_50 = precision(row['test_movie_ids'], neighbors_movie_ids_50)
    s_10 += prec_10
    s_20 += prec_20
    s_50 += prec_50
print(2/((1/(s_10/1000)) + (1/(s_10/1000))))
print(2/((1/(s_20/1000)) + (1/(s_20/1000 * 2))))
print(2/((1/(s_50/1000)) + (1/(s_50/1000 * 5))))

0.02460000000000004
0.025066666666666765
0.02219999999999976


In [94]:
s_10 = 0
s_20 = 0
s_50 = 0
for user_id, row in embedding_summaries_gen.iterrows():
    neighbors_indexes_10 = knn_bert.kneighbors([row['embedding_bert_list_summary2']], n_neighbors=10, return_distance=False)[0]
    neighbors_indexes_20 = knn_bert.kneighbors([row['embedding_bert_list_summary2']], n_neighbors=20, return_distance=False)[0]
    neighbors_indexes_50 = knn_bert.kneighbors([row['embedding_bert_list_summary2']], n_neighbors=50, return_distance=False)[0]
    neighbors_movie_ids_10 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_10].tolist()
    neighbors_movie_ids_20 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_20].tolist()
    neighbors_movie_ids_50 = overview_embeddings_bert['movieId'].iloc[neighbors_indexes_50].tolist()
    prec_10 = precision(row['test_movie_ids'], neighbors_movie_ids_10)
    prec_20 = precision(row['test_movie_ids'], neighbors_movie_ids_20)
    prec_50 = precision(row['test_movie_ids'], neighbors_movie_ids_50)
    s_10 += prec_10
    s_20 += prec_20
    s_50 += prec_50
print(2/((1/(s_10/1000)) + (1/(s_10/1000))))
print(2/((1/(s_20/1000)) + (1/(s_20/1000 * 2))))
print(2/((1/(s_50/1000)) + (1/(s_50/1000 * 5))))

0.15589999999999893
0.12619999999999915
0.08093333333333319


# Colaborative filtering

In [19]:
import pandas as pd

In [20]:
watches = pd.read_pickle('data/watches.pkl')

In [21]:
watches.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263302 entries, 24 to 24999963
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     263302 non-null  int64  
 1   movieId    263302 non-null  int64  
 2   rating     263302 non-null  float64
 3   timestamp  263302 non-null  int64  
 4   title      263302 non-null  object 
 5   genres     263302 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 14.1+ MB


In [26]:
movies_list = watches['movieId'].unique()
users_list = watches['userId'].unique()
CF_df = pd.DataFrame(columns=movies_list)

In [31]:
user_watch_dict = dict()
for user in users_list:
    user_watched = watches[watches['userId'] == user]['movieId'].unique()
    user_watch_dict[user] = [x in user_watched for x in movies_list]

In [32]:
df_dict = dict()
for i, movie in enumerate(movies_list):
    v = []
    for user in users_list:
        v.append(user_watch_dict[user][i])
    df_dict[movie] = v

In [34]:
CF_df = pd.DataFrame(df_dict)
CF_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 12923 entries, 296 to 168810
dtypes: bool(12923)
memory usage: 12.3 MB


In [35]:
CF_df.head()

Unnamed: 0,296,306,307,665,899,1088,1175,1217,1237,1250,...,146499,157777,95425,155992,79104,155996,141436,146475,146540,168810
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [38]:
from scipy import spatial

In [44]:
recalls = []
for i, user1 in enumerate(users_list):
    tmi = embedding_summaries_gen.iloc[i]['test_movie_ids']
    ds = dict()
    for j, user2 in enumerate(users_list):
        if i == j:
            continue
        v1 = CF_df.drop(columns=tmi).iloc[i].tolist()
        v2 = CF_df.drop(columns=tmi).iloc[j].tolist()
        d = spatial.distance.cosine(v1, v2)
        ds[j] = d
    r = 0
    for mi in tmi:
        s = 0
        for j, user2 in enumerate(users_list):
            if i == j:
                continue
            s += ds[j] * CF_df.iloc[j][mi]
        s /= sum(ds.values())
        if s > 0.5:
            r += 1
    recalls.append(r/len(tmi))
    print(r/len(tmi))

0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.1
0.1
0.0
0.1
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.1
0.0
0.0
0.1
0.1
0.0
0.0
0.0
0.1
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.1
0.1
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.2
0.1
0.1
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.1
0.2
0.1
0.0
0.1
0.0
0.2
0.1
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.1
0.0
0.0
0.0
0.3
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.0
0.0
0.1
0.0
0.0
0.1
0.0
0.0
0.0
0.1
0.0
0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.1
0.2
0.0
0.1
0.0
0.0
0.0
0.0
0.0
0.0


In [45]:
print(sum(recalls)/len(recalls))

0.02710000000000005
