In [1]:
import pandas as pd
import numpy as np
from src.models.content_handler_v1 import ContentHandler
from src.models.content_handler_v2 import ContentHandler as ContentHandler2
from src.models.content_evaluation import split_user_interactions, evaluate_hit_rate, evaluate_intra_list_diversity
from src.pipeline.data_processor import DataProcessor, CleanedData

In [2]:
# Load the cleaned bookmark and shiur tables.
dp = DataProcessor()
bookmarks_df = dp.load_table(CleanedData.BOOKMARKS)
shiur_df = dp.load_table(CleanedData.SHIURIM)

# Attach each shiur's `full_details` to its bookmark rows, keep only rows that
# were played or sit in the queue, and derive a single `date` column that takes
# `date_played` where present and falls back to `queue_date` otherwise.
user_listens_df = (
    bookmarks_df
    .merge(shiur_df[['shiur', 'full_details']], on='shiur', how='inner')
    .loc[lambda rows: (rows['played'] == 1) | (rows['bookmark'] == 'queue')]
    .assign(date=lambda rows: rows['date_played'].combine_first(rows['queue_date']))
)


2024-07-11 22:25:30,512 - root - INFO - DataProcessor instance created
2024-07-11 22:25:30,512 - root - INFO - Loading data from: bookmarks_cleaned
2024-07-11 22:25:30,555 - root - INFO - Loading data from: shiurim_cleaned


In [3]:
# Distribution of interactions: number of rows per user, largest counts first
# (value_counts sorts in descending order by default).
interaction_counts = user_listens_df.loc[:, 'user'].value_counts()
interaction_counts

user
196334    184
66892     134
5448      132
49823      83
222432     81
         ... 
72804       1
72886       1
72981       1
73169       1
224577      1
Name: count, Length: 1748, dtype: int64

In [4]:
# 25th, 50th and 75th percentiles of the per-user interaction counts.
percentiles = np.percentile(a=interaction_counts, q=[25, 50, 75])
percentiles

array([1., 2., 5.])

In [5]:
# Partition the interactions into a training frame and a test frame.
# interaction_threshold=4 is passed straight through to split_user_interactions.
train_df, test_df = split_user_interactions(
    user_listens_df,
    interaction_threshold=4,
)

# Build both content-handler versions from the training frame only.
content_handler = ContentHandler(
    train_df,
)
content_handler2 = ContentHandler2(
    train_df,
)


2024-07-11 22:25:31,376 - root - INFO - DataProcessor instance created
2024-07-11 22:25:31,377 - root - INFO - Loading data from: bookmarks_cleaned
2024-07-11 22:25:31,419 - root - INFO - Loading data from: shiurim_cleaned
2024-07-11 22:25:31,678 - gensim.utils - INFO - loading Word2Vec object from ./saved_models/content_filtering/word2vec_titles_v1.model
2024-07-11 22:25:31,690 - gensim.utils - INFO - loading wv recursively from ./saved_models/content_filtering/word2vec_titles_v1.model.wv.* with mmap=None
2024-07-11 22:25:31,691 - gensim.utils - INFO - setting ignored attribute cum_table to None
2024-07-11 22:25:31,776 - gensim.utils - INFO - Word2Vec lifecycle event {'fname': './saved_models/content_filtering/word2vec_titles_v1.model', 'datetime': '2024-07-11T22:25:31.776555', 'gensim': '4.3.2', 'python': '3.12.3 (main, Apr  9 2024, 08:09:14) [Clang 15.0.0 (clang-1500.3.9.4)]', 'platform': 'macOS-14.4.1-arm64-arm-64bit', 'event': 'loaded'}
2024-07-11 22:25:33,560 - root - INFO - Data

In [6]:
# Evaluate both handlers on the test frame with k=10. Each evaluate_hit_rate
# call is unpacked into (hit rate, user hits, recommendations).
# NOTE(review): the recorded output of this cell ends with
# "ValueError: Incompatible dimension for X and Y matrices" (64 vs 200 columns),
# and every later cell has no execution count. The truncated traceback does not
# show which of the four calls below raised - confirm before trusting any
# downstream result.
hit_rate1, user_hits1, recommendations1 = evaluate_hit_rate(test_df, content_handler, k=10)
hit_rate2, user_hits2, recommendations2 = evaluate_hit_rate(test_df, content_handler2, k=10)
# Intra-list diversity of each handler's recommendations, computed against that
# same handler's shiur_embeddings.
diversity1 = evaluate_intra_list_diversity(recommendations1, content_handler.shiur_embeddings)
diversity2 = evaluate_intra_list_diversity(recommendations2, content_handler2.shiur_embeddings)

2024-07-11 22:25:38,785 - root - INFO - Total users in test set: 485
2024-07-11 22:25:38,889 - root - INFO - Evaluating user 1/485 (ID: 986):
2024-07-11 22:25:38,891 - root - INFO -   Recommendations: []
2024-07-11 22:25:38,892 - root - INFO -   Relevant items: [1098994]
2024-07-11 22:25:38,893 - root - INFO -   No recommendations or no relevant items for user 986
2024-07-11 22:25:39,065 - root - INFO - Evaluating user 2/485 (ID: 1053):
2024-07-11 22:25:39,078 - root - INFO -   Recommendations: [1094807]
2024-07-11 22:25:39,081 - root - INFO -   Relevant items: [1099756, 1099756, 1099756, 1099721, 1099721, 1099721, 1099730, 1099595, 1099878, 1100022]
2024-07-11 22:25:39,084 - root - INFO -   Hit: 0
2024-07-11 22:25:39,177 - root - INFO - Evaluating user 3/485 (ID: 1058):
2024-07-11 22:25:39,179 - root - INFO -   Recommendations: [1099742, 1099743, 1099516, 1096883]
2024-07-11 22:25:39,189 - root - INFO -   Relevant items: [1099545]
2024-07-11 22:25:39,191 - root - INFO -   Hit: 0
2024-

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 64 while Y.shape[1] == 200

In [None]:
# Report the metrics from the evaluation cell: the *1 values come from
# content_handler (printed as "Attention") and the *2 values from
# content_handler2 (printed as "AutoEncoder/Clustering").
print(f"Hit Rate (Attention): {hit_rate1}")
print(f"Hit Rate (AutoEncoder/Clustering): {hit_rate2}")
print(f"Diversity (Attention): {diversity1}")
print(f"Diversity (AutoEncoder/Clustering): {diversity2}")

In [None]:
# Size of each handler's user_hits collection, followed by the members that
# appear in both.
print(len(user_hits1))
print(len(user_hits2))
print(set(user_hits1) & set(user_hits2))

In [None]:
def get_user_shiurs(user_id, df: pd.DataFrame) -> list:
    """Return the `full_details` value of every row in `df` that belongs to `user_id`.

    Args:
        user_id: Value compared for equality against the `user` column.
        df: Frame that has at least the `user` and `full_details` columns.

    Returns:
        The matching `full_details` values as a plain list, in row order.
        The list is empty when no row matches `user_id`.
    """
    # A single .loc call selects rows and column together, which avoids chained
    # indexing (no intermediate all-column frame) and the identity comprehension.
    return df.loc[df['user'] == user_id, 'full_details'].tolist()

In [None]:
# Inspect one user: compare what content_handler recommends with the entries
# this user has in the train and test frames.
user_id = 64041
train = get_user_shiurs(user_id, train_df)
test = get_user_shiurs(user_id, test_df)
recommend = content_handler.recommend_for_user_content(user_id, 7)

# Print every labelled group, one entry per line.
for heading, entries in (
    ("Recommendations", recommend.values()),
    ("Test", test),
    ("Train", train),
):
    print(heading)
    for entry in entries:
        print(entry)

# Recommended entries that are also present in the user's test entries.
print("Overlap")
for entry in recommend.values():
    if entry in test:
        print(entry)


