In [1]:
import pandas as pd
import numpy as np
from src.models.old_content_models.content_handler_v1 import ContentHandler
from src.models.content_handler import ContentHandler as ContentHandler2
from src.models.content_evaluation import split_user_interactions, evaluate_hit_rate, evaluate_intra_list_diversity
from src.pipeline.data_processor import DataProcessor, CleanedData
from src.models.content_models import ContentModel

In [2]:
# Load the cleaned bookmark interactions and the shiur catalogue held by the content model.
dp = DataProcessor()
bookmarks_df = dp.load_table(CleanedData.BOOKMARKS)
cm = ContentModel()
shiur_df = cm.shiur_df

# Attach shiur details and embeddings to each bookmark row. The inner join keeps
# only bookmarks whose 'shiur' value also appears in shiur_df.
user_listens_df = bookmarks_df.merge(
    shiur_df[['shiur', 'full_details', 'embedding']], on='shiur', how='inner'
)

# Keep rows that were played or that sit in the queue.
is_listen = (user_listens_df['played'] == 1) | (user_listens_df['bookmark'] == 'queue')

# .copy() makes the filtered frame independent of the merged frame, so adding the
# 'date' column below is an unambiguous write and does not raise
# pandas' SettingWithCopyWarning.
user_listens_df = user_listens_df.loc[is_listen].copy()

# Use the play date where present, otherwise fall back to the queue date.
user_listens_df['date'] = user_listens_df['date_played'].combine_first(
    user_listens_df['queue_date'])


2024-07-15 00:00:57,928 - root - INFO - DataProcessor instance created
2024-07-15 00:00:57,929 - root - INFO - Loading data from: bookmarks_cleaned
2024-07-15 00:01:13,204 - root - INFO - DataProcessor instance created
2024-07-15 00:01:13,205 - root - INFO - Loading data from: shiurim_cleaned
2024-07-15 00:01:14,958 - gensim.utils - INFO - loading Word2Vec object from /Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model
2024-07-15 00:01:14,975 - gensim.utils - INFO - loading wv recursively from /Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model.wv.* with mmap=None
2024-07-15 00:01:14,976 - gensim.utils - INFO - setting ignored attribute cum_table to None
2024-07-15 00:01:15,077 - gensim.utils - INFO - Word2Vec lifecycle event {'fname': '/Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model', 'datetime': '2024-07-15T00:01:15.0

In [3]:
# Number of interaction rows per user, largest first (the value_counts defaults,
# spelled out explicitly here).
per_user = user_listens_df['user']
interaction_counts = per_user.value_counts(sort=True, ascending=False)
interaction_counts

user
1109      11028
5448      10455
1002       9354
1053       8955
35236      8642
          ...  
48700         1
206641        1
66133         1
89471         1
198635        1
Name: count, Length: 52452, dtype: int64

In [4]:
# Quartiles (25th / 50th / 75th percentile) of the per-user interaction counts.
quartile_levels = [25, 50, 75]
percentiles = np.percentile(interaction_counts, q=quartile_levels)
percentiles

array([ 3.,  8., 36.])

In [5]:
# Split the interaction log into a training frame and a test frame.
# The meaning of interaction_threshold=4 is defined inside split_user_interactions
# (not visible in this notebook) — presumably a per-user interaction-count cutoff;
# TODO confirm against src.models.content_evaluation.
train_df, test_df = split_user_interactions(user_listens_df, interaction_threshold=4)


In [6]:
# Restrict evaluation to users who have between 36 and 75 rows (inclusive) in the test split.
# NOTE(review): 36 equals the 75th-percentile value printed earlier in the notebook,
# but that percentile was computed over all interactions, not over test_df — confirm
# this is the intended lower bound.
user_counts = test_df.groupby('user').size()
in_band = user_counts.between(36, 75)
valid_users = user_counts[in_band].index
filtered_test_df = test_df.loc[test_df['user'].isin(valid_users)]


In [7]:

# Build the v1 handler (ContentHandler from src.models.old_content_models.content_handler_v1)
# from the training split only, so test interactions are not passed to it.
content_handler = ContentHandler(train_df)


2024-07-15 00:02:03,321 - root - INFO - DataProcessor instance created
2024-07-15 00:02:03,322 - root - INFO - Loading data from: bookmarks_cleaned
2024-07-15 00:02:18,863 - root - INFO - Loading data from: shiurim_cleaned
2024-07-15 00:02:22,633 - gensim.utils - INFO - loading Word2Vec object from /Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model
2024-07-15 00:02:22,661 - gensim.utils - INFO - loading wv recursively from /Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model.wv.* with mmap=None
2024-07-15 00:02:22,663 - gensim.utils - INFO - setting ignored attribute cum_table to None
2024-07-15 00:02:22,769 - gensim.utils - INFO - Word2Vec lifecycle event {'fname': '/Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model', 'datetime': '2024-07-15T00:02:22.769374', 'gensim': '4.3.2', 'python': '3.12.3 (main, Apr  9 2024, 08:09:

In [8]:
# Build the current handler (src.models.content_handler.ContentHandler, imported as
# ContentHandler2) from the same training split, so both handlers are compared on equal input.
content_handler2 = ContentHandler2(train_df)


2024-07-15 00:03:44,605 - root - INFO - DataProcessor instance created
2024-07-15 00:03:44,606 - root - INFO - Loading data from: shiurim_cleaned
2024-07-15 00:03:46,476 - gensim.utils - INFO - loading Word2Vec object from /Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model
2024-07-15 00:03:46,500 - gensim.utils - INFO - loading wv recursively from /Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model.wv.* with mmap=None
2024-07-15 00:03:46,501 - gensim.utils - INFO - setting ignored attribute cum_table to None
2024-07-15 00:03:46,599 - gensim.utils - INFO - Word2Vec lifecycle event {'fname': '/Users/jeremywizenfeld/Desktop/Torah-Navigator/src/models/saved_models/content_filtering/word2vec_v1.model', 'datetime': '2024-07-15T00:03:46.599261', 'gensim': '4.3.2', 'python': '3.12.3 (main, Apr  9 2024, 08:09:14) [Clang 15.0.0 (clang-1500.3.9.4)]', 'platform': 'macOS-14.4.1-arm64-arm-6

In [9]:
# Score both handlers on the same filtered test users, asking each for the same
# number of recommendations.
# NOTE(review): the saved output of this cell ends in "KeyError: 'embedding'" and the
# execution counter jumps from 9 to 13 afterwards, so the output shown may predate the
# current cell contents — re-run from a fresh kernel to confirm it completes.
TOP_K = 10
hit_rate1, user_hits1, recommendations1 = evaluate_hit_rate(
    filtered_test_df, content_handler, k=TOP_K)
hit_rate2, user_hits2, recommendations2 = evaluate_hit_rate(
    filtered_test_df, content_handler2, k=TOP_K)

# Intra-list diversity of the v1 handler's recommendation lists.
diversity1 = evaluate_intra_list_diversity(
    recommendations1, content_handler.shiur_embeddings)


2024-07-15 00:04:10,142 - root - INFO - Total users in test set: 2171
2024-07-15 00:04:10,804 - root - INFO - Evaluating user 1/2171 (ID: 1021):
2024-07-15 00:04:10,813 - root - INFO -   Recommendations: [895552, 832095, 831321, 732663, 832096, 788226, 717798, 870918, 791783]
2024-07-15 00:04:10,824 - root - INFO -   Relevant items: [728215, 726198, 726198, 968941, 968926, 968758, 970292, 894377, 943332, 813778, 780580, 953864, 976282, 976240, 976209, 976209, 976209, 976326, 976504, 976504, 978060, 982457, 713733, 731244, 755108, 985958, 947964, 994028, 993936, 995345, 910352, 999809, 999809, 1017176, 1021843, 1021843, 1022028, 993502, 1028111, 1027573, 1027430, 1027430, 1027430, 1029288, 926169, 768323]
2024-07-15 00:04:10,835 - root - INFO -   Hit: 0
2024-07-15 00:04:11,252 - root - INFO - Evaluating user 2/2171 (ID: 1049):
2024-07-15 00:04:11,254 - root - INFO -   Recommendations: [908346, 717798, 819586, 879158, 851926, 805171, 901429, 885586, 775788, 1050873]
2024-07-15 00:04:11,2

KeyError: 'embedding'

In [13]:
# Intra-list diversity of the second handler's recommendation lists.
# NOTE(review): the v1 call passes `content_handler.shiur_embeddings` as the second
# argument, whereas this call passes the content model's `shiur_df` — confirm that
# evaluate_intra_list_diversity accepts both forms and that the two scores are comparable.
diversity2 = evaluate_intra_list_diversity(recommendations2, content_handler2.content_models.shiur_df)

2024-07-15 00:36:50,632 - root - INFO - Average Intra-list Diversity: 0.08560068905353546


In [14]:
# Side-by-side summary of the two handlers; one "<label>: <value>" line per metric.
metric_report = [
    ("Hit Rate (Attention)", hit_rate1),
    ("Hit Rate (AutoEncoder/Clustering)", hit_rate2),
    ("Diversity (Attention)", diversity1),
    ("Diversity (AutoEncoder/Clustering)", diversity2),
]
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value}")

Hit Rate (Attention): 0.06817134960847536
Hit Rate (AutoEncoder/Clustering): 0.06448641179180101
Diversity (Attention): 0.05656706914305687
Diversity (AutoEncoder/Clustering): 0.08560068905353546


In [15]:
# How many users each handler scored a hit for, then the users hit by both.
shared_hit_users = set(user_hits1) & set(user_hits2)
print(len(user_hits1), len(user_hits2), sep="\n")
print(shared_hit_users)

148
140
{203777, 82952, 78863, 56853, 6686, 85024, 210976, 74277, 63532, 35887, 37433, 25159, 37459, 1112, 15455, 58987, 205938, 88179, 12916, 75381, 71287, 175226, 42112, 204422, 81037, 49297, 25753, 5786, 37021, 8350, 52922, 202944, 62145, 87236, 54985, 54989, 61646, 71900, 218346, 40173, 59118, 62703, 220947, 1823, 52514, 83747, 91946, 48442, 204604, 76098, 212290, 204111, 50515, 57176, 64856, 82789, 18793, 44905, 82796, 59249, 45945, 87931, 21886, 13182, 88448, 52618, 203660, 64397, 1428, 53657, 61856, 81327, 219062, 210364, 70594, 72643, 60875, 208336, 17361, 42471, 48104, 40447}


In [16]:
def get_user_shiurs(user_id, df):
    """Return the 'full_details' values of every row in `df` belonging to one user.

    Args:
        user_id: Value to match against the 'user' column.
        df: DataFrame with at least 'user' and 'full_details' columns.

    Returns:
        A list of 'full_details' values in row order (duplicates kept); an empty
        list when no row matches.
    """
    belongs_to_user = df['user'] == user_id
    return df.loc[belongs_to_user, 'full_details'].tolist()

In [26]:
# Inspect a single user: 6686 is one of the users listed in the hit overlap of both handlers.
user_id = 6686
N_RECS = 7

# Titles this user has in the training split and in the test split.
train, test = (get_user_shiurs(user_id, split_df) for split_df in (train_df, test_df))

# Recommendations from the second handler; the printed values are the dict's values.
recommend = content_handler2.recommend_for_user_content(user_id, N_RECS)

print("Recommendations")
for rec_details in recommend.values():
    print(rec_details)


Recommendations
Title Ten Minute Halacha  March Madness Betting on Basketball Speaker Lebowitz Category Halacha
Title Ten Minute Halacha  Hagomel When You Werent Aware of the Danger Speaker Lebowitz Category Halacha
Title Ten Minute Halacha  Couples Living Together After Their Kesubos Were Lost in The Fires in Israel Speaker Lebowitz Category Halacha
Title Ten Minute Halacha  Which Way to Face When the Aron is Not in Mizrach Speaker Lebowitz Category Halacha
Title Ten Minute Halacha  Going to Work on Purim Speaker Lebowitz Category Halacha
Title Ten Minute Halacha  Learning on Nittel Nacht Speaker Lebowitz Category Halacha
Title Ten Minute Halacha  Thanksgiving Celebrations Speaker Lebowitz Category Halacha


In [27]:
# Header followed by one training-split title per line.
print("Train", *train, sep="\n")


Train
Title Chayei Sara 1 The Purchase of the Three Center Cities of Israel Speaker Luban Category Parsha
Title Vayeira 4 The Conflict with Avimelech and the Plishtim Speaker Luban Category Parsha
Title Vayeira 3 Developing Harmony Between Chessed and Gevurah Speaker Luban Category Machshava
Title Vayeira 2 Yitzchak the Middah of Gevurah and Mesiras Nefesh Speaker Luban Category Parsha
Title Vayeira 1 The Structure of the Parsha Speaker Luban Category Parsha
Title Lech Lecha 5 The Connection Between Brit Milah and Eretz Yisrael Speaker Luban Category Parsha
Title Lech Lecha 4 The Role of Yishmael in Bereishit and History Speaker Luban Category Parsha
Title Lech Lecha 3 Avraham and the Seven Sefirot Speaker Luban Category Parsha
Title Lech Lecha 2 The Daled Malchuyot  The Four Stages of Exile Speaker Luban Category Parsha
Title Lech Lecha 1 Maaseh Avot Siman LBanim  A History of Exile and Redemption Speaker Luban Category Parsha
Title Noach 4 The Role of the 70 Nations Speaker Luban Cat

In [28]:
# Header followed by one test-split title per line.
print("Test", *test, sep="\n")


Test
Title A Tale of Three Teshuvos Yonah perakim 1 3 4 Speaker Cohen Category Gemara
Title Ovadiah Intro Speaker Trump Category Nach
Title Ovadiah 1 Speaker Trump Category Nach
Title Yonah Intro Speaker Trump Category Nach
Title Yonah 1 Speaker Trump Category Nach
Title Yonah 2 Speaker Trump Category Nach
Title Yonah 3 Speaker Trump Category Nach
Title Yonah 4 Speaker Trump Category Nach
Title Beshalach 2 Kriyat Yam Suf  Seeing with Clarity at the Sea Speaker Charnoff Category Parsha
Title Beshalach 2 Kriyat Yam Suf  Seeing with Clarity at the Sea Speaker Charnoff Category Parsha
Title Beshalach 3 The Eternal Battle with Amalek Speaker Charnoff Category Parsha
Title Beshalach 3 The Eternal Battle with Amalek Speaker Charnoff Category Parsha
Title Leshem 1 Speaker Rosenfeld Category Machshava
Title Ten Minute Halacha  HoldingCarrying a Sefer Torah Speaker Lebowitz Category Halacha
Title Ten Minute Halacha  The World Cant Exist Without Yisrael Speaker Lebowitz Category Halacha
Title Ten

In [29]:
# Recommended entries whose text also appears among the user's test-split titles.
overlap = [rec_details for rec_details in recommend.values() if rec_details in test]
print("Overlap", *overlap, sep="\n")




Overlap
