In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import ndcg_score
from collections import defaultdict

In [35]:
# Load the video metadata (captions, tags and category labels).
# NA detection is switched off so no cell is converted to NaN, and rows the
# Python parser cannot read are skipped instead of aborting the load.
caption = pd.read_csv(
    'data/kuairec_caption_category.csv',
    encoding='utf-8',
    engine='python',
    on_bad_lines='skip',
    keep_default_na=False,
    na_values=[],
)

# Load the user-video interaction log used for the evaluation.
interactions = pd.read_csv('data/small_matrix.csv')

In [None]:
# Row count before cleaning.
print(len(caption))
caption['video_id'] = caption['video_id'].astype(str)
# Drop every row whose video_id is not purely digits. The explicit .copy()
# makes the result an independent frame, so the column assignment below does
# not trigger pandas' chained-assignment (SettingWithCopy) warning.
caption = caption[caption['video_id'].str.isdigit()].copy()
# Row count after cleaning, so the number of dropped rows is visible.
print(len(caption))
# Store the ids as int64.
caption['video_id'] = caption['video_id'].astype(np.int64)
caption.dtypes


10732
10728


video_id                        int64
manual_cover_text              object
caption                        object
topic_tag                      object
first_level_category_id       float64
first_level_category_name      object
second_level_category_id      float64
second_level_category_name     object
third_level_category_id       float64
third_level_category_name      object
dtype: object

In [26]:
# Open the caption frame in D-Tale for interactive, manual inspection.
# Exploratory only: no later cell shown in this notebook reads anything
# produced here.
import dtale
dtale.show(caption)



In [37]:
def combine_text_fields(row):
    """
    Join a video's descriptive text columns into one space-separated string.

    Reads the cover text, caption, topic tag and the three category-name
    columns from `row`, in that order. Falsy values and the placeholder
    strings 'UNKNOWN' and '[]' are left out; everything else is converted
    with str() before joining.
    """
    columns = (
        'manual_cover_text',
        'caption',
        'topic_tag',
        'first_level_category_name',
        'second_level_category_name',
        'third_level_category_name',
    )
    values = [row[column] for column in columns]

    kept = []
    for value in values:
        # Skip empty values and the two placeholder markers.
        if not value or value == 'UNKNOWN' or value == '[]':
            continue
        kept.append(str(value))
    return ' '.join(kept)

# Build one text document per caption row from its descriptive columns.
caption['text'] = caption.apply(combine_text_fields, axis=1)

# Fit a TF-IDF model (default settings) on those documents; tfidf_matrix has
# one row per caption row, in the same order as the frame.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(caption['text'])

In [38]:
# Pairwise cosine similarity between all TF-IDF rows: entry [i, j] compares
# row i of tfidf_matrix with row j.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [44]:
def generate_top_n_recommendations_from_logs(interactions_df, cosine_sim, video_ids, N=10, min_watch_ratio=0.5):
    """
    Generate top-N content-based recommendations for every user in a log.

    A user's profile is the set of videos they watched with
    ``watch_ratio >= min_watch_ratio``. Every catalogue video is scored by
    the sum of its similarities to the profile videos, and the best-scoring
    videos the user has not already watched are returned.

    Parameters
    ----------
    interactions_df : pandas.DataFrame
        Interaction log with ``user_id``, ``video_id`` and ``watch_ratio``
        columns.
    cosine_sim : numpy.ndarray
        Square similarity matrix; row/column ``i`` must correspond to
        ``video_ids[i]``.
    video_ids : pandas.Index
        Unique catalogue video ids, aligned with ``cosine_sim``.
    N : int
        Maximum number of recommendations per user.
    min_watch_ratio : float
        Minimum watch ratio for an interaction to count as "watched".

    Returns
    -------
    dict
        ``user_id -> list of video ids``, best first. The list is empty when
        none of the user's watched videos are in ``video_ids`` and may hold
        fewer than N ids when fewer than N unwatched videos exist. Watched
        videos are never returned.
    """
    # Keep only meaningful interactions.
    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]

    # user -> set of watched video ids
    user_histories = filtered.groupby('user_id')['video_id'].apply(set).to_dict()

    recommendations = {}

    for user, watched in user_histories.items():
        # Catalogue positions of the watched videos (ids outside the
        # catalogue are ignored).
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not watched_indices:
            recommendations[user] = []
            continue

        # Aggregate similarity of every catalogue video to the user's profile.
        # Cast to float so the -inf mask below is always representable.
        sim_scores = np.asarray(np.sum(cosine_sim[watched_indices], axis=0), dtype=float)

        # Push watched videos to the very end of the ranking ...
        sim_scores[watched_indices] = -np.inf
        ranked = np.argsort(sim_scores)[::-1]

        # ... and drop them explicitly, so they cannot leak into the result
        # when fewer than N unwatched candidates are available.
        already_watched = np.zeros(len(sim_scores), dtype=bool)
        already_watched[watched_indices] = True
        top_indices = ranked[~already_watched[ranked]][:N]

        recommendations[user] = video_ids[top_indices].tolist()

    return recommendations

In [40]:
# Catalogue of video ids that have caption/category metadata.
video_ids = pd.Index(caption['video_id'].unique())

# Positions in video_ids are later used as row/column positions of the
# similarity matrix built from the caption rows, so the two must line up:
# one caption row per video id.
assert len(video_ids) == len(caption), "duplicate video_id rows in caption: video_ids would not align with the similarity matrix"

# Report interacted videos that have no metadata. A single vectorised
# membership test replaces a Python-level loop over every interaction row,
# and each missing id is reported once instead of once per interaction.
missing_video_ids = interactions.loc[~interactions['video_id'].isin(video_ids), 'video_id'].unique()
for missing_id in missing_video_ids:
    print(f"Video ID {missing_id} not found in caption dataset.")

KeyboardInterrupt: 

In [41]:
def temporal_train_test_split(df, frac=0.8):
    """
    Split interactions into train/test per user, in chronological order.

    For each user the rows are sorted by ``timestamp``; the first
    ``int(len(rows) * frac)`` rows go to train and the remaining rows to test.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log with ``user_id`` and ``timestamp`` columns.
    frac : float
        Fraction of each user's interactions assigned to train.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames with the same columns as ``df``. Both are empty
        when ``df`` contains no user groups.
    """
    train_rows, test_rows = [], []
    for _, group in df.groupby('user_id'):
        # Stable sort: rows sharing a timestamp keep their original order, so
        # the split does not depend on the sort algorithm's tie handling.
        group = group.sort_values('timestamp', kind='stable')
        cutoff = int(len(group) * frac)
        train_rows.append(group.iloc[:cutoff])
        test_rows.append(group.iloc[cutoff:])

    if not train_rows:
        # pd.concat raises on an empty list; return empty frames that keep
        # the input's columns instead.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()
    return pd.concat(train_rows), pd.concat(test_rows)

# Per user: the earliest 80% of interactions become train, the rest test.
train_df, test_df = temporal_train_test_split(interactions, frac=0.8)
# Display the training split.
train_df

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1.593898e+09,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1.593898e+09,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1.593898e+09,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1.593898e+09,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1.593899e+09,0.418364
...,...,...,...,...,...,...,...,...
4675902,7162,4690,11934,8592,2020-08-16 11:17:13.921,20200816.0,1.597548e+09,1.388966
4675903,7162,3032,16694,8534,2020-08-16 11:28:19.423,20200816.0,1.597548e+09,1.956175
4675904,7162,3021,14675,9262,2020-08-16 11:30:16.603,20200816.0,1.597549e+09,1.584431
4675905,7162,1353,16690,14034,2020-08-16 11:50:30.337,20200816.0,1.597550e+09,1.189255


In [45]:
# Number of recommendations per user.
N = 10

# Recommend from the training interactions only; the test interactions are
# kept aside for evaluation.
recs_train = generate_top_n_recommendations_from_logs(
    train_df, cosine_sim, video_ids, N=N, min_watch_ratio=0.5
)

In [46]:
def prepare_test_ground_truth(interactions_df, video_ids, min_watch_ratio=0.5):
    """
    Build the relevance sets used for evaluation.

    Keeps the interactions whose ``watch_ratio`` is at least
    ``min_watch_ratio`` and returns a dict mapping each remaining user_id to
    the set of video ids they interacted with.

    ``video_ids`` is accepted but not used by this function.
    """
    is_relevant = interactions_df['watch_ratio'] >= min_watch_ratio
    relevant_rows = interactions_df[is_relevant]
    videos_by_user = relevant_rows.groupby('user_id')['video_id']
    return videos_by_user.apply(set).to_dict()

# Relevant videos per user, taken from the held-out test interactions.
test_truth = prepare_test_ground_truth(test_df, video_ids, min_watch_ratio=0.5)

In [47]:
def hit_rate_log(recommendations, test_ground_truth):
    """
    Share of relevant test items that were recommended, pooled over users.

    For every user in `recommendations`, counts the recommended videos that
    are in the user's ground-truth set and divides the total count by the
    total number of ground-truth items of those users. Returns 0 when there
    are no ground-truth items at all.
    """
    matched = 0
    relevant = 0
    for user in recommendations:
        truth = test_ground_truth.get(user, set())
        matched += len(set(recommendations[user]) & truth)
        relevant += len(truth)

    if not relevant:
        return 0
    return matched / relevant

def precision_at_k_log(recommendations, test_ground_truth, k=10):
    """
    Mean precision@k over users that have ground-truth items.

    A user's precision is the number of their first k recommendations found
    in their ground-truth set, divided by k. Users without ground truth are
    skipped; returns 0 when no user remains.
    """
    per_user = [
        len(set(recs[:k]) & test_ground_truth[user]) / k
        for user, recs in recommendations.items()
        # .get() is falsy both for unknown users and for empty truth sets.
        if test_ground_truth.get(user)
    ]
    if not per_user:
        return 0
    return np.mean(per_user)

def ndcg_at_k_log(recommendations, test_ground_truth, k=10):
    """
    Mean NDCG over users that have ground-truth items.

    For each user the first k recommendations get binary relevance (1 if the
    video is in the user's ground-truth set, else 0) and descending scores
    that encode the recommendation order; the two are passed to
    ``ndcg_score``. Users without ground truth are skipped; returns 0 when no
    user remains.

    Lists shorter than k are supported: the score vector is sized to the
    actual list, and lists with fewer than two items are scored directly
    (1.0 for a single relevant item, otherwise 0.0) because ``ndcg_score``
    cannot be applied to them.
    """
    ndcgs = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        y_true = [1 if vid in true_items else 0 for vid in recs[:k]]
        if len(y_true) < 2:
            # Empty list -> 0.0; a one-item list is perfect iff that item is
            # relevant. Handled here instead of calling ndcg_score.
            ndcgs.append(float(sum(y_true)))
            continue
        # Scores must have the same length as y_true, which can be below k.
        y_score = list(range(len(y_true), 0, -1))
        ndcgs.append(ndcg_score([y_true], [y_score]))
    return np.mean(ndcgs) if ndcgs else 0

def mrr_log(recommendations, test_ground_truth):
    """
    Mean reciprocal rank over users that have ground-truth items.

    A user's reciprocal rank is 1 / (1-based position of the first
    recommended video found in their ground-truth set), or 0 when none of the
    recommendations is relevant. Users without ground truth are skipped;
    returns 0 when no user remains.
    """
    reciprocal_ranks = []
    for user, recs in recommendations.items():
        relevant = test_ground_truth.get(user, set())
        if not relevant:
            continue
        # Position of the first relevant recommendation, if there is one.
        first_hit = next(
            (position for position, vid in enumerate(recs, 1) if vid in relevant),
            None,
        )
        if first_hit is None:
            reciprocal_ranks.append(0)
        else:
            reciprocal_ranks.append(1 / first_hit)

    if not reciprocal_ranks:
        return 0
    return np.mean(reciprocal_ranks)

def serendipity_log(recommendations, train_ground_truth, cosine_sim, video_ids, k=10):
    """
    Mean serendipity of the recommendations over users.

    A user's serendipity is ``1 - avg_sim``, where ``avg_sim`` is the mean
    similarity over every (watched video, recommended video) pair, using the
    user's first k recommendations and their videos in `train_ground_truth`.
    Videos that are not in `video_ids` are ignored. Users with no watched
    videos, or with no usable pair, are skipped; returns 0 when no user
    remains.

    Parameters
    ----------
    recommendations : dict
        user_id -> ordered list of recommended video ids.
    train_ground_truth : dict
        user_id -> set of video ids the user watched.
    cosine_sim : numpy.ndarray
        Square similarity matrix; row/column ``i`` must correspond to
        ``video_ids[i]``.
    video_ids : pandas.Index
        Unique catalogue video ids, aligned with ``cosine_sim``.
    k : int
        Number of leading recommendations to evaluate.
    """
    serendipities = []
    for user, recs in recommendations.items():
        watched = train_ground_truth.get(user, set())
        if not watched:
            continue
        rec_indices = [video_ids.get_loc(v) for v in recs[:k] if v in video_ids]
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not rec_indices or not watched_indices:
            continue
        # Mean over the watched x recommended sub-matrix in one vectorised
        # step instead of a Python loop over every pair.
        avg_sim = cosine_sim[np.ix_(watched_indices, rec_indices)].mean()
        serendipities.append(1 - avg_sim)
    return np.mean(serendipities) if serendipities else 0

In [None]:
# Score the train-based recommendations against the held-out test relevance
# sets and report precision at N.
prec  = precision_at_k_log(recs_train, test_truth, k=N)
print(f"Precision@{N}: {prec:.4f}")

Precision@10: 0.0561


2025-05-15 14:26:32,651 - INFO     - Executing shutdown due to inactivity...
2025-05-15 14:26:32,661 - INFO     - Executing shutdown...
2025-05-15 14:26:32,662 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer
