In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import ndcg_score
from collections import defaultdict
import ast
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from numpy import where


In [2]:
# Load the datasets
small = pd.read_csv('data/small_matrix.csv')
big = pd.read_csv('data/big_matrix.csv')
categories = pd.read_csv('data/item_categories.csv')
item_daily_features = pd.read_csv('data/item_daily_features.csv')
caption = pd.read_csv('data/kuairec_caption_category.csv', encoding='utf-8',
    na_values=[],
    keep_default_na=False,
    on_bad_lines='skip',   # skip problematic rows
    engine='python')
# user = pd.read_csv('data/user_features.csv')
# social = pd.read_csv('data/social_network.csv')

For this project i decided to use content based filtering to create my recommendation model. Therefore i'll not use the user and social dataset that give features for user (more adequate for a collaborative filtering)

## 1) Data Analysis

### small_matrix & big_matrix

In [None]:
# Small matrix represents all the interactions between users and items

### item_daily_features



📹 Video Metadata

   video_id: Unique identifier for the video.

   date: Date of the statistics in YYYYMMDD format.

   author_id: Identifier of the video's author.

   video_type: Type of the video, e.g., "NORMAL" or "AD".

   upload_dt: Date when the video was uploaded (YYYY-MM-DD).

   upload_type: Method of upload, e.g., "ShortImport", "LongImport", or "ShortCamera".

   visible_status: Visibility status of the video, such as "public".

   video_duration: Duration of the video in milliseconds.

   video_width: Width of the video in pixels.

   video_height: Height of the video in pixels.

   music_id: Identifier for the background music used in the video.

   video_tag_id: Identifier for the video's tag.

   video_tag_name: Name of the tag associated with the video.
   KuaiSAR+1ar5iv+1
   KuaiSAR+2GitHub+2KuaiRec+2

👁️ Exposure and Playback Metrics

   show_cnt: Number of times the video was shown to users on that day.

   show_user_num: Number of unique users who were shown the video.

   play_cnt: Total number of times the video was played.

   play_user_num: Number of unique users who played the video.

   play_duration: Total time (in milliseconds) the video was played.

   complete_play_cnt: Number of times the video was played to completion.

   complete_play_user_num: Number of unique users who played the video to completion.

   valid_play_cnt: Number of valid plays based on duration thresholds.

   valid_play_user_num: Number of unique users with valid plays.

   long_time_play_cnt: Number of long-duration plays based on specific criteria.

   long_time_play_user_num: Number of unique users with long-duration plays.

   short_time_play_cnt: Number of short-duration plays.

   short_time_play_user_num: Number of unique users with short-duration plays.

   play_progress: Average play progress ratio, calculated as play_duration / video_duration.
   GitHub+1KuaiRec+1

💬 Engagement Metrics

   comment_stay_duration: Total time users spent in the comments section.

   like_cnt: Total number of likes the video received.

   like_user_num: Number of unique users who liked the video.

   click_like_cnt: Number of likes resulting from double-clicks.

   double_click_cnt: Number of double-click interactions on the video.

   cancel_like_cnt: Number of likes that were canceled.

   cancel_like_user_num: Number of unique users who canceled their likes.

   comment_cnt: Total number of comments made on the video.

   comment_user_num: Number of unique users who commented.

   direct_comment_cnt: Number of direct (top-level) comments.

   reply_comment_cnt: Number of reply (nested) comments.

   delete_comment_cnt: Number of comments that were deleted.

   delete_comment_user_num: Number of unique users who deleted their comments.

   comment_like_cnt: Number of likes on comments.

   comment_like_user_num: Number of unique users who liked comments.
   GitHub

👥 Social Interaction Metrics

   follow_cnt: Number of new follows generated from the video.

   follow_user_num: Number of unique users who followed the author due to the video.

   cancel_follow_cnt: Number of unfollows resulting from the video.

   cancel_follow_user_num: Number of unique users who unfollowed the author due to the video.
   GitHub

📤 Sharing and Feedback Metrics

   share_cnt: Number of times the video was shared.

   share_user_num: Number of unique users who shared the video.

   download_cnt: Number of times the video was downloaded.

   download_user_num: Number of unique users who downloaded the video.

   report_cnt: Number of times the video was reported.

   report_user_num: Number of unique users who reported the video.

   reduce_similar_cnt: Number of times users chose to reduce similar content.

   reduce_similar_user_num: Number of unique users who opted to reduce similar content.

   collect_cnt: Number of times the video was added to favorites.

   collect_user_num: Number of unique users who added the video to favorites.

   cancel_collect_cnt: Number of times the video was removed from favorites.

   cancel_collect_user_num: Number of unique users who removed the video from favorites.

In [None]:
# Data analysis of item_daily_features


### Categories

### Kuairec caption

In [4]:
video_0 = item_daily_features[item_daily_features['video_id'] == 0]
video_0 = video_0.sort_values(by='date', ascending=False)
video_0_stat = video_0[['date', 'play_cnt', 'show_cnt', 'play_duration', 'complete_play_cnt', 'valid_play_cnt']] 
video_0_stat

Unnamed: 0,date,play_cnt,show_cnt,play_duration,complete_play_cnt,valid_play_cnt
62,20200905,2213,3710,19547072,1230,1199
61,20200904,1601,2468,14158046,953,930
60,20200903,1200,1650,9749977,682,667
59,20200902,1313,1876,10796387,704,685
58,20200901,1523,2172,13514121,796,771
...,...,...,...,...,...,...
4,20200709,5392,8502,46952744,3058,2946
3,20200708,5172,8916,45281254,2950,2865
2,20200707,4757,7842,41338741,2734,2640
1,20200706,7321,10883,64264607,4162,4039


2) Feature Engineering

In [5]:
# video represent by their caption
caption['video_id'] = caption['video_id'].astype(str)
# drop all rows with video_ids that are not digits
caption = caption[caption['video_id'].str.isdigit()]
# convert video_id to int64
caption['video_id'] = caption['video_id'].astype(np.int64)
categories['video_id'] = categories['video_id'].astype(np.int64)
item_daily_features['video_id'] = item_daily_features['video_id'].astype(np.int64)

In [6]:
# videos represent by their metadata
video_metadata = item_daily_features[['video_id', 'author_id', 'video_type', 'upload_dt', 'visible_status', 'video_duration', 'video_width', 'video_height', 'music_id']] # maybe add upload_type
video_metadata = video_metadata.drop_duplicates(subset=['video_id'])

# Convert video type to int meaning if it is AD it is 0 and if it is NORMAL 1
video_metadata['video_type'] = video_metadata['video_type'].apply(lambda x: 1 if x == 'NORMAL' else 0)
video_metadata['visible_status'] = video_metadata['visible_status'].apply(lambda x: 1 if x == 'public' else 0)
video_metadata['upload_dt'] = pd.to_datetime(video_metadata['upload_dt'], format='%Y-%m-%d')
video_metadata['upload_dt'] = video_metadata['upload_dt'].astype('int64') // 10**9
video_metadata.dropna(inplace=True)
video_metadata.reset_index(drop=True, inplace=True)
video_metadata

Unnamed: 0,video_id,author_id,video_type,upload_dt,visible_status,video_duration,video_width,video_height,music_id
0,0,3309,1,1585526400,1,5966.0,720,1280,3350323409
1,2,939,1,1586563200,1,8000.0,720,1280,0
2,4,4284,1,1586649600,1,18000.0,720,1280,3442844592
3,5,3483,1,1586908800,1,8000.0,720,720,2457773462
4,6,725,1,1586908800,1,6000.0,720,1280,3393422084
...,...,...,...,...,...,...,...,...,...
10433,10723,236,1,1599264000,1,4833.0,720,1280,4428603493
10434,10724,5271,1,1599264000,1,54720.0,720,1280,1090207430
10435,10725,1924,1,1599264000,1,15800.0,576,1024,4429406509
10436,10726,7604,1,1599264000,1,5132.0,528,960,68154


In [7]:
common_video_ids = set(caption['video_id']).intersection(set(categories['video_id']))
common_video_ids = set(common_video_ids).intersection(set(video_metadata['video_id']))
# print(len(common_video_ids))
common_video_ids = list(common_video_ids)
common_video_ids
# Get datatypes of the items in common_video_ids

[0,
 2,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 31,
 32,
 33,
 34,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,

In [8]:
# video represents by their stats
videos_stats_intersting = item_daily_features[['video_id', 'play_cnt', 'show_cnt', 'play_duration', 'complete_play_cnt', 'valid_play_cnt', 'like_cnt', 'comment_stay_duration']]
videos_stats_intersting = videos_stats_intersting[videos_stats_intersting['video_id'].isin(common_video_ids)]
videos_stats_intersting
videos_stats = videos_stats_intersting.groupby('video_id').agg({
    'play_cnt': 'sum',
    'show_cnt': 'sum',
    'play_duration': 'sum',
    'complete_play_cnt': 'sum',
    'valid_play_cnt': 'sum',
    'like_cnt': 'sum',
    'comment_stay_duration': 'sum'
}).reset_index()
videos_stats

Unnamed: 0,video_id,play_cnt,show_cnt,play_duration,complete_play_cnt,valid_play_cnt,like_cnt,comment_stay_duration
0,0,411691,805839,3672586209,231018,224751,24493,435345336
1,2,670248,650377,8644331149,453815,474834,3180,25088506
2,4,610,825,7506745,186,280,3,308977
3,5,17522,36435,194709010,9064,10025,1909,53262940
4,6,1745,9651,10085581,589,566,54,513511
...,...,...,...,...,...,...,...,...
10433,10723,214,277,1681908,117,114,24,337534
10434,10724,965,1100,56090732,535,754,264,1249884
10435,10725,15487,16996,323787284,8149,9317,851,1963153
10436,10726,7859,7644,128835301,5480,5382,44,192695


In [9]:
# videos represent by their categories
categories = categories[categories['video_id'].isin(common_video_ids)]
for i in range (31):
    categories['category_' + str(i)] = 0

categories['feat'] = categories['feat'].apply(ast.literal_eval)
for index, row in categories.iterrows():
    # Get the list of features for the current row
    features = row['feat']

    # Set the corresponding category columns to 1
    for feat in features:
        col_name = f'category_{feat}'
        if col_name in categories.columns:
            categories.at[index, col_name] = 1
categories.drop(columns=['feat'], inplace=True)
categories

Unnamed: 0,video_id,category_0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,category_8,...,category_21,category_22,category_23,category_24,category_25,category_26,category_27,category_28,category_29,category_30
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723,10723,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10724,10724,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10725,10725,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10726,10726,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
caption = caption[caption['video_id'].isin(common_video_ids)]

def combine_text_fields(row):
    fields = [
        row['manual_cover_text'],
        row['caption'],
        row['topic_tag'],
        row['first_level_category_name'],
        row['second_level_category_name'],
        row['third_level_category_name']
    ]
    return ' '.join([str(f) for f in fields if f and f != 'UNKNOWN' and f != '[]'])

caption['text'] = caption.apply(combine_text_fields, axis=1)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(caption['text'])
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 66201 stored elements and shape (10438, 28280)>

In [11]:
def generate_top_n_recommendations_from_logs(interactions_df, cosine_sim, video_ids, N=10, min_watch_ratio=0.5):
    """
    Generate top-N recommendations based on user-video interactions and cosine similarity.
    """
    # Filter for meaningful interactions
    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]

    # Build user -> set of watched video_ids
    user_histories = filtered.groupby('user_id')['video_id'].apply(set).to_dict()

    recommendations = {}

    for user, watched in user_histories.items():
        # print(user, watched)
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not watched_indices:
            recommendations[user] = []
            continue

        # Sum similarities of all watched videos
        sim_scores = np.sum(cosine_sim[watched_indices], axis=0)

        # Set similarity of already watched videos to -1
        for v in watched:
            if v in video_ids:
                sim_scores[video_ids.get_loc(v)] = -1

        top_indices = np.argsort(sim_scores)[::-1][:N]
        top_videos = video_ids[top_indices].tolist()
        recommendations[user] = top_videos

    return recommendations

def generate_top_n_recommendations_from_logs_sp(interactions_df, cosine_sim, video_ids, N=10, min_watch_ratio=0.5):
    """
    Generate top-N recommendations based on user-video interactions and cosine similarity.
    """
    # Filter for meaningful interactions
    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]

    # Build user -> set of watched video_ids
    user_histories = filtered.groupby('user_id')['video_id'].apply(set).to_dict()

    recommendations = {}

    for user, watched in user_histories.items():
        # print(user, watched)
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not watched_indices:
            recommendations[user] = []
            continue

        # Sum similarities of all watched videos
        sim_scores = np.array(np.sum(cosine_sim[watched_indices], axis=0)).ravel()

        # Set similarity of already watched videos to -1
        for v in watched:
            if v in video_ids:
                sim_scores[video_ids.get_loc(v)] = -1

        top_indices = np.argsort(sim_scores)[::-1][:N]
        top_videos = video_ids[top_indices].tolist()
        recommendations[user] = top_videos

    return recommendations

In [12]:
def prepare_test_ground_truth(interactions_df, min_watch_ratio=0.7):
    """
    Return user -> set of relevant video_ids from test interactions.
    """
    filtered = interactions_df[interactions_df['watch_ratio'] >= min_watch_ratio]
    return filtered.groupby('user_id')['video_id'].apply(set).to_dict()

In [18]:
train_df, test_df = big, small
video_ids_train = pd.Index(train_df['video_id'].unique())
# Only keep video_ids that are in caption, metadata and categories
video_ids_train = video_ids_train[video_ids_train.isin(caption['video_id']) & video_ids_train.isin(video_metadata['video_id']) & video_ids_train.isin(categories['video_id'])]
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)
train_df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)
train_df = train_df[train_df["timestamp"] >= 0]
test_df = test_df[test_df["timestamp"] >= 0]

In [15]:
def hit_rate_log(recommendations, test_ground_truth):
    hits, total = 0, 0
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        hits += len(set(recs) & true_items)
        total += len(true_items)
    return hits / total if total else 0

def precision_at_k_log(recommendations, test_ground_truth, k=10):
    precisions = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        hits = len(set(recs[:k]) & true_items)
        precisions.append(hits / k)
    return np.mean(precisions) if precisions else 0

def ndcg_at_k_log(recommendations, test_ground_truth, k=10):
    ndcgs = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        y_true = [1 if vid in true_items else 0 for vid in recs[:k]]
        y_score = list(range(k, 0, -1))
        ndcgs.append(ndcg_score([y_true], [y_score]))
    return np.mean(ndcgs) if ndcgs else 0

def mrr_log(recommendations, test_ground_truth):
    rr = []
    for user, recs in recommendations.items():
        true_items = test_ground_truth.get(user, set())
        if not true_items:
            continue
        for rank, vid in enumerate(recs, 1):
            if vid in true_items:
                rr.append(1 / rank)
                break
        else:
            rr.append(0)
    return np.mean(rr) if rr else 0

def serendipity_log(recommendations, train_ground_truth, cosine_sim, video_ids, k=10):
    serendipities = []
    for user, recs in recommendations.items():
        watched = train_ground_truth.get(user, set())
        if not watched:
            continue
        rec_indices = [video_ids.get_loc(v) for v in recs[:k] if v in video_ids]
        watched_indices = [video_ids.get_loc(v) for v in watched if v in video_ids]
        if not rec_indices or not watched_indices:
            continue
        avg_sim = np.mean([cosine_sim[i, j] for i in watched_indices for j in rec_indices])
        serendipities.append(1 - avg_sim)
    return np.mean(serendipities) if serendipities else 0

In [15]:
cosine_sim_caption = cosine_similarity(tfidf_matrix, tfidf_matrix)

N = 10
recs_train_caption = generate_top_n_recommendations_from_logs(
    interactions_df=train_df,
    cosine_sim=cosine_sim_caption,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.5
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.5
)

hit_rate = hit_rate_log(recs_train_caption, test_truth)
prec  = precision_at_k_log(recs_train_caption, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_caption, test_truth, k=N)
mrr = mrr_log(recs_train_caption, test_truth)
serendipity = serendipity_log(recs_train_caption, test_truth, cosine_sim_caption, video_ids_train, k=N)
print(f"Hit Rate@{N} for caption sim: {hit_rate:.4f}")
print(f"Precision@{N} for caption sim: {prec:.4f}")
print(f"NDCG@{N} for caption sim: {ndcg:.4f}")
print(f"MRR@{N} for caption sim: {mrr:.4f}")
print(f"Serendipity@{N} for caption sim: {serendipity:.4f}")

Hit Rate@10 for caption sim: 0.0008
Precision@10 for caption sim: 0.1875
NDCG@10 for caption sim: 0.5239
MRR@10 for caption sim: 0.4260
Serendipity@10 for caption sim: 0.9713


Hit Rate@10 for caption sim: 0.0008
Precision@10 for caption sim: 0.1875
NDCG@10 for caption sim: 0.5239
MRR@10 for caption sim: 0.4260
Serendipity@10 for caption sim: 0.9713

In [20]:
cosine_sim_category = cosine_similarity(categories.iloc[:, 1:])

N = 10
recs_train_category = generate_top_n_recommendations_from_logs(
    interactions_df=train_df,
    cosine_sim=cosine_sim_category,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.5
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.5
)

hit_rate = hit_rate_log(recs_train_category, test_truth)
prec  = precision_at_k_log(recs_train_category, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_category, test_truth, k=N)
mrr = mrr_log(recs_train_category, test_truth)
serendipity = serendipity_log(recs_train_category, test_truth, cosine_sim_category, video_ids_train, k=N)
print(f"Hit Rate@{N} for category sim: {hit_rate:.4f}")
print(f"Precision@{N} for category sim: {prec:.4f}")
print(f"NDCG@{N} for category sim: {ndcg:.4f}")
print(f"MRR@{N} for category sim: {mrr:.4f}")
print(f"Serendipity@{N} for category sim: {serendipity:.4f}")

Hit Rate@10 for category sim: 0.0011
Precision@10 for category sim: 0.2483
NDCG@10 for category sim: 0.5708
MRR@10 for category sim: 0.4892
Serendipity@10 for category sim: 0.7950


Hit Rate@10 for category sim: 0.0011
Precision@10 for category sim: 0.2483
NDCG@10 for category sim: 0.5708
MRR@10 for category sim: 0.4892
Serendipity@10 for category sim: 0.7950

In [None]:
cosine_sim_metadata = cosine_similarity(video_metadata.iloc[:, 1:])

N = 10
recs_train_metadata = generate_top_n_recommendations_from_logs(
    interactions_df=train_df,
    cosine_sim=cosine_sim_metadata,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.75
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.75
)

hit_rate = hit_rate_log(recs_train_metadata, test_truth)
prec  = precision_at_k_log(recs_train_metadata, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_metadata, test_truth, k=N)
mrr = mrr_log(recs_train_metadata, test_truth)
serendipity = serendipity_log(recs_train_metadata, test_truth, cosine_sim_metadata, video_ids_train, k=N)
print(f"Hit Rate@{N} for metadata sim: {hit_rate:.4f}")
print(f"Precision@{N} for metadata sim: {prec:.4f}")
print(f"NDCG@{N} for metadata sim: {ndcg:.4f}")
print(f"MRR@{N} for metadata sim: {mrr:.4f}")
print(f"Serendipity@{N} for metadata sim: {serendipity:.4f}")

In [None]:
# Merge metadata + categories (both as DataFrames)
video_features = pd.merge(categories, video_metadata, on='video_id', how='inner')
# Drop video_id temporarily
video_ids = video_features['video_id'].values
video_features_no_id = video_features.drop(columns=['video_id'])

# Scale metadata numerics
scaler = StandardScaler()
dense_features_scaled = scaler.fit_transform(video_features_no_id)

id_pos = caption['video_id'].reset_index(drop=True)
indices = [id_pos[id_pos == vid].index[0] for vid in video_ids]
tfidf_aligned = tfidf_matrix[indices]

dense_sparse = csr_matrix(dense_features_scaled)  # convert dense to sparse
combined_matrix = hstack([dense_sparse * 2.00, tfidf_aligned * 1.00])  # sparse + sparse = sparse

In [None]:
cosine_sim_merged = cosine_similarity(combined_matrix, dense_output=False)  # still sparse

N = 10
recs_train_merged = generate_top_n_recommendations_from_logs(
    interactions_df=train_df,
    cosine_sim=cosine_sim_merged,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.75
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.75
)

hit_rate = hit_rate_log(recs_train_merged, test_truth)
prec  = precision_at_k_log(recs_train_merged, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_merged, test_truth, k=N)
mrr = mrr_log(recs_train_merged, test_truth)
# serendipity = serendipity_log(recs_train_caption, test_truth, cosine_sim_merged, video_ids_train, k=N)
print(f"Hit Rate@{N} for caption sim: {hit_rate:.4f}")
print(f"Precision@{N} for caption sim: {prec:.4f}")
print(f"NDCG@{N} for caption sim: {ndcg:.4f}")
print(f"MRR@{N} for caption sim: {mrr:.4f}")
# print(f"Serendipity@{N} for caption sim: {serendipity:.4f}")

In [42]:
# Merge metadata + categories (both as DataFrames)
video_features = pd.merge(categories, video_metadata, on='video_id', how='inner')
cosine_sim_cat_meta = cosine_similarity(video_features.iloc[:, 1:])

In [None]:
N = 10
recs_train_cat_meta = generate_top_n_recommendations_from_logs(
    interactions_df=train_df,
    cosine_sim=cosine_sim_cat_meta,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.75
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.75
)

hit_rate = hit_rate_log(recs_train_cat_meta, test_truth)
prec  = precision_at_k_log(recs_train_cat_meta, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_cat_meta, test_truth, k=N)
mrr = mrr_log(recs_train_cat_meta, test_truth)
# serendipity = serendipity_log(recs_train_caption, test_truth, cosine_sim_merged, video_ids_train, k=N)
print(f"Hit Rate@{N} for caption sim: {hit_rate:.4f}")
print(f"Precision@{N} for caption sim: {prec:.4f}")
print(f"NDCG@{N} for caption sim: {ndcg:.4f}")
print(f"MRR@{N} for caption sim: {mrr:.4f}")
# print(f"Serendipity@{N} for caption sim: {serendipity:.4f}")

In [21]:
cosine_sim_stats = cosine_similarity(videos_stats[['play_cnt', 'show_cnt', 'play_duration', 'complete_play_cnt', 'valid_play_cnt', 'like_cnt', 'comment_stay_duration']])

N = 10
recs_train_stats = generate_top_n_recommendations_from_logs(
    interactions_df=train_df,
    cosine_sim=cosine_sim_stats,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.75
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.75
)

hit_rate = hit_rate_log(recs_train_stats, test_truth)
prec  = precision_at_k_log(recs_train_stats, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_stats, test_truth, k=N)
mrr = mrr_log(recs_train_stats, test_truth)
#serendipity = serendipity_log(recs_train_stats, test_truth, cosine_sim_stats, video_ids_train, k=N)
print(f"Hit Rate@{N} for stats sim: {hit_rate:.4f}")
print(f"Precision@{N} for stats sim: {prec:.4f}")
print(f"NDCG@{N} for stats sim: {ndcg:.4f}")
print(f"MRR@{N} for stats sim: {mrr:.4f}")
#print(f"Serendipity@{N} for caption sim: {serendipity:.4f}")

Hit Rate@10 for stats sim: 0.0010
Precision@10 for stats sim: 0.1585
NDCG@10 for stats sim: 0.4285
MRR@10 for stats sim: 0.3099


Hit Rate@10 for stats sim: 0.0010
Precision@10 for stats sim: 0.1585
NDCG@10 for stats sim: 0.4285
MRR@10 for stats sim: 0.3099

In [19]:
video_stats_categories = pd.merge(videos_stats, categories, on='video_id', how='inner')
video_stats_categories = video_stats_categories.drop(columns=['video_id'])

cosine_sim_stats_categories = cosine_similarity(video_stats_categories)
N = 10
recs_train_stats_categories = generate_top_n_recommendations_from_logs(
    interactions_df=train_df,
    cosine_sim=cosine_sim_stats_categories,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.75
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.75
)
hit_rate = hit_rate_log(recs_train_stats_categories, test_truth)
prec  = precision_at_k_log(recs_train_stats_categories, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_stats_categories, test_truth, k=N)
mrr = mrr_log(recs_train_stats_categories, test_truth)
# serendipity = serendipity_log(recs_train_caption, test_truth, cosine_sim_caption, video_ids_train, k=N)
print(f"Hit Rate@{N} for stats & categories sim: {hit_rate:.4f}")
print(f"Precision@{N} for caption & categories sim: {prec:.4f}")
print(f"NDCG@{N} for caption & categories sim: {ndcg:.4f}")
print(f"MRR@{N} for caption & categories sim: {mrr:.4f}")
# print(f"Serendipity@{N} for caption sim: {serendipity:.4f}")


Hit Rate@10 for stats & categories sim: 0.0010
Precision@10 for caption & categories sim: 0.1585
NDCG@10 for caption & categories sim: 0.4285
MRR@10 for caption & categories sim: 0.3099


Hit Rate@10 for stats & categories sim: 0.0010
Precision@10 for caption & categories sim: 0.1585
NDCG@10 for caption & categories sim: 0.4285
MRR@10 for caption & categories sim: 0.3099

In [28]:
# video_meta = video_metadata[['video_id', 'author_id', 'video_type', 'video_duration']]
# video_features = video_meta.merge(categories, on='video_id', how='inner')
video_features = categories

video_ids = video_features['video_id'].values
video_features_no_id = video_features.drop(columns=['video_id'])

# Scale metadata numerics
scaler = StandardScaler()
dense_features_scaled = scaler.fit_transform(video_features_no_id)

id_pos = caption['video_id'].reset_index(drop=True)
indices = [id_pos[id_pos == vid].index[0] for vid in video_ids]
tfidf_aligned = tfidf_matrix[indices]

dense_sparse = csr_matrix(dense_features_scaled)  # convert dense to sparse
combined_matrix = hstack([dense_sparse * 10.00, tfidf_aligned * 1.00])  # sparse + sparse = sparse

In [32]:
cosine_sim_custom = cosine_similarity(combined_matrix, dense_output=False)  # still sparse

N = 10
recs_train_merged = generate_top_n_recommendations_from_logs_sp(
    interactions_df=train_df,
    cosine_sim=cosine_sim_custom,
    video_ids=video_ids_train,
    N=N,
    min_watch_ratio=0.7
)

test_truth = prepare_test_ground_truth(
    interactions_df=test_df,
    min_watch_ratio=0.7
)

hit_rate = hit_rate_log(recs_train_merged, test_truth)
prec  = precision_at_k_log(recs_train_merged, test_truth, k=N)
ndcg = ndcg_at_k_log(recs_train_merged, test_truth, k=N)
mrr = mrr_log(recs_train_merged, test_truth)
# serendipity = serendipity_log(recs_train_caption, test_truth, cosine_sim_merged, video_ids_train, k=N)
print(f"Hit Rate@{N} for merged sim: {hit_rate:.4f}")
print(f"Precision@{N} for merged sim: {prec:.4f}")
print(f"NDCG@{N} for merged sim: {ndcg:.4f}")
print(f"MRR@{N} for merged sim: {mrr:.4f}")
# print(f"Serendipity@{N} for caption sim: {serendipity:.4f}")

Hit Rate@10 for merged sim: 0.0007
Precision@10 for merged sim: 0.1215
NDCG@10 for merged sim: 0.3208
MRR@10 for merged sim: 0.2041


Hit Rate@10 for merged sim: 0.0009
Precision@10 for merged sim: 0.1442
NDCG@10 for merged sim: 0.4265
MRR@10 for merged sim: 0.3400