In [1]:
from datetime import datetime
from scipy.sparse import csr_matrix
from collections import defaultdict
from src.utils.RecNN import RecNN

import numpy as np
import pandas as pd
import os
import torch
import pickle

### Load the models and datasets

In [2]:
# Length of every per-user recommendation list generated in this notebook.
K = 10

# Input locations, relative to the notebook's working directory.
processed_path = "../data/processed/"
model_path = "../models/"

# Processed tables: the four CSVs are read in the order listed and unpacked
# into the matching names on the left-hand side.
interactions_train, interactions_test, user_features, video_metadata = (
    pd.read_csv(os.path.join(processed_path, csv_name))
    for csv_name in (
        "interactions_train.csv",
        "interactions_test.csv",
        "user_features.csv",
        "video_metadata.csv",
    )
)

# Neural network: load the saved weights onto the CPU and copy them into a
# freshly constructed RecNN.
nn_weights_file = os.path.join(model_path, "nn_model.pth")
model_nn_state_dict = torch.load(nn_weights_file, map_location=torch.device('cpu'))
model_nn = RecNN()
model_nn.load_state_dict(model_nn_state_dict)

# ALS model, stored as a pickle.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files that were produced by this project.
als_file = os.path.join(model_path, 'als_model.pkl')
with open(als_file, 'rb') as als_handle:
    model_als = pickle.load(als_handle)

# Precomputed similarity matrix consumed by the content-based and hybrid recommenders.
similarity_matrix = np.load(os.path.join(model_path, 'similarity_matrix.npy'))

  from .autonotebook import tqdm as notebook_tqdm


### Recommendations ALS Model

In [3]:
# Sparse training matrix in (data, (row, col)) form: rows are indexed by
# user_id, columns by video_id, and the stored value is the watch_ratio.
# No explicit shape is passed, so it is derived from the ids present.
interaction_matrix = csr_matrix(
    (
        interactions_train['watch_ratio'],
        (
            interactions_train['user_id'],
            interactions_train['video_id'],
        ),
    )
)

def recommend_als(model, user_ids, video_ids, top_n=5, user_items=None):
    """Ask the ALS model for the top ``top_n`` videos of every known user.

    Parameters
    ----------
    model
        Object exposing ``recommend(user_id, user_row, N=..., filter_already_liked_items=...)``.
    user_ids
        Iterable of user ids. Ids outside ``[0, user_items.shape[0])`` are
        skipped, so those users get no entry in the result.
    video_ids
        Not used by this function; kept so existing callers keep working.
    top_n
        Number of recommendations requested per user.
    user_items
        Sparse user x video matrix whose row ``user_id`` is handed to the
        model. Defaults to the notebook-level ``interaction_matrix``.

    Returns
    -------
    dict
        ``{user_id: [video_id, ...]}`` with the ids in the order returned by the model.
    """
    if user_items is None:
        # Backward-compatible default: the matrix built at notebook level.
        user_items = interaction_matrix

    n_known_users = user_items.shape[0]
    recommendations_als = {}
    for user_id in user_ids:
        # Users that have no row in the matrix cannot be scored.
        if not 0 <= user_id < n_known_users:
            continue
        user_recs = model.recommend(
            user_id,
            user_items[user_id],
            N=top_n,
            filter_already_liked_items=True,
        )
        recommendations_als[user_id] = _als_item_ids(user_recs)
    return recommendations_als


def _als_item_ids(user_recs):
    """Extract the item ids from either return format of ``recommend``.

    A 2-tuple whose first element is a 1-D array is read as ``(ids, scores)``;
    anything else is read as a sequence of ``(item_id, score)`` pairs.
    NOTE(review): this assumes the pickled model is an `implicit` ALS model,
    whose ``recommend`` returns ``(ids, scores)`` since 0.5 -- confirm. With
    that format the previous ``[rec[0] for rec in user_recs]`` produced
    ``[first_id, first_score]`` instead of the list of ids.
    """
    if isinstance(user_recs, tuple) and len(user_recs) == 2 and np.ndim(user_recs[0]) == 1:
        return np.asarray(user_recs[0]).tolist()
    return [rec[0] for rec in user_recs]

# Video ids that occur at least once in the training interactions.
train_video_ids = set(interactions_train['video_id'].unique())

# Metadata videos that were also seen in training, kept in metadata order.
valid_video_ids = list(
    filter(
        lambda vid: vid in train_video_ids,
        video_metadata['video_id'].unique(),
    )
)

# One id per row of the user feature table; reused by the later recommenders.
user_ids = user_features['user_id'].values

als_recommendations = recommend_als(
    model_als,
    user_ids,
    valid_video_ids,
    top_n=K,
)

### Recommendations Content-Based

In [4]:
def recommend_cb(sim_matrix, video_ids, top_n=10, row_user_ids=None):
    """Pick, for every row of ``sim_matrix``, the ``top_n`` most similar distinct videos.

    Parameters
    ----------
    sim_matrix
        2-D array; entry ``[i, j]`` is the score of the video at position
        ``j`` of ``video_ids`` for row ``i``.
    video_ids
        Sequence mapping a column position to a video id. It may contain
        repeated ids; each id is recommended at most once per row.
    top_n
        Maximum number of videos per row. Values ``<= 0`` give empty lists.
    row_user_ids
        Sequence mapping a row position to a user id. Defaults to the
        notebook-level ``user_ids``.
        NOTE(review): assumes row ``i`` of the matrix belongs to
        ``row_user_ids[i]`` -- confirm against how the matrix was built.

    Returns
    -------
    dict
        ``{user_id: [video_id, ...]}`` ordered from highest to lowest score.
    """
    if row_user_ids is None:
        # Backward-compatible default: the notebook-level user id array.
        row_user_ids = user_ids

    recommendations = {}
    for user_idx, sims in enumerate(sim_matrix):
        unique_recs = []
        seen = set()
        # Walk the columns from best to worst score.
        for idx in np.argsort(sims)[::-1]:
            if len(unique_recs) >= top_n:
                break
            vid = video_ids[idx]
            if vid in seen:
                continue
            seen.add(vid)
            unique_recs.append(vid)
        # Row i maps to user i. The previous `user_idx - 1` shifted every row
        # by one and sent row 0 to the last user through negative indexing,
        # which also disagreed with recommend_hybrid.
        recommendations[row_user_ids[user_idx]] = unique_recs
    return recommendations

# Video id for each row of the metadata table, in table order.
video_ids = video_metadata['video_id'].values

# Content-based top-K for every row of the similarity matrix.
cb_recommendations = recommend_cb(
    sim_matrix=similarity_matrix,
    video_ids=video_ids,
    top_n=K,
)

### Recommendations Hybrid

In [5]:
# Min-max scale the like counts and store the result as a new metadata column.
like_counts = video_metadata['like_cnt']
like_floor = like_counts.min()
like_span = like_counts.max() - like_floor
video_metadata['normalized_popularity'] = (like_counts - like_floor) / like_span

# Popularity aligned position-by-position with video_ids; ids without a value
# (or with a NaN value) get 0.
popularity_by_video = video_metadata.set_index('video_id')['normalized_popularity']
popularity_scores = popularity_by_video.reindex(video_ids).fillna(0).values

def recommend_hybrid(sim_matrix, video_ids, popularity_scores, alpha=0.7, top_n=10, row_user_ids=None):
    """Rank videos per row by a blend of similarity and popularity.

    The score of column ``j`` for a row is
    ``alpha * sim_matrix[row, j] + (1 - alpha) * popularity_scores[j]``.

    Parameters
    ----------
    sim_matrix
        2-D array of per-row similarity scores, one column per entry of ``video_ids``.
    video_ids
        Sequence mapping a column position to a video id. Repeated ids are
        recommended at most once per row.
    popularity_scores
        1-D array aligned with ``video_ids``.
    alpha
        Weight of the similarity term; popularity gets ``1 - alpha``.
    top_n
        Maximum number of videos per row. Values ``<= 0`` give empty lists.
    row_user_ids
        Sequence mapping a row position to a user id. Defaults to the
        notebook-level ``user_ids``.
        NOTE(review): assumes row ``i`` belongs to ``row_user_ids[i]`` -- confirm.

    Returns
    -------
    dict
        ``{user_id: [video_id, ...]}`` ordered from highest to lowest blended score.
    """
    if row_user_ids is None:
        # Backward-compatible default: the notebook-level user id array.
        row_user_ids = user_ids

    recommendations = {}
    for user_idx, sims in enumerate(sim_matrix):
        # Blend content similarity with popularity.
        final_scores = alpha * sims + (1 - alpha) * popularity_scores

        unique_recs = []
        seen = set()
        # Walk the columns from best to worst blended score.
        for idx in np.argsort(final_scores)[::-1]:
            # Checking before appending makes top_n <= 0 return an empty list
            # instead of the whole catalogue.
            if len(unique_recs) >= top_n:
                break
            vid = video_ids[idx]
            if vid in seen:
                continue
            seen.add(vid)
            unique_recs.append(vid)
        recommendations[row_user_ids[user_idx]] = unique_recs
    return recommendations

# Hybrid top-K: 70% similarity, 30% normalised popularity.
hybrid_recommendations = recommend_hybrid(
    sim_matrix=similarity_matrix,
    video_ids=video_ids,
    popularity_scores=popularity_scores,
    alpha=0.7,
    top_n=K,
)

### Recommendations Neural Network

In [6]:
def recommend_nn(model, user_features_df, video_metadata_df, top_n=10):
    """Score every video for every user with ``model`` and keep the best ``top_n``.

    For each user, one integer input row per video is built from the user's
    ``onehot_feat1`` .. ``onehot_feat17`` values followed by the video's
    ``video_tag_id``; missing values are replaced by 0. The model is called
    once per user on that (n_videos, 18) batch.

    Parameters
    ----------
    model
        Torch module called as ``model(LongTensor[n_videos, 18])``.
        NOTE(review): assumes it returns one score per input row, shaped
        ``(n_videos,)`` or ``(n_videos, 1)`` -- confirm against RecNN.
    user_features_df
        Frame with ``user_id`` and the 17 ``onehot_feat*`` columns.
    video_metadata_df
        Frame with ``video_id`` and ``video_tag_id`` columns.
    top_n
        Maximum number of distinct videos per user. Values ``<= 0`` give empty lists.

    Returns
    -------
    dict
        ``{user_id: [video_id, ...]}`` ordered from highest to lowest score.
    """
    model.eval()

    video_ids = video_metadata_df['video_id'].values
    video_tags = video_metadata_df['video_tag_id'].infer_objects(copy=False).fillna(0).astype(int).values

    # Loop-invariant preparation: column names and the whole user feature
    # matrix are computed once rather than once per user.
    onehot_feats = [f'onehot_feat{i}' for i in range(1, 18)]
    user_matrix = (
        user_features_df[onehot_feats]
        .infer_objects(copy=False)
        .fillna(0)
        .to_numpy(dtype=np.int64)
    )
    user_id_list = user_features_df['user_id'].tolist()
    n_users = len(user_id_list)

    # Reusable input buffer: the last column (video tag) is the same for every
    # user; only the leading feature columns are overwritten inside the loop.
    inputs = np.empty((len(video_tags), len(onehot_feats) + 1), dtype=np.int64)
    inputs[:, -1] = video_tags

    recommendations = {}
    begin = datetime.now()
    for position, (user_id, user_input) in enumerate(zip(user_id_list, user_matrix)):
        # Progress is keyed on the loop position so that the message is
        # correct even when user ids are not 0..n-1.
        if position % 100 == 0:
            print(f"{position}/{n_users} users processed in {datetime.now() - begin}")

        inputs[:, :-1] = user_input
        # torch.tensor copies, so reusing `inputs` on the next iteration is safe.
        inputs_tensor = torch.tensor(inputs, dtype=torch.long)
        with torch.no_grad():
            # atleast_1d keeps the single-video case sortable after squeeze().
            scores = np.atleast_1d(model(inputs_tensor).squeeze().numpy())

        # Best score first; skip video ids that were already picked so the
        # list holds distinct videos, as the other recommenders do.
        picked = []
        seen = set()
        for idx in np.argsort(scores)[::-1]:
            if len(picked) >= top_n:
                break
            vid = video_ids[idx]
            if vid in seen:
                continue
            seen.add(vid)
            picked.append(idx)

        recommendations[user_id] = video_ids[np.asarray(picked, dtype=np.intp)].tolist()

    return recommendations

# TODO: remove limit -- only the first 500 rows of user_features are scored here.
nn_recommendations = recommend_nn(
    model_nn,
    user_features[:500],
    video_metadata,
    top_n=K,
)

0/500 users processed in 0:00:00.000787
100/500 users processed in 0:00:47.844554
200/500 users processed in 0:01:35.296783
300/500 users processed in 0:02:23.376665
400/500 users processed in 0:03:10.683394


### Combine Recommendations

In [7]:
def combine_three_recommendations(rec1, rec2, rec3, alpha=0.5, beta=0.3, top_n=None):
    """Merge three per-user ranked lists into one list by weighted rank score.

    Within a list of length ``n`` the item at position ``p`` (0-based) gets the
    rank score ``n - p``; an item missing from a list gets 0 for that list. The
    combined score is ``alpha * s1 + beta * s2 + gamma * s3`` with
    ``gamma = 1 - alpha - beta``.

    Parameters
    ----------
    rec1, rec2, rec3
        Dicts ``{user_id: [video_id, ...]}``. A user missing from a dict
        contributes an empty list for that source.
    alpha, beta
        Weights of ``rec1`` and ``rec2``; ``rec3`` gets the remainder.
    top_n
        If given, keep only the first ``top_n`` videos per user. ``None``
        (default) returns every video that appears in any of the three lists.

    Returns
    -------
    dict
        ``{user_id: [video_id, ...]}`` sorted by descending combined score.
        Videos with equal scores keep the order in which they were first seen
        (``rec1``, then ``rec2``, then ``rec3``).

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative")

    gamma = 1 - alpha - beta
    weighted_sources = ((rec1, alpha), (rec2, beta), (rec3, gamma))

    combined_recs = {}
    for user_id in set(rec1.keys()).union(rec2.keys()).union(rec3.keys()):
        # Insertion order of this dict is the first-seen order of the videos,
        # which makes the tie order below independent of set iteration order.
        combined_scores = {}
        for source, weight in weighted_sources:
            ranked = source.get(user_id, [])
            # Rank scores: earlier position = higher number.
            rank_scores = {vid: len(ranked) - pos for pos, vid in enumerate(ranked)}
            for vid, rank_score in rank_scores.items():
                combined_scores[vid] = combined_scores.get(vid, 0) + weight * rank_score

        # Stable sort: ties stay in first-seen order.
        final_list = sorted(combined_scores, key=combined_scores.get, reverse=True)

        # The sorted list already holds every video from all three lists, so
        # the former "appears in at least 2 lists" pass could never add
        # anything and has been removed.
        if top_n is not None:
            final_list = final_list[:top_n]

        combined_recs[user_id] = final_list

    return combined_recs

# Merge the content-based, ALS and neural-network lists (weights 0.5 / 0.3 / remainder).
combined_recommendations = combine_three_recommendations(
    cb_recommendations,
    als_recommendations,
    nn_recommendations,
    alpha=0.5,
    beta=0.3,
)

### Save Recommendations

In [11]:
# save to CSV

def save_recommendations(recommendations, filename, output_dir="../data/recommendations/"):
    """Write a ``{user_id: [video_id, ...]}`` dict to a CSV file.

    The file has one row per user: a ``user_id`` column followed by one column
    per list position (``0``, ``1``, ...). Shorter lists leave the trailing
    cells empty.

    Parameters
    ----------
    recommendations
        Dict mapping a user id to its ordered list of recommended video ids.
    filename
        Name of the CSV file to create inside ``output_dir``.
    output_dir
        Target directory; created (including parents) if it does not exist.
    """
    # Without this, to_csv fails on a fresh checkout where the folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    # One row per user, one column per rank position.
    df = pd.DataFrame.from_dict(recommendations, orient='index')
    # Turn the user-id index into a regular column named user_id.
    df = df.reset_index()
    df = df.rename(columns={'index': 'user_id'})

    df.to_csv(os.path.join(output_dir, filename), index=False)

# Write every recommendation set to its own CSV, in the same order as before.
for recs, csv_name in (
    (als_recommendations, "als_recommendations.csv"),
    (cb_recommendations, "content_based_recommendations.csv"),
    (hybrid_recommendations, "hybrid_recommendations.csv"),
    (nn_recommendations, "nn_recommendations.csv"),
    (combined_recommendations, "combined_recommendations.csv"),
):
    save_recommendations(recs, csv_name)