In [1]:
import os
import datetime
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import precision_at_k, ndcg_at_k, mean_average_precision_at_k
from src.utils.utils import safe_parse_feat
from sklearn.preprocessing import MultiLabelBinarizer
from collections import defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


### Load the Data

In [2]:
# Define the path to the processed data
processed_path = "../data/processed/"

# Load the processed datasets
interactions_train = pd.read_csv(os.path.join(processed_path, "interactions_train.csv"))
interactions_test = pd.read_csv(os.path.join(processed_path, "interactions_test.csv"))
user_features = pd.read_csv(os.path.join(processed_path, "user_features_engineered.csv"))
video_metadata = pd.read_csv(os.path.join(processed_path, "video_metadata.csv"))

video_metadata["feat"] = video_metadata["feat"].apply(safe_parse_feat)
video_metadata = video_metadata[video_metadata["feat"].notnull()]

user_features["preferred_category"] = user_features["preferred_category"].apply(safe_parse_feat)
user_features = user_features[user_features["preferred_category"].notnull()]

user_features["friends_preferred_category"] = user_features["friends_preferred_category"].apply(safe_parse_feat)
user_features = user_features[user_features["friends_preferred_category"].notnull()]

#Print the maximum value for each onehot
for i in range(1, 18):
    user_features[f"onehot_feat{i}"] = user_features[f"onehot_feat{i}"].fillna(0)
    user_features[f"onehot_feat{i}"] = user_features[f"onehot_feat{i}"].apply(lambda x: x if x >= 0 else 0)
    if user_features[f"onehot_feat{i}"].dtype == float:
        user_features = user_features.astype({f"onehot_feat{i}": int})

video_metadata["video_tag_id"] = video_metadata["video_tag_id"].fillna(0)
video_metadata["video_tag_id"] = video_metadata["video_tag_id"].apply(lambda x: x if x >= 0 else 0)
if video_metadata["video_tag_id"].dtype == float:
    video_metadata = video_metadata.astype({"video_tag_id": int})

### Collaborative Filtering with ALS

In [3]:
# Create the interaction matrix
interaction_matrix = csr_matrix((interactions_train['watch_ratio'],
                                 (interactions_train['user_id'], interactions_train['video_id'])))

# Create the test interaction matrix
test_interaction_matrix = csr_matrix((interactions_test['watch_ratio'],
                                      (interactions_test['user_id'], interactions_test['video_id'])))

# Initialize the ALS model
als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)

# Train the ALS model
als_model.fit(interaction_matrix)

  check_blas_config()
100%|██████████| 20/20 [01:36<00:00,  4.83s/it]


### Evaluate the ALS Model

In [4]:
# Calculate Precision@K and Recall@K
K = 30
ndcg = ndcg_at_k(als_model, interaction_matrix, test_interaction_matrix, K=K, show_progress=True, num_threads=1)
ma_precision = mean_average_precision_at_k(als_model, interaction_matrix, test_interaction_matrix, K=K, show_progress=True, num_threads=1)
precision = precision_at_k(als_model, interaction_matrix, test_interaction_matrix, K=K, show_progress=True, num_threads=1)

print(f"NDCG@{K}: {ndcg} - Best for position-aware ranking quality.")
print(f"MAP@{K}: {ma_precision} – Great global ranking evaluation.")
print(f"Precision@{K}: {precision} - Simple and intuitive.")

100%|██████████| 7176/7176 [00:00<00:00, 14948.91it/s]
100%|██████████| 7176/7176 [00:00<00:00, 16402.59it/s]
100%|██████████| 7176/7176 [00:00<00:00, 16457.39it/s]

NDCG@30: 0.8052790170186468 - Best for position-aware ranking quality.
MAP@30: 0.6876894387588626 – Great global ranking evaluation.
Precision@30: 0.7892445844308484 - Simple and intuitive.





#### Compute recommendations for ALS

In [5]:
def recommend_top_n_als(model, user_ids, video_ids, top_n=5):
    recommendations_als = {}
    for user_id in user_ids:
        # Get the user's recommendations from the ALS model
        if 0 <= user_id < interaction_matrix.shape[0]:
            user_recs = model.recommend(user_id, interaction_matrix[user_id], N=top_n, filter_already_liked_items=True)
            # Extract the video IDs from the recommendations
            rec_video_ids = [rec[0] for rec in user_recs]
            recommendations_als[user_id] = rec_video_ids
    return recommendations_als

train_video_ids = set(interactions_train['video_id'].unique())
valid_video_ids = [vid for vid in video_metadata['video_id'].unique() if vid in train_video_ids]

user_ids = user_features['user_id'].values

als_recommendations = recommend_top_n_als(als_model, user_ids, valid_video_ids, top_n=30)

### Content-Based Filtering with and Cosine Similarity

#### Prepare video embeddings


In [6]:
# Combine all tags into a flat list to get unique values
all_tags = set(tag for tags in video_metadata['feat'].tolist() +
                         user_features['preferred_category'].tolist() +
                         user_features['friends_preferred_category'].tolist() for tag in tags)
print(f"Unique tags: {len(all_tags)}")
print(user_features.head())
# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=sorted(all_tags))
tfidf_transformer = TfidfTransformer()

print(user_features)
# Encode video tags
print("Encoding video tags...")
video_tag_matrix = mlb.fit_transform(video_metadata['feat'])
video_ids = video_metadata['video_id'].values

# Encode user preferences (we'll combine preferred and friends_preferred)
user_combined_tags = user_features.apply(
    lambda row: list(set(row['preferred_category'] + row['friends_preferred_category'])), axis=1
)
print("Encoding user tags...")
user_tag_matrix = mlb.transform(user_combined_tags)
user_ids = user_features['user_id'].values

Unique tags: 31
   user_id user_active_degree  is_lowactive_period  is_live_streamer  \
0        0        high_active                    0                 0   
1        1        full_active                    0                 0   
2        2        full_active                    0                 0   
3        3        full_active                    0                 0   
4        4        full_active                    0                 0   

   is_video_author  follow_user_num follow_user_num_range  fans_user_num  \
0                0                5                (0,10]              0   
1                0              386             (250,500]              4   
2                0               27               (10,50]              0   
3                0               16               (10,50]              0   
4                0              122             (100,150]              4   

  fans_user_num_range  friend_user_num  ... onehot_feat15  onehot_feat16  \
0                 

#### Compute recommendations

In [7]:
print("Computing recommendations...")

# Compute cosine similarity: users x videos
#similarity_matrix = cosine_similarity(user_tag_matrix, video_tag_matrix)
similarity_matrix = cosine_similarity(user_tag_matrix, video_tag_matrix)

print(f"Similarity matrix shape: {similarity_matrix.shape}")


Computing recommendations...
Similarity matrix shape: (7176, 343341)


In [8]:

def recommend_top_n(sim_matrix, video_ids, top_n=5):
    recommendations_cb = {}
    for user_idx, sims in enumerate(sim_matrix):
        top_indices = np.argsort(sims)[::-1]
        unique_recs = []
        seen = set()
        for idx in top_indices:
            vid = video_ids[idx]
            if vid not in seen:
                unique_recs.append(vid)
                seen.add(vid)
            if len(unique_recs) == top_n:
                break
        recommendations_cb[user_ids[user_idx]] = unique_recs
    return recommendations_cb

top_n = 10  # Number of videos to recommend per user
recommendations = recommend_top_n(similarity_matrix, video_ids, top_n)

#### Hybridization

In [9]:
# Normalize popularity
video_metadata['normalized_popularity'] = (
    (video_metadata['like_cnt'] - video_metadata['like_cnt'].min()) /
    (video_metadata['like_cnt'].max() - video_metadata['like_cnt'].min())
)
popularity_scores = video_metadata.set_index('video_id')['normalized_popularity'].reindex(video_ids).fillna(0).values

def recommend_top_n_hybrid(sim_matrix, video_ids, popularity_scores, alpha=0.7, top_n=10):
    recommendations = {}
    for user_idx, sims in enumerate(sim_matrix):
        # Blend content similarity with popularity
        final_scores = alpha * sims + (1 - alpha) * popularity_scores

        top_indices = np.argsort(final_scores)[::-1]
        unique_recs = []
        seen = set()
        for idx in top_indices:
            vid = video_ids[idx]
            if vid not in seen:
                unique_recs.append(vid)
                seen.add(vid)
            if len(unique_recs) == top_n:
                break
        recommendations[user_ids[user_idx]] = unique_recs
    return recommendations

recommendations_hybrid = recommend_top_n_hybrid(similarity_matrix, video_ids, popularity_scores, alpha=0.7, top_n=top_n)

### Neural Network

In [10]:
# Neural network recommender with embeddings for one-hot features
# Define a PyTorch Dataset class
class RecSysDataset(Dataset):
    def __init__(self, interactions, user_features_df, video_features_df):
        self.user_map = user_features_df.set_index('user_id')
        self.video_map = video_features_df.set_index('video_id')

        self.onehot_feats = [f'onehot_feat{i}' for i in range(1, 18)]
        self.samples = []

        time_begin = datetime.datetime.now()

        expected_user_feat_len = len(self.onehot_feats)
        for idx, row in interactions.iterrows():
            if idx % 100000 == 0:
                print(f"Processed {idx} rows")
            if idx % 1000000 == 0:
                print(f"Processed {idx} rows, elapsed time: {datetime.datetime.now() - time_begin}")
                time_begin = datetime.datetime.now()
            user_id = row['user_id']
            video_id = row['video_id']

            if user_id not in self.user_map.index or video_id not in self.video_map.index:
                continue

            user_feat = self.user_map.loc[user_id]
            video_feat = self.video_map.loc[video_id]

            # Ensure all onehot_feats are present and ordered, fill missing with 0
            user_input = user_feat.reindex(self.onehot_feats).infer_objects(copy=False).fillna(0).astype(int).to_numpy()
            user_input = np.clip(user_input, a_min=0, a_max=None)

            video_input = np.array([video_feat['video_tag_id']], dtype=np.int64)

            # Check input sizes
            if user_input.shape[0] != expected_user_feat_len:
                continue  # Skip malformed user input

            x = np.concatenate([user_input, video_input.flatten()])

            # Extra safety check
            if x.shape[0] != expected_user_feat_len + 1:
                continue  # Skip malformed total input

            y = 1.0 if row['watch_ratio'] > 0.5 else 0.0
            self.samples.append((x, y))


    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return torch.tensor(self.samples[idx][0], dtype=torch.long), torch.tensor(self.samples[idx][1], dtype=torch.float32)


print("After class RecSysDataset")
# Neural net with embeddings
class RecNN(nn.Module):
    def __init__(self):
        super(RecNN, self).__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(8, 4),     # onehot_feat1
            nn.Embedding(30, 8),    # onehot_feat2
            nn.Embedding(1076, 32), # onehot_feat3
            nn.Embedding(12, 4),
            nn.Embedding(10, 4),
            nn.Embedding(3, 2),
            nn.Embedding(47, 6),
            nn.Embedding(340, 16),
            nn.Embedding(7, 4),
            nn.Embedding(5, 3),
            nn.Embedding(3, 2),
            nn.Embedding(2, 2),
            nn.Embedding(2, 2),
            nn.Embedding(2, 2),
            nn.Embedding(2, 2),
            nn.Embedding(2, 2),
            nn.Embedding(2, 2),
            nn.Embedding(2892, 32)  # video_tag_id, adjust range as needed
        ])


        total_emb_size = sum(emb.embedding_dim for emb in self.embeddings)

        self.model = nn.Sequential(
            nn.Linear(total_emb_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        for i, emb in enumerate(self.embeddings):
            max_index = emb.num_embeddings
            col_vals = x[:, i]
            if (col_vals >= max_index).any() or (col_vals < 0).any():
                print(f"[ERROR] Embedding {i}: min = {col_vals.min().item()}, max = {col_vals.max().item()}")

        embedded = [emb(x[:, i]) for i, emb in enumerate(self.embeddings)]
        x_cat = torch.cat(embedded, dim=1)
        return self.model(x_cat)

print("After class RecNN")
# Prepare dataset and model
# TODO: Align features and normalize/encode any continuous features if used
train_dataset = RecSysDataset(interactions_train[:200000], user_features, video_metadata)
print("After train_dataset")
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

print("After train_loader")
model = RecNN()

print("After model")
# Training setup
criterion = nn.BCELoss()
print("After criterion")
optimizer = optim.Adam(model.parameters(), lr=0.001)
print("After optimizer")
# Training loop
for epoch in range(5):  # Keep epochs small for lightweight computing
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")
print("Finished!")
# Generate top-N recommendations for each user
def generate_recommendations(model, user_features_df, video_metadata_df, N=10):
    model.eval()
    recommendations = {}
    video_ids = video_metadata_df['video_id'].values
    video_tags = video_metadata_df['video_tag_id'].infer_objects(copy=False).fillna(0).astype(int).values

    # Prepare user features
    begin = datetime.datetime.now()
    for _, user_row in user_features_df.iterrows():
        user_id = user_row['user_id']
        if user_id % 100 == 0:
            print(f"{user_id}/{user_features_df.shape[0]} users processed in {datetime.datetime.now() - begin}")
        onehot_feats = [f'onehot_feat{i}' for i in range(1, 18)]
        user_input = user_row[onehot_feats].infer_objects(copy=False).fillna(0).to_numpy(dtype=np.int64)

        inputs = []
        for tag in video_tags:
            x = np.concatenate([user_input, [tag]])
            inputs.append(x)

        inputs_tensor = torch.tensor(np.array(inputs), dtype=torch.long)
        with torch.no_grad():
            scores = model(inputs_tensor).squeeze().numpy()

        top_indices = np.argsort(scores)[-N:][::-1]
        recommended_videos = video_ids[top_indices]
        recommendations[user_id] = recommended_videos.tolist()

    return recommendations

nn_recommendations = generate_recommendations(model, user_features[:500], video_metadata, N=30)


After class RecSysDataset
After class RecNN
Processed 0 rows
Processed 0 rows, elapsed time: 0:00:00.115620
Processed 100000 rows
After train_dataset
After train_loader
After model
After criterion
After optimizer
Epoch 1, Loss: 0.6804
Epoch 2, Loss: 0.6731
Epoch 3, Loss: 0.6665
Epoch 4, Loss: 0.6606
Epoch 5, Loss: 0.6552
Finished!
0/500 users processed in 0:00:00.000524
100/500 users processed in 0:00:48.751895
200/500 users processed in 0:01:36.982012
300/500 users processed in 0:02:25.206804
400/500 users processed in 0:03:13.335605


In [11]:
ground_truth = defaultdict(set)
for _, row in interactions_test.iterrows():
    ground_truth[row['user_id']].add(row['video_id'])

def evaluate_recommendations(recommendations, ground_truth, k=20):
    precision_list = []
    recall_list = []
    hit_count = 0
    user_count = 0
    for user_id, recs in recommendations.items():
        if user_id not in ground_truth:
            continue  # No test data for this user
        true_items = ground_truth[user_id]
        recommended_items = recs[:k]
        hits = len(set(recommended_items) & true_items)
        precision = hits / k
        recall = hits / len(true_items) if len(true_items) > 0 else 0
        hit = 1 if hits > 0 else 0
        precision_list.append(precision)
        recall_list.append(recall)
        hit_count += hit
        user_count += 1

    precision_at_k = np.mean(precision_list)
    recall_at_k = np.mean(recall_list)
    hit_rate = hit_count / user_count if user_count > 0 else 0

    return {
        'Precision@K': precision_at_k,
        'Recall@K': recall_at_k,
        'HitRate@K': hit_rate,
        'Evaluated Users': user_count
    }

# Evaluate content-based recommendations
content_based_results = evaluate_recommendations(recommendations_hybrid, ground_truth, k=30)
print('Content-Based Evaluation Results:', content_based_results)

# Evaluate ALS recommendations
als_results = evaluate_recommendations(als_recommendations, ground_truth, k=30)
print('ALS Evaluation Results:', als_results)

# Evaluate ALS recommendations
nn_results = evaluate_recommendations(nn_recommendations, ground_truth, k=30)
print('NN Evaluation Results:', nn_results)

Content-Based Evaluation Results: {'Precision@K': np.float64(0.03939056112969156), 'Recall@K': np.float64(0.0031613462128858875), 'HitRate@K': 0.6885451505016722, 'Evaluated Users': 7176}
ALS Evaluation Results: {'Precision@K': np.float64(0.02900408769973987), 'Recall@K': np.float64(0.002589262589964959), 'HitRate@K': 0.8701226309921962, 'Evaluated Users': 7176}
NN Evaluation Results: {'Precision@K': np.float64(0.004733333333333333), 'Recall@K': np.float64(0.00036201210006546437), 'HitRate@K': 0.124, 'Evaluated Users': 500}


#### Save recommendations

In [12]:
print("\nSample Recommendations:")
for uid in list(recommendations_hybrid.keys())[:5]:
    print(f"User {uid} recommendations: {recommendations[uid]}")

# save to CSV
def save_recommendations(recommendations, filename):
    # Create a DataFrame from the recommendations dictionary
    df = pd.DataFrame.from_dict(recommendations, orient='index')
    # Reset index to make user_id a column
    df = df.reset_index()
    # Rename the index column to user_id
    df = df.rename(columns={'index': 'user_id'})
    # Save to CSV
    df.to_csv(os.path.join(processed_path, filename), index=False)

save_recommendations(recommendations_hybrid, "content_based_recommendations.csv")

save_recommendations(recommendations, "als_recommendations.csv")

save_recommendations(nn_recommendations, "nn_recommendations.csv")


Sample Recommendations:
User 0 recommendations: [np.int64(8699), np.int64(9436), np.int64(5567), np.int64(7716), np.int64(2073), np.int64(9433), np.int64(9431), np.int64(652), np.int64(649), np.int64(663)]
User 1 recommendations: [np.int64(8699), np.int64(9436), np.int64(5567), np.int64(7716), np.int64(2073), np.int64(9433), np.int64(9431), np.int64(652), np.int64(649), np.int64(663)]
User 2 recommendations: [np.int64(9121), np.int64(3793), np.int64(9064), np.int64(5375), np.int64(9060), np.int64(6839), np.int64(9540), np.int64(464), np.int64(4434), np.int64(6028)]
User 3 recommendations: [np.int64(8699), np.int64(9436), np.int64(5567), np.int64(7716), np.int64(2073), np.int64(9433), np.int64(9431), np.int64(652), np.int64(649), np.int64(663)]
User 4 recommendations: [np.int64(280), np.int64(1884), np.int64(4590), np.int64(684), np.int64(6204), np.int64(1889), np.int64(3096), np.int64(4026), np.int64(1865), np.int64(5663)]


### Save the Trained Models

In [13]:
# Save the ALS model
import pickle

with open(os.path.join(processed_path, 'als_model.pkl'), 'wb') as f:
    pickle.dump(als_model, f)

print("Models saved successfully!")


Models saved successfully!
