In [2]:
import os
import datetime
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_score, recall_score, f1_score
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import precision_at_k, ndcg_at_k, mean_average_precision_at_k
from src.utils.utils import safe_parse_feat
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from src.utils.RecNN import RecNN


KeyboardInterrupt: 

### Load the Data

In [2]:
# Define the path to the processed data
processed_path = "../data/processed/"

# Load the processed datasets
interactions_train = pd.read_csv(os.path.join(processed_path, "interactions_train.csv"))
interactions_test = pd.read_csv(os.path.join(processed_path, "interactions_test.csv"))
user_features = pd.read_csv(os.path.join(processed_path, "user_features.csv"))
video_metadata = pd.read_csv(os.path.join(processed_path, "video_metadata.csv"))

# The tag columns are stored as text in the CSVs; safe_parse_feat turns each
# cell back into a Python object. Rows whose cell parses to null are dropped.
# `assign` returns a new frame each time, so no column is ever written onto a
# filtered slice (which is what triggers pandas' SettingWithCopyWarning).
video_metadata = video_metadata.assign(feat=video_metadata["feat"].apply(safe_parse_feat))
video_metadata = video_metadata[video_metadata["feat"].notnull()]

user_features = user_features.assign(
    preferred_category=user_features["preferred_category"].apply(safe_parse_feat)
)
user_features = user_features[user_features["preferred_category"].notnull()]

# Parsed after the first filter, so only surviving rows are processed.
user_features = user_features.assign(
    friends_preferred_category=user_features["friends_preferred_category"].apply(safe_parse_feat)
)
user_features = user_features[user_features["friends_preferred_category"].notnull()]

### Collaborative Filtering with ALS

In [3]:
# Give the train and test matrices one shared shape. Left to itself,
# csr_matrix sizes each matrix from the largest id in that split, so the two
# could disagree whenever an id occurs in only one split.
n_users = int(max(interactions_train['user_id'].max(), interactions_test['user_id'].max())) + 1
n_videos = int(max(interactions_train['video_id'].max(), interactions_test['video_id'].max())) + 1
matrix_shape = (n_users, n_videos)

# Create the interaction matrix (rows: user_id, columns: video_id, values: watch_ratio).
# Repeated (user_id, video_id) pairs are summed by the csr_matrix constructor.
interaction_matrix = csr_matrix((interactions_train['watch_ratio'],
                                 (interactions_train['user_id'], interactions_train['video_id'])),
                                shape=matrix_shape)

# Create the test interaction matrix with the same layout
test_interaction_matrix = csr_matrix((interactions_test['watch_ratio'],
                                      (interactions_test['user_id'], interactions_test['video_id'])),
                                     shape=matrix_shape)

# Initialize the ALS model
als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)

# Train the ALS model on the user x video matrix
als_model.fit(interaction_matrix)

  check_blas_config()
100%|██████████| 20/20 [01:43<00:00,  5.18s/it]


### Evaluate the ALS Model

In [4]:
# Calculate NDCG@K, MAP@K and Precision@K for the ALS model on the held-out
# interactions. The train matrix is passed so that items a user already
# interacted with during training are excluded from the ranked list.
K = 30
# The cutoff and thread-count keywords are `K` and `num_threads`; the `int`
# shown in implicit's rendered signatures is the Cython type, not part of the name.
ndcg = ndcg_at_k(als_model, interaction_matrix, test_interaction_matrix, K=K, show_progress=True, num_threads=1)
ma_precision = mean_average_precision_at_k(als_model, interaction_matrix, test_interaction_matrix, K=K, show_progress=True, num_threads=1)
precision = precision_at_k(als_model, interaction_matrix, test_interaction_matrix, K=K, show_progress=True, num_threads=1)

print(f"NDCG@{K}: {ndcg} - Best for position-aware ranking quality.")
print(f"MAP@{K}: {ma_precision} – Great global ranking evaluation.")
print(f"Precision@{K}: {precision} - Simple and intuitive.")

100%|██████████| 7176/7176 [00:00<00:00, 13371.57it/s]
100%|██████████| 7176/7176 [00:00<00:00, 16109.02it/s]
100%|██████████| 7176/7176 [00:00<00:00, 15646.40it/s]

NDCG@30: 0.8049591525105662 - Best for position-aware ranking quality.
MAP@30: 0.6872343248485162 – Great global ranking evaluation.
Precision@30: 0.7890308883716824 - Simple and intuitive.





### Content-Based Filtering with Cosine Similarity

#### Prepare video embeddings


In [5]:
# Collect every distinct tag seen on a video or in a user's own / friends'
# preferred categories, so users and videos are encoded over one vocabulary.
all_tags = set(tag for tags in video_metadata['feat'].tolist() +
                         user_features['preferred_category'].tolist() +
                         user_features['friends_preferred_category'].tolist() for tag in tags)
print(f"Unique tags: {len(all_tags)}")
print(user_features.head())
# Initialize MultiLabelBinarizer with a fixed, sorted class list so the column
# order of the user and video matrices is identical.
mlb = MultiLabelBinarizer(classes=sorted(all_tags))
# NOTE(review): tfidf_transformer is not used anywhere in the visible cells.
tfidf_transformer = TfidfTransformer()

# Encode video tags as a multi-hot matrix (one row per video)
print("Encoding video tags...")
video_tag_matrix = mlb.fit_transform(video_metadata['feat'])
video_ids = video_metadata['video_id'].values

# Encode user preferences (we'll combine preferred and friends_preferred)
# into a single de-duplicated tag list per user.
user_combined_tags = user_features.apply(
    lambda row: list(set(row['preferred_category'] + row['friends_preferred_category'])), axis=1
)
print("Encoding user tags...")
user_tag_matrix = mlb.transform(user_combined_tags)
user_ids = user_features['user_id'].values

Unique tags: 31
   user_id user_active_degree  is_lowactive_period  is_live_streamer  \
0        0        high_active                    0                 0   
1        1        full_active                    0                 0   
2        2        full_active                    0                 0   
3        3        full_active                    0                 0   
4        4        full_active                    0                 0   

   is_video_author  follow_user_num follow_user_num_range  fans_user_num  \
0                0                5                (0,10]              0   
1                0              386             (250,500]              4   
2                0               27               (10,50]              0   
3                0               16               (10,50]              0   
4                0              122             (100,150]              4   

  fans_user_num_range  friend_user_num  ... onehot_feat14  onehot_feat15  \
0                 

#### Compute recommendations

In [6]:
print("Computing recommendations...")

# Cosine similarity between every user's tag vector (rows) and every video's
# tag vector (columns); the result is a users x videos score matrix.
similarity_matrix = cosine_similarity(
    user_tag_matrix,
    video_tag_matrix,
)

print(f"Similarity matrix shape: {similarity_matrix.shape}")


Computing recommendations...
Similarity matrix shape: (7176, 343341)


### Neural Network

In [7]:
# Neural network recommender with embeddings for one-hot features
# Define a PyTorch Dataset class
class RecSysDataset(Dataset):
    """Binary-label samples built from user/video interactions.

    Each sample is a pair ``(x, y)``:

    * ``x`` is an int64 vector of length 18: the user's ``onehot_feat1`` ..
      ``onehot_feat17`` values (missing columns/values become 0, negative
      values are clipped to 0) followed by the video's ``video_tag_id``
      (missing values become 0, matching ``generate_recommendations``).
    * ``y`` is 1.0 when the interaction's ``watch_ratio`` is above 0.5,
      otherwise 0.0.

    An interaction is skipped when its user is absent from
    ``user_features_df``, when its video is absent from ``video_features_df``,
    or when its ``video_id`` occurs more than once in ``video_features_df``
    (the tag would be ambiguous). If a ``user_id`` occurs more than once in
    ``user_features_df``, the first row is used.

    Args:
        interactions: DataFrame with ``user_id``, ``video_id`` and
            ``watch_ratio`` columns.
        user_features_df: DataFrame with a ``user_id`` column and the
            ``onehot_feat*`` columns.
        video_features_df: DataFrame with ``video_id`` and ``video_tag_id``
            columns.
    """

    def __init__(self, interactions, user_features_df, video_features_df):
        self.user_map = user_features_df.set_index('user_id')
        self.video_map = video_features_df.set_index('video_id')

        self.onehot_feats = [f'onehot_feat{i}' for i in range(1, 18)]
        self.samples = []

        # Build both lookup tables once instead of doing a pandas .loc /
        # reindex / fillna round-trip for every interaction row.
        user_lookup = self._build_user_lookup()
        tag_lookup = self._build_tag_lookup()

        time_begin = datetime.datetime.now()

        rows = zip(
            interactions['user_id'].tolist(),
            interactions['video_id'].tolist(),
            interactions['watch_ratio'].tolist(),
        )
        # Progress is counted by row position, so the messages stay meaningful
        # even when `interactions` does not carry a 0..n-1 index.
        for idx, (user_id, video_id, watch_ratio) in enumerate(rows):
            if idx % 100000 == 0:
                print(f"Processed {idx} rows")
            if idx % 1000000 == 0:
                print(f"Processed {idx} rows, elapsed time: {datetime.datetime.now() - time_begin}")
                time_begin = datetime.datetime.now()

            user_input = user_lookup.get(user_id)
            if user_input is None or video_id not in tag_lookup:
                continue

            video_input = np.array([tag_lookup[video_id]], dtype=np.int64)
            x = np.concatenate([user_input, video_input])

            y = 1.0 if watch_ratio > 0.5 else 0.0
            self.samples.append((x, y))

    def _build_user_lookup(self):
        """Map each user_id to its cleaned int64 one-hot feature vector."""
        unique_users = self.user_map.loc[~self.user_map.index.duplicated(keep='first')]
        feature_matrix = (
            unique_users.reindex(columns=self.onehot_feats)  # absent columns -> NaN
            .infer_objects()
            .fillna(0)
            .astype(np.int64)
            .clip(lower=0)
            .to_numpy()
        )
        return dict(zip(unique_users.index.tolist(), feature_matrix))

    def _build_tag_lookup(self):
        """Map each unambiguous video_id to its integer tag id (NaN -> 0)."""
        tags = self.video_map['video_tag_id']
        tags = tags[~tags.index.duplicated(keep=False)]
        tags = pd.to_numeric(tags).fillna(0).astype(np.int64)
        return dict(zip(tags.index.tolist(), tags.tolist()))

    def __len__(self):
        """Return the number of samples that survived filtering."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(long feature tensor, float32 label tensor)``."""
        features, label = self.samples[idx]
        return torch.tensor(features, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

# Prepare dataset and model
print("Preparing dataset...")
train_dataset = RecSysDataset(interactions_train[:200000], user_features, video_metadata) # TODO: remove limit
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_dataset = RecSysDataset(interactions_test[:20000], user_features, video_metadata) # TODO: remove limit
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)
# Create the model
print("Creating model...")
model = RecNN()

# Training setup
print("Setting up training...")
# BCELoss expects probabilities in [0, 1].
# NOTE(review): this assumes RecNN ends in a sigmoid; confirm in src/utils/RecNN.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
print("\nTRAINING STARTED")
for epoch in range(5):  # Keep epochs small for lightweight computing
    model.train()
    total_loss = 0  # sum of per-batch losses over the epoch (not an average)
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        # reshape(-1) rather than squeeze(): squeeze() turns a final batch of
        # one sample into a 0-d tensor, which no longer matches batch_y's
        # shape (1,) and makes BCELoss raise.
        loss = criterion(outputs.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")
print("Finished!")


# Evaluate the trained network on the validation loader, thresholding the
# predicted probability at 0.5 to obtain a hard 0/1 label.
y_true = []
y_pred = []
model.eval()
with torch.no_grad():
    for batch_x, batch_y in val_loader:
        outputs = model(batch_x)
        # reshape(-1) keeps a 1-D result even for a batch of one sample;
        # squeeze() would yield a 0-d array that extend() cannot iterate.
        preds = (outputs.reshape(-1) > 0.5).float()
        y_true.extend(batch_y.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred))


# Generate top-N recommendations for each user
def generate_recommendations(model, user_features_df, video_metadata_df, N=10):
    """Score every video for every user with ``model`` and keep the top ``N``.

    For each user the model receives one row per video, made of the user's
    ``onehot_feat1`` .. ``onehot_feat17`` values (NaN -> 0, negatives clipped
    to 0, as done when building the training samples) followed by the video's
    ``video_tag_id`` (NaN -> 0).

    Args:
        model: Callable module with an ``eval()`` method; called with a long
            tensor of shape (n_videos, 18) and expected to return one score
            per row.
        user_features_df: DataFrame with ``user_id`` and the ``onehot_feat*``
            columns.
        video_metadata_df: DataFrame with ``video_id`` and ``video_tag_id``
            columns.
        N: Number of videos to recommend per user.

    Returns:
        dict mapping each ``user_id`` to a list of up to ``N`` video ids,
        ordered from highest to lowest score.
    """
    model.eval()
    recommendations = {}
    video_ids = video_metadata_df['video_id'].values
    video_tags = video_metadata_df['video_tag_id'].infer_objects(copy=False).fillna(0).astype(int).values

    onehot_feats = [f'onehot_feat{i}' for i in range(1, 18)]
    n_feats = len(onehot_feats)

    # One reusable (n_videos, n_feats + 1) input array: the tag column is
    # filled once, and only the user columns are overwritten per user. This
    # replaces building a Python list with one row per video for every user.
    inputs = np.empty((len(video_tags), n_feats + 1), dtype=np.int64)
    inputs[:, n_feats] = video_tags

    # Prepare user features
    begin = datetime.datetime.now()
    for _, user_row in user_features_df.iterrows():
        user_id = user_row['user_id']
        if user_id % 100 == 0:
            print(f"{user_id}/{user_features_df.shape[0]} users processed in {datetime.datetime.now() - begin}")
        user_input = user_row[onehot_feats].infer_objects(copy=False).fillna(0).to_numpy(dtype=np.int64)
        # Same clipping as the training samples, so inference never feeds the
        # model a negative feature value that training could not have produced.
        user_input = np.clip(user_input, a_min=0, a_max=None)

        inputs[:, :n_feats] = user_input  # broadcast the user vector to every video row
        inputs_tensor = torch.from_numpy(inputs)
        with torch.no_grad():
            # reshape(-1) keeps the scores 1-D even when there is a single video.
            scores = model(inputs_tensor).reshape(-1).cpu().numpy()

        # Highest score first; equivalent to taking the last N of the
        # ascending order and reversing it.
        top_indices = np.argsort(scores)[::-1][:N]
        recommended_videos = video_ids[top_indices]
        recommendations[user_id] = recommended_videos.tolist()

    return recommendations

# Top-30 neural-network recommendations, computed for the first 500 users only.
nn_recommendations = generate_recommendations(
    model,
    user_features[:500],  # TODO: remove limit
    video_metadata,
    N=30,
)


Preparing dataset...
Processed 0 rows
Processed 0 rows, elapsed time: 0:00:00.100675
Processed 100000 rows
Processed 0 rows
Processed 0 rows, elapsed time: 0:00:00.007752
Creating model...
Setting up training...

TRAINING STARTED
Epoch 1, Loss: 0.7061
Epoch 2, Loss: 0.6963
Epoch 3, Loss: 0.6872
Epoch 4, Loss: 0.6789
Epoch 5, Loss: 0.6711
Finished!
Precision: 0.47058823529411764
Recall: 1.0
F1: 0.64
0/500 users processed in 0:00:00.000378
100/500 users processed in 0:00:47.984602
200/500 users processed in 0:01:35.474542
300/500 users processed in 0:02:22.538598
400/500 users processed in 0:03:09.096341


### Save the Trained Models

In [1]:
model_path = "../models/"

# Neither open() nor torch.save() creates missing directories, so make sure
# the target folder exists before writing into it.
os.makedirs(model_path, exist_ok=True)

# NOTE(review): the similarity matrix is written to the current working
# directory, not to model_path like the other artifacts. Left unchanged in
# case something loads it from here; confirm the intended location.
np.save('similarity_matrix.npy', similarity_matrix)

print("Similarity matrix saved successfully!")

# Save the ALS model
with open(os.path.join(model_path, 'als_model.pkl'), 'wb') as f:
    pickle.dump(als_model, f)
print("ALS model saved successfully!")

# Save the neural network model (weights only; RecNN must be re-instantiated
# before loading the state dict)
torch.save(model.state_dict(), os.path.join(model_path, 'nn_model.pth'))
print("Neural network model saved successfully!")

print("Models saved successfully!")


NameError: name 'pd' is not defined