In [1]:
import ast
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from src.utils.utils import ensure_datetime

### Load the datasets

In [2]:
# Directory that holds the pre-processed CSV exports
processed_path = "../data/processed/"

# Read the four processed tables in one pass; the names are bound in the
# same order as the file names listed below.
big_matrix, small_matrix, user_features, item_daily_features = (
    pd.read_csv(os.path.join(processed_path, filename))
    for filename in (
        "big_matrix_processed.csv",
        "small_matrix_processed.csv",
        "user_features_processed.csv",
        "item_daily_features_processed.csv",
    )
)

# Run both interaction tables through ensure_datetime for their 'datetime' column
big_matrix, small_matrix = (
    ensure_datetime(matrix, 'datetime') for matrix in (big_matrix, small_matrix)
)

### User Features

In [3]:
def create_user_based_features(interactions, user_features, item_daily_features=None):
    """Attach per-user aggregates of the interaction log to ``user_features``.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Must contain ``user_id``, ``video_id`` and ``watch_ratio`` columns.
    user_features : pandas.DataFrame
        Must contain a ``user_id`` column; it is not modified.
    item_daily_features : pandas.DataFrame, optional
        Must contain ``video_id`` and ``feat`` columns. When omitted, the
        notebook-level ``item_daily_features`` variable is used, which is what
        the original two-argument form of this function relied on implicitly.

    Returns
    -------
    pandas.DataFrame
        ``user_features`` left-joined with three new columns:
        ``total_videos_watched`` (non-null ``video_id`` count per user),
        ``avg_watch_ratio`` (mean ``watch_ratio`` per user) and
        ``preferred_category`` (most frequent ``feat`` among the user's
        interactions, NaN when none is available).

    Raises
    ------
    NameError
        If ``item_daily_features`` is not passed and no notebook-level
        variable of that name exists.
    """
    if item_daily_features is None:
        # Backward compatibility: existing callers pass only two arguments and
        # expect the frame loaded earlier in the notebook to be picked up.
        try:
            item_daily_features = globals()['item_daily_features']
        except KeyError:
            raise NameError(
                "item_daily_features was not passed and is not defined at notebook level"
            ) from None

    def most_frequent(values):
        # value_counts() ignores NaN, so an all-NaN group yields an empty result.
        counts = values.value_counts()
        return counts.idxmax() if not counts.empty else np.nan

    per_user = interactions.groupby('user_id')

    # Total Videos Watched
    total_videos_watched = per_user['video_id'].count().reset_index(name='total_videos_watched')

    # Average Watch Ratio
    avg_watch_ratio = per_user['watch_ratio'].mean().reset_index(name='avg_watch_ratio')

    # Preferred Video Category: one 'feat' value per video (the first seen),
    # joined onto the interactions and then reduced to the most frequent per user.
    video_categories = item_daily_features.groupby('video_id')['feat'].first().reset_index()
    interactions_with_categories = interactions.merge(video_categories, on='video_id', how='left')
    preferred_categories = (
        interactions_with_categories.groupby('user_id')['feat']
        .apply(most_frequent)
        .reset_index(name='preferred_category')
    )

    # Left joins keep every row of user_features, including users without interactions.
    for aggregate in (total_videos_watched, avg_watch_ratio, preferred_categories):
        user_features = user_features.merge(aggregate, on='user_id', how='left')

    return user_features

### Video Features

In [4]:
def create_video_based_features(item_daily_features):
    """Attach per-video aggregates to every row of ``item_daily_features``.

    Parameters
    ----------
    item_daily_features : pandas.DataFrame
        Must contain ``video_id``, ``like_cnt``, ``play_duration`` and ``feat``
        columns. The frame is left untouched (the previous implementation
        dropped ``feat`` from the caller's frame in place).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns except ``feat``, followed by
        ``total_likes`` (sum of ``like_cnt`` per video), ``avg_play_duration``
        (mean of ``play_duration`` per video) and ``feat`` re-attached from the
        distinct ``(video_id, feat)`` pairs.
    """
    per_video = item_daily_features.groupby('video_id')

    # Total Likes
    total_likes = per_video['like_cnt'].sum().reset_index(name='total_likes')

    # Average Play Duration
    avg_play_duration = per_video['play_duration'].mean().reset_index(name='avg_play_duration')

    # Video Tags: distinct (video_id, feat) pairs.
    # NOTE(review): a video that appears with more than one 'feat' value yields
    # several pairs here, and the final merge then repeats its rows - confirm
    # 'feat' is constant per video in the input data.
    video_tags = item_daily_features[['video_id', 'feat']].drop_duplicates()

    # drop() without inplace returns a new frame, so the caller's data keeps 'feat'.
    enriched = item_daily_features.drop(columns=['feat'])

    enriched = enriched.merge(total_likes, on='video_id', how='left')
    enriched = enriched.merge(avg_play_duration, on='video_id', how='left')
    enriched = enriched.merge(video_tags, on='video_id', how='left')

    return enriched

### Interaction Features

In [5]:
def create_interaction_based_features(interactions):
    """Add per-(user, video) statistics and keep one row per pair.

    Two columns are appended to ``interactions``:
    ``user_video_watch_count`` (number of rows for the pair) and
    ``user_video_avg_watch_ratio`` (mean ``watch_ratio`` for the pair).
    Afterwards only the first row of each ``(user_id, video_id)`` pair is
    retained, so the remaining columns carry that first row's values.
    """
    pair_keys = ['user_id', 'video_id']
    by_pair = interactions.groupby(pair_keys)

    # Statistics in the order their columns should appear in the result
    pair_statistics = (
        by_pair.size().reset_index(name='user_video_watch_count'),
        by_pair['watch_ratio'].mean().reset_index(name='user_video_avg_watch_ratio'),
    )

    # Attach each statistic to every interaction row of its pair
    enriched = interactions
    for statistic in pair_statistics:
        enriched = enriched.merge(statistic, on=pair_keys, how='left')

    # Collapse repeated pairs down to their first occurrence
    return enriched.drop_duplicates(subset=pair_keys)

### Social Features

In [6]:
def _parse_friend_list(value):
    """Return the friend ids stored in ``value`` as a list (empty if unusable).

    Accepts real sequences as well as their text form (e.g. ``"[1, 2]"``),
    which is what a list column becomes after a round trip through CSV.
    """
    if isinstance(value, (list, tuple, set)):
        return list(value)
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value.strip())
        except (ValueError, SyntaxError, TypeError):
            return []
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
    # NaN, None, scalars and unparseable text all count as "no friends".
    return []


def create_social_based_features(user_features, interactions, item_daily_features):
    """Attach friend-derived columns to ``user_features``.

    Parameters
    ----------
    user_features : pandas.DataFrame
        Must contain ``user_id`` and ``friend_list``. ``friend_list`` entries
        may be lists or their string representation; anything else is treated
        as an empty list.
    interactions : pandas.DataFrame
        Must contain ``user_id`` and ``video_id``.
    item_daily_features : pandas.DataFrame
        Must contain ``video_id`` and ``feat``.

    Returns
    -------
    pandas.DataFrame
        ``user_features`` left-joined with ``num_friends`` (length of the
        parsed friend list) and ``friends_preferred_category`` (the ``feat``
        value occurring most often across the interactions of the user's
        friends; NaN when the user has no friends with categorised
        interactions). Ties are resolved in favour of the ``feat`` value that
        sorts first.
    """
    # Number of Friends. The column is parsed first because values read from
    # CSV are strings, for which a plain isinstance(x, list) check is never true.
    friend_lists = user_features['friend_list'].apply(_parse_friend_list)
    num_friends = pd.DataFrame({
        'user_id': user_features['user_id'],
        'num_friends': friend_lists.apply(len),
    })

    # How often each user interacted with each category (NaN 'feat' is ignored
    # by groupby). Keyed as 'friend_id' so it can be joined onto the edges.
    video_categories = item_daily_features.groupby('video_id')['feat'].first().reset_index()
    interactions_with_categories = interactions.merge(video_categories, on='video_id', how='left')
    category_counts = (
        interactions_with_categories.groupby(['user_id', 'feat'])
        .size()
        .reset_index(name='n_interactions')
        .rename(columns={'user_id': 'friend_id'})
    )

    # One row per (user, friend) edge; users with an empty list drop out here.
    edges = (
        pd.DataFrame({'user_id': user_features['user_id'], 'friend_id': friend_lists})
        .explode('friend_id')
        .dropna(subset=['friend_id'])
    )

    if edges.empty:
        # Nobody has friends: emit an empty lookup so the join below yields NaN.
        friends_preferences = user_features[['user_id']].iloc[:0].assign(
            friends_preferred_category=pd.Series(dtype=object)
        )
    else:
        # explode() leaves an object column; restore a numeric dtype where
        # possible so the join keys line up with interactions['user_id'].
        edges = edges.assign(friend_id=lambda frame: frame['friend_id'].infer_objects())

        # Friends' Preferences: sum the friends' category counts per user and
        # keep the category with the highest total.
        friend_totals = (
            edges.merge(category_counts, on='friend_id', how='inner')
            .groupby(['user_id', 'feat'], as_index=False)['n_interactions']
            .sum()
        )
        friends_preferences = (
            friend_totals
            .sort_values(['user_id', 'n_interactions'], ascending=[True, False])
            .drop_duplicates(subset='user_id')
            [['user_id', 'feat']]
            .rename(columns={'feat': 'friends_preferred_category'})
        )

    # Merge with user_features
    user_features = user_features.merge(num_friends, on='user_id', how='left')
    user_features = user_features.merge(friends_preferences, on='user_id', how='left')

    return user_features

### Computing features

In [7]:
# Stack big_matrix and small_matrix row-wise into a single interaction log
interactions = pd.concat([big_matrix, small_matrix])

# After this call `interactions` holds one row per (user_id, video_id) pair,
# so every later step works on the de-duplicated log.
print("Creating interaction-based features...")
interactions = create_interaction_based_features(interactions)

# Create user-based features
# `user_features` is rebound to the enriched frame, so re-running this cell
# feeds the already-enriched frame back in; restart and run all instead.
print("Creating user-based features...")
user_features = create_user_based_features(interactions, user_features)

# Create video-based features
# Same rebinding caveat as above applies to `item_daily_features`.
print("Creating video-based features...")
item_daily_features = create_video_based_features(item_daily_features)

# Create social-based features (uses the enriched user and video frames from above)
print("Creating social-based features...")
user_features = create_social_based_features(user_features, interactions, item_daily_features)

# Split interactions into training and testing sets (80/20, fixed seed).
# NOTE(review): all aggregates above are computed before this split, so they
# include rows that end up in the test set - confirm this is acceptable.
train_interactions, test_interactions = train_test_split(interactions, test_size=0.2, random_state=42)

Creating interaction-based features...
Creating user-based features...
Creating video-based features...
Creating social-based features...


### Save the engineered features

In [8]:
# Write each engineered table to the processed-data directory, in this order
engineered_outputs = [
    (train_interactions, "interactions_train.csv"),
    (test_interactions, "interactions_test.csv"),
    (user_features, "user_features.csv"),
    (item_daily_features, "video_metadata.csv"),
]
for table, filename in engineered_outputs:
    table.to_csv(os.path.join(processed_path, filename), index=False)

# Sample submission: the test (user, video) pairs with a placeholder prediction of 0
sample_submission = test_interactions[['user_id', 'video_id']].assign(prediction=0)
sample_submission.to_csv(os.path.join(processed_path, "sample_submission.csv"), index=False)

print("Feature engineering completed successfully!")


Feature engineering completed successfully!
