In [1]:
import os
import pandas as pd

from src.utils.utils import ensure_datetime
import numpy as np
from datetime import datetime

### Load the datasets

In [2]:
# Directory that holds the preprocessed CSV exports
processed_path = "../data/processed/"

# Read the four preprocessed tables, in this order, straight into their names
big_matrix, small_matrix, user_features, item_daily_features = (
    pd.read_csv(os.path.join(processed_path, csv_name))
    for csv_name in (
        "big_matrix_processed.csv",
        "small_matrix_processed.csv",
        "user_features_processed.csv",
        "item_daily_features_processed.csv",
    )
)

# CSV round-trips lose dtypes, so pass the 'datetime' column of both
# interaction tables through the project helper before using `.dt` later on
small_and_big = {"big": big_matrix, "small": small_matrix}
big_matrix = ensure_datetime(small_and_big["big"], 'datetime')
small_matrix = ensure_datetime(small_and_big["small"], 'datetime')
del small_and_big

### User Features

In [3]:
def create_user_based_features(interactions, user_features, item_features=None):
    """Attach per-user aggregates derived from the interaction log.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Must contain 'user_id', 'video_id', 'watch_ratio' and 'datetime'.
    user_features : pandas.DataFrame
        One row per user, keyed by 'user_id'.
    item_features : pandas.DataFrame, optional
        Table with 'video_id' and 'feat' columns used to look up each video's
        category. When omitted, the notebook-level ``item_daily_features``
        variable is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``user_features`` left-merged with four new columns:
        'total_videos_watched' (count of non-null video_id rows),
        'avg_watch_ratio' (mean watch_ratio),
        'preferred_category' (most frequent 'feat' among watched videos) and
        'user_activity_level' (number of distinct 'datetime' values).
        Users with no interactions get NaN in all four.
    """
    if item_features is None:
        # Backward-compatible fallback to the notebook global; passing the
        # table explicitly is preferred because it removes the hidden state.
        item_features = item_daily_features

    def _most_common(values):
        # Count once; an all-null group yields an empty count table.
        counts = values.value_counts()
        return counts.idxmax() if not counts.empty else np.nan

    by_user = interactions.groupby('user_id')

    # Total Videos Watched
    total_videos_watched = by_user['video_id'].count().reset_index(name='total_videos_watched')

    # Average Watch Ratio
    avg_watch_ratio = by_user['watch_ratio'].mean().reset_index(name='avg_watch_ratio')

    # Preferred Video Category: one category per video (first non-null 'feat'),
    # joined onto the interactions, then the per-user most frequent value.
    category_per_video = item_features.groupby('video_id')['feat'].first().reset_index()
    interactions_with_categories = interactions.merge(category_per_video, on='video_id', how='left')
    preferred_categories = (
        interactions_with_categories.groupby('user_id')['feat']
        .apply(_most_common)
        .reset_index(name='preferred_category')
    )

    # User Activity Level
    user_activity_level = by_user['datetime'].nunique().reset_index(name='user_activity_level')

    # Merge with user_features (left joins keep every user row)
    for per_user_table in (total_videos_watched, avg_watch_ratio, preferred_categories, user_activity_level):
        user_features = user_features.merge(per_user_table, on='user_id', how='left')

    return user_features

### Video Features

In [4]:
def create_video_based_features(item_daily_features):
    """Add per-video aggregates to the daily item table.

    Parameters
    ----------
    item_daily_features : pandas.DataFrame
        Must contain 'video_id', 'like_cnt', 'play_duration', 'feat' and
        'upload_dt'. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as the input. The original 'feat' and
        'upload_dt' columns are replaced by one value per video (the first
        non-null value seen for that video) and two columns are appended:
        'total_likes' (sum of like_cnt per video) and 'avg_play_duration'
        (mean of play_duration per video). New columns are appended in the
        order total_likes, avg_play_duration, feat, upload_dt.
    """
    by_video = item_daily_features.groupby('video_id')

    # Total Likes
    total_likes = by_video['like_cnt'].sum().reset_index(name='total_likes')

    # Average Play Duration
    avg_play_duration = by_video['play_duration'].mean().reset_index(name='avg_play_duration')

    # Video Tags / Upload Time: exactly one row per video_id. Using
    # drop_duplicates() on (video_id, value) pairs would keep several rows for
    # a video whose value differs between days (including null vs. non-null),
    # and the merge below would then multiply that video's rows.
    video_tags = by_video['feat'].first().reset_index()
    video_upload_time = by_video['upload_dt'].first().reset_index()

    # Work on a copy without the two per-video columns so the caller's frame
    # is left untouched (no inplace drop).
    enriched = item_daily_features.drop(columns=['upload_dt', 'feat'])

    for per_video_table in (total_likes, avg_play_duration, video_tags, video_upload_time):
        enriched = enriched.merge(per_video_table, on='video_id', how='left')

    return enriched

### Interaction Features

In [5]:
def create_interaction_based_features(interactions):
    """Add user-video pair statistics and time features to an interaction log.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Must contain 'user_id', 'video_id', 'watch_ratio' and a
        datetime-typed 'datetime' column. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input rows plus:
        'time_of_day' (hour of 'datetime'), 'day_of_week' (0 = Monday),
        'user_video_watch_count' (rows per user-video pair) and
        'user_video_avg_watch_ratio' (mean watch_ratio per pair).

    Notes
    -----
    Calling the function on its own output recomputes the derived columns
    instead of producing '_x'/'_y' suffixed duplicates, so re-running the
    notebook cell is safe.
    """
    pair_keys = ['user_id', 'video_id']
    pair_columns = ['user_video_watch_count', 'user_video_avg_watch_ratio']

    # One pass over the (user, video) groups for both statistics.
    # 'size' counts every row of the group, nulls included.
    pair_stats = (
        interactions.groupby(pair_keys)
        .agg(
            user_video_watch_count=('watch_ratio', 'size'),
            user_video_avg_watch_ratio=('watch_ratio', 'mean'),
        )
        .reset_index()
    )

    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns; stale pair columns from a previous run are dropped
    # before the merge to avoid suffix collisions.
    enriched = interactions.drop(columns=pair_columns, errors='ignore').assign(
        time_of_day=interactions['datetime'].dt.hour,
        day_of_week=interactions['datetime'].dt.dayofweek,
    )

    return enriched.merge(pair_stats, on=pair_keys, how='left')

### Social Features

In [6]:
def create_social_based_features(user_features, interactions, item_daily_features):
    """Attach friend-count and interaction-derived columns to the user table.

    Parameters
    ----------
    user_features : pandas.DataFrame
        Keyed by 'user_id' and containing a 'friend_list' column. Each cell
        may be an actual list/tuple/set or, as happens after a CSV round-trip,
        its string form such as "[12, 34]".
    interactions : pandas.DataFrame
        Must contain 'user_id', 'video_id' and 'watch_ratio'.
    item_daily_features : pandas.DataFrame
        Must contain 'video_id' and 'feat'.

    Returns
    -------
    pandas.DataFrame
        ``user_features`` left-merged with 'num_friends',
        'friends_preferred_category' and 'social_influence_score'.

    Notes
    -----
    NOTE(review): despite their names, 'friends_preferred_category' and
    'social_influence_score' are computed from each user's OWN rows in
    ``interactions`` (most frequent 'feat' and mean 'watch_ratio'); the
    friend list is only used for 'num_friends'. Confirm whether aggregating
    over friends was intended.
    """
    def _count_friends(value):
        # Real containers are counted directly.
        if isinstance(value, (list, tuple, set)):
            return len(value)
        # read_csv yields the list as text; without parsing it every user
        # would end up with zero friends.
        if isinstance(value, str):
            import ast
            try:
                parsed = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                return 0
            return len(parsed) if isinstance(parsed, (list, tuple, set)) else 0
        # NaN / None / anything else: no friends recorded.
        return 0

    def _most_common(values):
        # Count once; an all-null group yields an empty count table.
        counts = values.value_counts()
        return counts.idxmax() if not counts.empty else np.nan

    # Number of Friends
    num_friends = user_features[['user_id', 'friend_list']].copy()
    num_friends['num_friends'] = num_friends['friend_list'].apply(_count_friends)

    # Friends' Preferences (see NOTE in the docstring)
    category_per_video = item_daily_features.groupby('video_id')['feat'].first().reset_index()
    interactions_with_categories = interactions.merge(category_per_video, on='video_id', how='left')
    friends_preferences = (
        interactions_with_categories.groupby('user_id')['feat']
        .apply(_most_common)
        .reset_index(name='friends_preferred_category')
    )

    # Social Influence Score (see NOTE in the docstring)
    social_influence_score = (
        interactions.groupby('user_id')['watch_ratio']
        .mean()
        .reset_index(name='social_influence_score')
    )

    # Merge with user_features
    user_features = user_features.merge(num_friends[['user_id', 'num_friends']], on='user_id', how='left')
    user_features = user_features.merge(friends_preferences, on='user_id', how='left')
    user_features = user_features.merge(social_influence_score, on='user_id', how='left')

    return user_features

### Computing features

In [7]:
def _report_missing_feat():
    """Print a notice when the current item_daily_features has no 'feat' column."""
    if 'feat' not in item_daily_features.columns:
        print("Not found 'feat' in item_daily_features")


# Interaction-level features for the big interaction matrix
_report_missing_feat()
print("Creating interaction-based features for big_matrix...")
big_matrix = create_interaction_based_features(big_matrix)

# Interaction-level features for the small interaction matrix
_report_missing_feat()
print("Creating interaction-based features for small_matrix...")
small_matrix = create_interaction_based_features(small_matrix)

# Per-user aggregates, computed over both matrices stacked together
_report_missing_feat()
print("Creating user-based features...")
all_interactions = pd.concat([big_matrix, small_matrix])
user_features = create_user_based_features(all_interactions, user_features)
del all_interactions

# Per-video aggregates
_report_missing_feat()
print("Creating video-based features...")
item_daily_features = create_video_based_features(item_daily_features)

# Social features; these are derived from small_matrix only, unlike the
# user-based step above which uses big_matrix and small_matrix combined
_report_missing_feat()
print("Creating social-based features...")
user_features = create_social_based_features(user_features, small_matrix, item_daily_features)

Creating interaction-based features for big_matrix...
Creating interaction-based features for small_matrix...
Creating user-based features...
Creating video-based features...
Creating social-based features...


### Save the engineered features

In [8]:
# Write each engineered table next to the processed inputs, without the index
engineered_outputs = (
    (big_matrix, "big_matrix_features.csv"),
    (small_matrix, "small_matrix_features.csv"),
    (user_features, "user_features_engineered.csv"),
    (item_daily_features, "item_daily_features_engineered.csv"),
)
for engineered_frame, csv_name in engineered_outputs:
    engineered_frame.to_csv(os.path.join(processed_path, csv_name), index=False)
del engineered_outputs, engineered_frame, csv_name

print("Feature engineering completed successfully!")


Feature engineering completed successfully!
