In [1]:
import pandas as pd
import numpy as np
import gc
from datetime import datetime
from scipy.sparse import csr_matrix
import json
import os

RAW_DATA_BASE_PATH = "../raw_data/KuaiRec/data/"
PROCESSED_DATA_PATH = "../data/"
MODELS_PATH = "../models/"
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

# --- Dtype Definitions & Column Selections
interaction_cols_initial_load = {
    'user_id': 'float32', 'video_id': 'float32', 'play_duration': 'float32',
    'video_duration': 'float32', 'time': 'str', 'date': 'float32',
    'timestamp': 'float32', 'watch_ratio': 'float32'
}
interaction_cols_final_dtypes = {
    'user_id': 'int32', 'video_id': 'int32', 'play_duration': 'int32',
    'video_duration': 'int32', 'time': 'str', 'date': 'int32',
    'timestamp': 'float32', 'watch_ratio': 'float32'
}

# User features
user_features_selected_cols = {
    'user_id': 'int32', 'user_active_degree': 'category',
    'is_lowactive_period': 'float32', # ADDED BACK
    'is_live_streamer': 'float32', 'is_video_author': 'float32',
    'follow_user_num': 'int32',      # ADDED BACK
    'fans_user_num': 'int32', 'register_days': 'int32',
}
onehot_feature_names = []
for i in range(18):
    col_name = f'onehot_feat{i}'
    user_features_selected_cols[col_name] = 'float32'
    onehot_feature_names.append(col_name)

item_categories_selected_cols = {'video_id': 'int32', 'feat': 'str'}

# Item daily features
item_daily_initial_load_cols = {
    'video_id': 'float32', 'date': 'float32', 'author_id': 'float32',
    'video_type': 'category', 'upload_dt': 'str', 'video_duration': 'float32',
    'show_cnt': 'float32', 'play_cnt': 'float32', 'like_cnt': 'float32',
    'complete_play_cnt': 'float32', 'play_progress': 'float32',
    'video_tag_id': 'float32',
}
item_daily_cols_to_keep_for_merge = [
    'video_id', 'item_stats_date', 'author_id', 'video_type', 'upload_dt_parsed',
    'video_duration_daily', 'daily_play_progress',
    'daily_play_per_show_ratio', 'daily_like_per_play_ratio', 'daily_completion_rate',
    'video_tag_id'
]

print("--- Phase 1: Data Preparation & Feature Engineering (Slightly Richer V3) ---")
print("--- Step 1.1: Loading Selected Raw Data & Initial Cleaning ---")

def post_process_interaction_df(df, final_dtypes_map):
    int_cols_with_potential_na = ['user_id', 'video_id', 'play_duration', 'video_duration', 'date']
    for col in int_cols_with_potential_na:
        if col in df.columns:
            fill_value = -1 if ('id' in col or 'date' in col) else 0
            df[col] = df[col].fillna(fill_value).astype(final_dtypes_map[col])
    for col in ['timestamp', 'watch_ratio']:
        if col in df.columns: df[col] = df[col].astype(final_dtypes_map[col])
    return df

print("Loading big_matrix.csv...")
df_big_interactions = pd.read_csv(os.path.join(RAW_DATA_BASE_PATH, "big_matrix.csv"),
                                  usecols=interaction_cols_initial_load.keys(),
                                  dtype=interaction_cols_initial_load)
df_big_interactions = post_process_interaction_df(df_big_interactions, interaction_cols_final_dtypes)
print(f"Loaded and processed big_matrix: {df_big_interactions.shape}")

print("\nLoading small_matrix.csv...")
df_small_interactions = pd.read_csv(os.path.join(RAW_DATA_BASE_PATH, "small_matrix.csv"),
                                    usecols=interaction_cols_initial_load.keys(),
                                    dtype=interaction_cols_initial_load)
df_small_interactions = post_process_interaction_df(df_small_interactions, interaction_cols_final_dtypes)
print(f"Loaded and processed small_matrix: {df_small_interactions.shape}")

print("\nLoading user_features.csv (selected columns)...")
df_user_features = pd.read_csv(os.path.join(RAW_DATA_BASE_PATH, "user_features.csv"),
                               usecols=user_features_selected_cols.keys(),
                               dtype=user_features_selected_cols)
for col in onehot_feature_names:
    df_user_features[col] = df_user_features[col].fillna(-1).astype('int16')
for col in ['is_lowactive_period', 'is_live_streamer', 'is_video_author']: # ADDED is_lowactive_period
    if col in df_user_features.columns: # Check if loaded
        if df_user_features[col].isnull().any(): df_user_features[col] = df_user_features[col].fillna(-1).astype('int8')
        else: df_user_features[col] = df_user_features[col].astype('int8')
print(f"Loaded user_features: {df_user_features.shape}")

print("\nLoading item_categories.csv...")
df_item_categories = pd.read_csv(os.path.join(RAW_DATA_BASE_PATH, "item_categories.csv"),
                                 usecols=item_categories_selected_cols.keys(),
                                 dtype=item_categories_selected_cols)
def parse_feat_list(feat_str):
    if isinstance(feat_str, list): return len(feat_str)
    if pd.isna(feat_str) or not isinstance(feat_str, str): return 0
    try: return len(eval(feat_str))
    except: return 0
df_item_categories['num_item_tags'] = df_item_categories['feat'].apply(parse_feat_list).astype('int16')
df_item_categories = df_item_categories.drop(columns=['feat'])
print(f"Loaded item_categories: {df_item_categories.shape}")

print("\nLoading item_daily_features.csv (initial load for processing)...")
df_item_daily_full_load = pd.read_csv(os.path.join(RAW_DATA_BASE_PATH, "item_daily_features.csv"),
                                usecols=item_daily_initial_load_cols.keys(),
                                dtype=item_daily_initial_load_cols)
df_item_daily_processed = df_item_daily_full_load.rename(columns={
    'date': 'item_stats_date', 'video_duration': 'video_duration_daily',
    'show_cnt': 'daily_show_cnt', 'play_cnt': 'daily_play_cnt',
    'like_cnt': 'daily_like_cnt', 'complete_play_cnt': 'daily_complete_play_cnt',
    'play_progress': 'daily_play_progress'
})
del df_item_daily_full_load
gc.collect()

# NA fill and type conversion for item_daily_processed
id_cols_daily = ['video_id', 'author_id', 'video_tag_id'] # Added video_tag_id
count_cols_daily = ['daily_show_cnt', 'daily_play_cnt', 'daily_like_cnt', 'daily_complete_play_cnt']
for col in id_cols_daily:
    if col in df_item_daily_processed.columns:
        df_item_daily_processed[col] = df_item_daily_processed[col].fillna(-1).astype('int32')
if 'item_stats_date' in df_item_daily_processed.columns: # item_stats_date was original 'date'
    df_item_daily_processed['item_stats_date'] = df_item_daily_processed['item_stats_date'].fillna(-1).astype('int32')
for col in count_cols_daily:
    if col in df_item_daily_processed.columns:
        df_item_daily_processed[col] = df_item_daily_processed[col].fillna(0).astype('int32')
if 'video_duration_daily' in df_item_daily_processed.columns:
    df_item_daily_processed['video_duration_daily'] = df_item_daily_processed['video_duration_daily'].fillna(0).astype('float32')
if 'daily_play_progress' in df_item_daily_processed.columns:
    df_item_daily_processed['daily_play_progress'] = df_item_daily_processed['daily_play_progress'].fillna(0).astype('float32')
print(f"Processed initial item_daily_features: {df_item_daily_processed.shape}")
gc.collect()

--- Phase 1: Data Preparation & Feature Engineering (Slightly Richer V3) ---
--- Step 1.1: Loading Selected Raw Data & Initial Cleaning ---
Loading big_matrix.csv...
Loaded and processed big_matrix: (12530806, 8)

Loading small_matrix.csv...
Loaded and processed small_matrix: (4676570, 8)

Loading user_features.csv (selected columns)...
Loaded user_features: (7176, 26)

Loading item_categories.csv...
Loaded item_categories: (10728, 2)

Loading item_daily_features.csv (initial load for processing)...
Processed initial item_daily_features: (343341, 12)


0

In [2]:
# --- Step 1.2: Globally process df_item_daily for derived features & make it ultra-lean ---
print("\nGlobally processing df_item_daily for derived features & slimming...")
if 'upload_dt' in df_item_daily_processed.columns:
    df_item_daily_processed['upload_dt_parsed'] = pd.to_datetime(df_item_daily_processed['upload_dt'], errors='coerce')
else:
    df_item_daily_processed['upload_dt_parsed'] = pd.NaT # Placeholder if not loaded

# Ratio calculations (with checks)
if 'daily_play_cnt' in df_item_daily_processed.columns and 'daily_show_cnt' in df_item_daily_processed.columns:
    df_item_daily_processed['daily_play_per_show_ratio'] = (df_item_daily_processed['daily_play_cnt'] / (df_item_daily_processed['daily_show_cnt'] + 1e-6)).astype('float32')
else:
    df_item_daily_processed['daily_play_per_show_ratio'] = np.float32(0.0)
if 'daily_like_cnt' in df_item_daily_processed.columns and 'daily_play_cnt' in df_item_daily_processed.columns:
    df_item_daily_processed['daily_like_per_play_ratio'] = (df_item_daily_processed['daily_like_cnt'] / (df_item_daily_processed['daily_play_cnt'] + 1e-6)).astype('float32')
else:
    df_item_daily_processed['daily_like_per_play_ratio'] = np.float32(0.0)
if 'daily_complete_play_cnt' in df_item_daily_processed.columns and 'daily_play_cnt' in df_item_daily_processed.columns:
    df_item_daily_processed['daily_completion_rate'] = (df_item_daily_processed['daily_complete_play_cnt'] / (df_item_daily_processed['daily_play_cnt'] + 1e-6)).astype('float32')
else:
    df_item_daily_processed['daily_completion_rate'] = np.float32(0.0)

if 'video_type' in df_item_daily_processed.columns and df_item_daily_processed['video_type'].dtype.name != 'category':
    df_item_daily_processed['video_type'] = df_item_daily_processed['video_type'].astype('category')
    if 'Unknown' not in df_item_daily_processed['video_type'].cat.categories:
        df_item_daily_processed['video_type'] = df_item_daily_processed['video_type'].cat.add_categories('Unknown')

actual_cols_to_keep = [col for col in item_daily_cols_to_keep_for_merge if col in df_item_daily_processed.columns]
df_item_daily_lean_for_merge = df_item_daily_processed[actual_cols_to_keep].copy()
print(f"Slimmed df_item_daily_lean_for_merge for merging. Shape: {df_item_daily_lean_for_merge.shape}, Columns: {df_item_daily_lean_for_merge.columns.tolist()}")

print("\nChecking for duplicate keys in df_item_daily_lean_for_merge...")
key_cols_daily = ['video_id', 'item_stats_date']
actual_key_cols_daily = [col for col in key_cols_daily if col in df_item_daily_lean_for_merge.columns]
if len(actual_key_cols_daily) == len(key_cols_daily):
    duplicate_daily_keys = df_item_daily_lean_for_merge.duplicated(subset=actual_key_cols_daily, keep=False)
    num_duplicate_daily_keys = duplicate_daily_keys.sum()
    print(f"Number of rows involved in duplicate ({', '.join(actual_key_cols_daily)}) keys: {num_duplicate_daily_keys}")
    if num_duplicate_daily_keys > 0:
        df_item_daily_lean_for_merge = df_item_daily_lean_for_merge.drop_duplicates(subset=actual_key_cols_daily, keep='first').copy()
        print(f"Shape of df_item_daily_lean_for_merge after dropping duplicates: {df_item_daily_lean_for_merge.shape}")
else:
    print(f"Warning: Key columns for daily features duplicate check not found. Skipping.")
del df_item_daily_processed
gc.collect()


def engineer_features_for_interactions(df_interactions_raw, df_user_features_ref, df_item_categories_ref, df_item_daily_ref_lean):
    print(f"Engineering features for interaction table with shape: {df_interactions_raw.shape}")
    df = df_interactions_raw.copy()

    df['interaction_datetime'] = pd.to_datetime(df['time'], errors='coerce')
    df['interaction_hour'] = df['interaction_datetime'].dt.hour.fillna(-1).astype('int8')
    df['interaction_day_of_week'] = df['interaction_datetime'].dt.dayofweek.fillna(-1).astype('int8')
    df = df.rename(columns={'date': 'interaction_date', 'video_duration': 'video_duration_interaction'})

    print("Merging user features...")
    df = pd.merge(df, df_user_features_ref, on='user_id', how='left')
    print(f"Shape after user features merge: {df.shape}")
    gc.collect()
    
    print("Merging static item features...")
    df = pd.merge(df, df_item_categories_ref, on='video_id', how='left')
    df['num_item_tags'] = df['num_item_tags'].fillna(0).astype('int16')
    print(f"Shape after static item features merge: {df.shape}")
    gc.collect()

    print("Merging dynamic item features (lean)...")
    df = pd.merge(df, df_item_daily_ref_lean,
                     left_on=['video_id', 'interaction_date'],
                     right_on=['video_id', 'item_stats_date'],
                     how='left')
    print(f"Shape after dynamic item features merge: {df.shape}")
    gc.collect()

    print("Post-merge feature engineering...")
    if 'upload_dt_parsed' in df.columns and 'interaction_datetime' in df.columns:
        video_age_delta = df['interaction_datetime'] - df['upload_dt_parsed']
        df['video_age_days'] = video_age_delta.dt.days.fillna(-1).astype('int16')
    else:
        df['video_age_days'] = pd.Series([-1]*len(df), index=df.index, dtype='int16')

    # Updated list based on item_daily_cols_to_keep_for_merge
    cols_from_daily_merge_to_fill = [
        'author_id', 'video_type', 'video_duration_daily', 'daily_play_progress',
        'daily_play_per_show_ratio', 'daily_like_per_play_ratio', 'daily_completion_rate',
        'video_tag_id' # ADDED for NA handling
    ]
    for col in cols_from_daily_merge_to_fill:
        if col in df.columns:
            if col in ['author_id', 'video_tag_id']: # video_tag_id is also an ID
                df[col] = df[col].fillna(-1).astype('int32')
            elif col == 'video_type':
                if df[col].isnull().any():
                    if df[col].dtype.name != 'category':
                         df[col] = pd.Categorical(df[col]) 
                    current_categories = df[col].cat.categories.tolist()
                    if 'Unknown' not in current_categories:
                        df[col] = df[col].cat.add_categories('Unknown')
                    df[col] = df[col].fillna('Unknown')
            else: 
                df[col] = df[col].fillna(0).astype('float32')
        else:
            print(f"Warning: Column '{col}' expected from daily merge not found post-merge.")
            
    df = df.sort_values(by='timestamp').reset_index(drop=True)
    print(f"Finished engineering. Shape: {df.shape}")
    return df

# --- Apply Feature Engineering ---
print("\n--- Step 1.3: Apply Feature Engineering to Big Matrix Interactions ---")
df_big_merged = engineer_features_for_interactions(df_big_interactions, df_user_features, df_item_categories, df_item_daily_lean_for_merge)
del df_big_interactions
gc.collect()

print("\n--- Step 1.4: Apply Feature Engineering to Small Matrix Interactions ---")
df_small_eval_features = engineer_features_for_interactions(df_small_interactions, df_user_features, df_item_categories, df_item_daily_lean_for_merge)
del df_small_interactions
gc.collect()


Globally processing df_item_daily for derived features & slimming...
Slimmed df_item_daily_lean_for_merge for merging. Shape: (343341, 11), Columns: ['video_id', 'item_stats_date', 'author_id', 'video_type', 'upload_dt_parsed', 'video_duration_daily', 'daily_play_progress', 'daily_play_per_show_ratio', 'daily_like_per_play_ratio', 'daily_completion_rate', 'video_tag_id']

Checking for duplicate keys in df_item_daily_lean_for_merge...
Number of rows involved in duplicate (video_id, item_stats_date) keys: 226111
Shape of df_item_daily_lean_for_merge after dropping duplicates: (194155, 11)

--- Step 1.3: Apply Feature Engineering to Big Matrix Interactions ---
Engineering features for interaction table with shape: (12530806, 8)
Merging user features...
Shape after user features merge: (12530806, 36)
Merging static item features...
Shape after static item features merge: (12530806, 37)
Merging dynamic item features (lean)...
Shape after dynamic item features merge: (12530806, 47)
Post-mer

19

In [3]:
# --- Step 1.5: Prepare Data for ALS (from df_big_merged) ---
print("\n--- Step 1.5: Prepare Data for ALS (Candidate Generation from Big Matrix) ---")
# Check if 'user_id' and 'video_id' columns exist before proceeding
if 'user_id' not in df_big_merged.columns or 'video_id' not in df_big_merged.columns:
    raise ValueError("user_id or video_id not found in df_big_merged for ALS prep.")

unique_users_als_np = df_big_merged['user_id'].unique()
unique_videos_als_np = df_big_merged['video_id'].unique()

user_to_idx = {int(user_id): i for i, user_id in enumerate(unique_users_als_np)}
idx_to_user = {i: int(user_id) for user_id, val in user_to_idx.items() for i, user_id_original_type in enumerate(unique_users_als_np) if int(user_id_original_type) == val } # More robust creation
idx_to_user = {i: int(user_id) for i, user_id in enumerate(unique_users_als_np)} # Simpler assuming unique_users_als_np is in order of user_to_idx values.
num_users_als = len(unique_users_als_np)

video_to_idx = {int(video_id): i for i, video_id in enumerate(unique_videos_als_np)}
idx_to_video = {i: int(video_id) for i, video_id in enumerate(unique_videos_als_np)}
num_videos_als = len(unique_videos_als_np)

del unique_users_als_np, unique_videos_als_np
gc.collect()

als_user_ids = df_big_merged['user_id'].map(lambda x: user_to_idx.get(int(x)))
als_item_ids = df_big_merged['video_id'].map(lambda x: video_to_idx.get(int(x)))

# Check for NaNs introduced by .get() if some IDs were not in the mapping (should not happen if maps are from same df)
if als_user_ids.isnull().any() or als_item_ids.isnull().any():
    print("Warning: NaNs found in ALS mapped IDs. This indicates an issue with ID mapping.")
    als_user_ids = als_user_ids.fillna(-1).astype(int) # Handle potential NaNs, though ideally should not occur
    als_item_ids = als_item_ids.fillna(-1).astype(int)

als_ratings = df_big_merged['watch_ratio'].astype('float32')
als_ratings_clipped = np.maximum(als_ratings, 0.001)

interaction_matrix_als = csr_matrix((als_ratings_clipped, (als_user_ids, als_item_ids)),
                                    shape=(num_users_als, num_videos_als))
print(f"ALS Sparse Matrix Shape: {interaction_matrix_als.shape}")

with open(os.path.join(PROCESSED_DATA_PATH, 'user_to_idx_als.json'), 'w') as f: json.dump(user_to_idx, f)
with open(os.path.join(PROCESSED_DATA_PATH, 'video_to_idx_als.json'), 'w') as f: json.dump(video_to_idx, f)
with open(os.path.join(PROCESSED_DATA_PATH, 'idx_to_user_als.json'), 'w') as f: json.dump(idx_to_user, f)
with open(os.path.join(PROCESSED_DATA_PATH, 'idx_to_video_als.json'), 'w') as f: json.dump(idx_to_video, f)
del als_user_ids, als_item_ids, als_ratings, als_ratings_clipped
gc.collect()


--- Step 1.5: Prepare Data for ALS (Candidate Generation from Big Matrix) ---
ALS Sparse Matrix Shape: (7176, 10728)


128

In [4]:
# --- Step 1.6: Define Train/Test Split for LightGBM (from df_big_merged) & Save All Processed Data ---
print("\n--- Step 1.6: Define Train/Test Split (from Big Matrix) & Prepare LightGBM Data ---")
split_point = int(len(df_big_merged) * 0.8)
train_df_lgbm = df_big_merged.iloc[:split_point].copy()
test_df_lgbm = df_big_merged.iloc[split_point:].copy()
del df_big_merged
gc.collect()

print(f"LGBM Train data shape (from big matrix): {train_df_lgbm.shape}")
print(f"LGBM Test data shape (from big matrix): {test_df_lgbm.shape}")

target_col = 'watch_ratio'
cols_to_drop_lgbm = [
    'time', 'timestamp', 'interaction_datetime', 'upload_dt', 'upload_dt_parsed',
    'item_stats_date', 'play_duration'
]
# Define categorical_features_lgbm dynamically based on what's available and intended
# Start with known categoricals
categorical_features_lgbm = ['user_id', 'video_id', 'user_active_degree', 'interaction_hour', 'interaction_day_of_week']
# Add categoricals from user_features
user_cat_flags = ['is_lowactive_period', 'is_live_streamer', 'is_video_author'] # these were made int8
for flag in user_cat_flags:
    if flag in train_df_lgbm.columns: categorical_features_lgbm.append(flag)
categorical_features_lgbm.extend(onehot_feature_names) # these are int16
# Add categoricals from item_daily_features
daily_cat_features = ['author_id', 'video_type', 'video_tag_id'] # video_tag_id added back
for feat in daily_cat_features:
    if feat in train_df_lgbm.columns: categorical_features_lgbm.append(feat)

# Ensure all categorical features are actually present and correctly typed
final_lgbm_categorical_features = []
dfs_to_prep_cats = {'train_lgbm': train_df_lgbm, 'test_lgbm': test_df_lgbm, 'small_eval': df_small_eval_features}
for df_name, df_curr in dfs_to_prep_cats.items():
    print(f"Processing categoricals for {df_name}...")
    for col in categorical_features_lgbm: # Iterate over the master list
        if col in df_curr.columns:
            if df_curr[col].dtype.name != 'category':
                if df_curr[col].isnull().any():
                    if col in ['user_active_degree', 'video_type']:
                         if hasattr(df_curr[col], 'cat') and 'Unknown_Merge_NA' not in df_curr[col].cat.categories:
                             df_curr[col] = df_curr[col].cat.add_categories("Unknown_Merge_NA")
                         df_curr[col] = df_curr[col].fillna("Unknown_Merge_NA")
                    else: # ID-like or onehot indices
                         df_curr[col] = df_curr[col].fillna(-1) # Fill before astype
                df_curr[col] = df_curr[col].astype('category')
            if df_name == 'train_lgbm' and col not in final_lgbm_categorical_features : # Build final list from train_df
                final_lgbm_categorical_features.append(col)
        # else:
            # print(f"Info: Categorical feature '{col}' defined but not found in {df_name}.")

# Identify numerical features dynamically
all_cols_train = list(train_df_lgbm.columns)
numerical_features_lgbm = [
    col for col in all_cols_train
    if col not in final_lgbm_categorical_features and \
       col != target_col and \
       col not in cols_to_drop_lgbm
]
lgbm_feature_columns = final_lgbm_categorical_features + numerical_features_lgbm
print(f"Final categorical features for LightGBM: {final_lgbm_categorical_features}")
print(f"Final numerical features for LightGBM: {numerical_features_lgbm}")
print(f"Total features for LightGBM: {len(lgbm_feature_columns)}")

print("\nSaving processed LightGBM training, testing, and small_matrix evaluation data...")
X_train = train_df_lgbm[lgbm_feature_columns]
y_train = train_df_lgbm[target_col]
pd.concat([X_train, y_train.rename(target_col)], axis=1).to_parquet(os.path.join(PROCESSED_DATA_PATH, 'lightgbm_train_data.parquet'), index=False)
del X_train, y_train, train_df_lgbm
gc.collect()

X_test = test_df_lgbm[lgbm_feature_columns]
y_test = test_df_lgbm[target_col]
pd.concat([X_test, y_test.rename(target_col)], axis=1).to_parquet(os.path.join(PROCESSED_DATA_PATH, 'lightgbm_test_data.parquet'), index=False)
test_df_lgbm[['user_id', 'video_id', target_col]].to_csv(os.path.join(PROCESSED_DATA_PATH, 'ground_truth_test_big_matrix.csv'), index=False)
del X_test, y_test, test_df_lgbm
gc.collect()

# For df_small_eval_features, ensure it has all lgbm_feature_columns and the target_col
# It might be missing columns if some features were not applicable or all NaN for small matrix data
small_eval_cols_present = [col for col in lgbm_feature_columns if col in df_small_eval_features.columns]
if target_col in df_small_eval_features.columns:
    small_eval_cols_present.append(target_col)

df_small_eval_features[small_eval_cols_present].to_parquet(os.path.join(PROCESSED_DATA_PATH, 'small_matrix_eval_features_data.parquet'), index=False)
print("Data preparation and saving complete.")


--- Step 1.6: Define Train/Test Split (from Big Matrix) & Prepare LightGBM Data ---
LGBM Train data shape (from big matrix): (10024644, 48)
LGBM Test data shape (from big matrix): (2506162, 48)
Processing categoricals for train_lgbm...
Processing categoricals for test_lgbm...
Processing categoricals for small_eval...
Final categorical features for LightGBM: ['user_id', 'video_id', 'user_active_degree', 'interaction_hour', 'interaction_day_of_week', 'is_lowactive_period', 'is_live_streamer', 'is_video_author', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17', 'author_id', 'video_type', 'video_tag_id']
Final numerical features for LightGBM: ['video_duration_interaction', 'interaction_date', 'follow_user_num', 'fans_user_num', 'register_days', 'n

In [5]:
# --- Phase 2: Model Training ---
print("\n--- Phase 2: Model Training ---")
import implicit
import joblib
import lightgbm as lgb

print("Training ALS Model...")
als_model = implicit.als.AlternatingLeastSquares(factors=100, regularization=0.1, iterations=20, use_cg=True, calculate_training_loss=True)
als_model.fit(interaction_matrix_als)
joblib.dump(als_model, os.path.join(MODELS_PATH, "als_model.joblib"))
print(f"ALS model saved to: {os.path.join(MODELS_PATH, 'als_model.joblib')}")
del interaction_matrix_als # Free up ALS matrix
gc.collect()

print("\nTraining LightGBM Model...")
train_lgbm_loaded_df = pd.read_parquet(os.path.join(PROCESSED_DATA_PATH, 'lightgbm_train_data.parquet'))
y_train_lgbm_loaded = train_lgbm_loaded_df[target_col]
# X_train_lgbm_loaded needs to use the dynamically determined lgbm_feature_columns
# However, lgbm_feature_columns was defined in the previous cell. We need to ensure it's available
# or reload it. For simplicity, assume it's available or re-derive if necessary.
# For now, just drop target. The saved parquet should only have these columns.
X_train_lgbm_loaded = train_lgbm_loaded_df.drop(columns=[target_col])
del train_lgbm_loaded_df
gc.collect()

# final_lgbm_categorical_features was also defined in the previous cell.
# For robustness, re-check dtypes. Parquet usually preserves category well.
categorical_for_lgbm_dataset = []
for col in final_lgbm_categorical_features: # Use the list built from train_df previously
    if col in X_train_lgbm_loaded.columns:
        if X_train_lgbm_loaded[col].dtype.name != 'category':
            X_train_lgbm_loaded[col] = X_train_lgbm_loaded[col].astype('category')
        categorical_for_lgbm_dataset.append(col)


lgbm_params_config = {
    'objective': 'regression_l1', 'metric': ['mae', 'rmse'], 'boosting_type': 'gbdt',
    'num_leaves': 63, 'learning_rate': 0.05, 'feature_fraction': 0.8,
    'bagging_fraction': 0.8, 'bagging_freq': 1, 'n_estimators': 1000,
    'verbose': -1, 'n_jobs': -1, 'seed': 42
}

lgb_train_dataset = lgb.Dataset(X_train_lgbm_loaded, y_train_lgbm_loaded,
                                categorical_feature=categorical_for_lgbm_dataset, # Use the refined list
                                free_raw_data=False)

params_for_training = lgbm_params_config.copy()
num_boost_round_from_params = params_for_training.pop('n_estimators', 1000)

model_lgbm_trained = lgb.train(
    params=params_for_training,
    train_set=lgb_train_dataset,
    num_boost_round=num_boost_round_from_params
)
model_lgbm_trained.save_model(os.path.join(MODELS_PATH, "lightgbm_ranker_model.txt"))
print(f"LightGBM model saved to: {os.path.join(MODELS_PATH, 'lightgbm_ranker_model.txt')}")
del X_train_lgbm_loaded, y_train_lgbm_loaded, lgb_train_dataset, model_lgbm_trained
gc.collect()

print("\n--- Model Training Complete ---")


--- Phase 2: Model Training ---
Training ALS Model...


  0%|          | 0/20 [00:00<?, ?it/s]

ALS model saved to: ../models/als_model.joblib

Training LightGBM Model...
LightGBM model saved to: ../models/lightgbm_ranker_model.txt

--- Model Training Complete ---


In [10]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import os
import json

# Assuming utils.py is in ../scripts/
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))
from utils import precision_at_k, recall_at_k, ndcg_at_k


# --- Configuration ---
PROCESSED_DATA_PATH = "../data/"
MODELS_PATH = "../models/"
TARGET_COL = 'watch_ratio'
K_VALUES = [5, 10, 20, 50, 100, 250]
RELEVANCE_THRESHOLD = 1.0 # Watch ratio > this threshold means relevant for ranking metrics

print(f"\n--- Phase 3: Model Evaluation on Small Matrix ---")

# --- Step 3.1: Load Trained Model and Evaluation Data ---
print("Loading LightGBM model...")
lgbm_model_path = os.path.join(MODELS_PATH, "lightgbm_ranker_model.txt")
model_lgbm_loaded = lgb.Booster(model_file=lgbm_model_path)
print("Model loaded.")

print(f"\nLoading small matrix evaluation data (features + target)...")
small_matrix_eval_df_path = os.path.join(PROCESSED_DATA_PATH, 'small_matrix_eval_features_data.parquet')
df_small_eval = pd.read_parquet(small_matrix_eval_df_path)
print(f"Loaded small matrix evaluation data: {df_small_eval.shape}")
print(f"Columns in df_small_eval: {df_small_eval.columns.tolist()}")

# Identify feature columns (these should be all columns except the target)
# It's crucial that these match the features the model was trained on.
# The parquet file was saved with features + target.
lgbm_feature_columns_eval = [col for col in df_small_eval.columns if col != TARGET_COL]

# Ensure categorical features are correctly typed for prediction
# We need the list of categorical feature names used during training.
# This list should be consistent with what was defined in Step 1.6 of data prep.
# For this script, let's redefine it or ensure it's loaded/passed correctly.
# Assuming it was saved or can be reconstructed (as in the data prep script)

# Reconstruct categorical feature list (should match training)
onehot_feature_names_eval = [f'onehot_feat{i}' for i in range(18)]
categorical_features_lgbm_eval = ['user_id', 'video_id', 'user_active_degree', 'interaction_hour', 'interaction_day_of_week']
user_cat_flags_eval = ['is_lowactive_period', 'is_live_streamer', 'is_video_author']
for flag in user_cat_flags_eval:
    if flag in df_small_eval.columns: categorical_features_lgbm_eval.append(flag)
categorical_features_lgbm_eval.extend(onehot_feature_names_eval)
daily_cat_features_eval = ['author_id', 'video_type', 'video_tag_id']
for feat in daily_cat_features_eval:
    if feat in df_small_eval.columns: categorical_features_lgbm_eval.append(feat)

final_categorical_for_eval = []
print("\nVerifying and casting categorical features in evaluation data...")
for col in categorical_features_lgbm_eval:
    if col in df_small_eval.columns:
        if df_small_eval[col].dtype.name != 'category':
            # Fill NA before astype if any (should have been handled in data prep, but as a safeguard)
            if df_small_eval[col].isnull().any():
                if col in ['user_active_degree', 'video_type']:
                     # df_small_eval[col] = df_small_eval[col].cat.add_categories("Unknown_Eval_NA") # This line can cause error if not category yet
                     df_small_eval[col] = df_small_eval[col].astype(str).fillna("Unknown_Eval_NA") # Convert to str first
                else:
                     df_small_eval[col] = df_small_eval[col].fillna(-1)
            df_small_eval[col] = df_small_eval[col].astype('category')
        final_categorical_for_eval.append(col) # Keep track of actual categoricals found
    # else:
        # print(f"Info: Categorical feature '{col}' defined but not found in small_eval_df.")

# Ensure lgbm_feature_columns_eval aligns with what's in df_small_eval
lgbm_feature_columns_eval = [col for col in lgbm_feature_columns_eval if col in df_small_eval.columns]
print(f"Number of features for evaluation: {len(lgbm_feature_columns_eval)}")


--- Phase 3: Model Evaluation on Small Matrix ---
Loading LightGBM model...
Model loaded.

Loading small matrix evaluation data (features + target)...
Loaded small matrix evaluation data: (4676570, 42)
Columns in df_small_eval: ['user_id', 'video_id', 'user_active_degree', 'interaction_hour', 'interaction_day_of_week', 'is_lowactive_period', 'is_live_streamer', 'is_video_author', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17', 'author_id', 'video_type', 'video_tag_id', 'video_duration_interaction', 'interaction_date', 'follow_user_num', 'fans_user_num', 'register_days', 'num_item_tags', 'video_duration_daily', 'daily_play_progress', 'daily_play_per_show_ratio', 'daily_like_per_play_ratio', 'daily_completion_rate', 'video_age_days', 'watch_ra

In [11]:
# --- Step 3.2: Make Predictions ---
print("\nMaking predictions on small matrix data...")
X_small_eval_features = df_small_eval[lgbm_feature_columns_eval]
actual_watch_ratios = df_small_eval[TARGET_COL]

predicted_watch_ratios = model_lgbm_loaded.predict(X_small_eval_features)
df_small_eval['predicted_watch_ratio'] = predicted_watch_ratios
print("Predictions complete.")
gc.collect()


Making predictions on small matrix data...
Predictions complete.


13

In [12]:
# --- Step 3.3: Pointwise Evaluation Metrics (RMSE, MAE) ---
print(f"\n--- Pointwise Evaluation (Overall) ---")
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse_overall = np.sqrt(mean_squared_error(actual_watch_ratios, predicted_watch_ratios))
mae_overall = mean_absolute_error(actual_watch_ratios, predicted_watch_ratios)

print(f"Overall RMSE on Small Matrix: {rmse_overall:.4f}")
print(f"Overall MAE on Small Matrix:  {mae_overall:.4f}")


--- Pointwise Evaluation (Overall) ---
Overall RMSE on Small Matrix: 1.3214
Overall MAE on Small Matrix:  0.3473


In [14]:
# --- Step 3.4: Ranking Evaluation Metrics (Per User, then Averaged) ---
print(f"\n--- Ranking Evaluation (Per User, then Averaged) ---")
print(f"Using relevance threshold: watch_ratio > {RELEVANCE_THRESHOLD}")

all_user_metrics = {k: {'precision': [], 'recall': [], 'ndcg': []} for k in K_VALUES}
unique_users_in_small_matrix = df_small_eval['user_id'].unique()
num_eval_users = len(unique_users_in_small_matrix)

print(f"Evaluating ranking for {num_eval_users} users...")

# Group data by user_id for per-user evaluation
grouped_small_eval = df_small_eval.groupby('user_id', observed=False)

for i, (user_id, user_data) in enumerate(grouped_small_eval):
    if (i + 1) % 200 == 0: # Print progress
        print(f"Processed {i+1}/{num_eval_users} users for ranking metrics...")

    # Sort items by predicted watch_ratio for this user (descending)
    user_pred_items_sorted = user_data.sort_values(by='predicted_watch_ratio', ascending=False)['video_id'].tolist()
    
    # Get true relevant items for this user
    user_true_relevant_items = set(user_data[user_data[TARGET_COL] > RELEVANCE_THRESHOLD]['video_id'].tolist())

    if not user_true_relevant_items: # Skip user if they have no relevant items by our definition
        continue

    for k in K_VALUES:
        p_at_k = precision_at_k(user_true_relevant_items, user_pred_items_sorted, k)
        r_at_k = recall_at_k(user_true_relevant_items, user_pred_items_sorted, k)
        n_at_k = ndcg_at_k(user_true_relevant_items, user_pred_items_sorted, k)
        
        all_user_metrics[k]['precision'].append(p_at_k)
        all_user_metrics[k]['recall'].append(r_at_k)
        all_user_metrics[k]['ndcg'].append(n_at_k)

print("Calculating average ranking metrics...")
avg_metrics_report = {}
for k in K_VALUES:
    avg_precision = np.mean(all_user_metrics[k]['precision']) if all_user_metrics[k]['precision'] else 0
    avg_recall = np.mean(all_user_metrics[k]['recall']) if all_user_metrics[k]['recall'] else 0
    avg_ndcg = np.mean(all_user_metrics[k]['ndcg']) if all_user_metrics[k]['ndcg'] else 0
    
    avg_metrics_report[f"Precision@{k}"] = avg_precision
    avg_metrics_report[f"Recall@{k}"] = avg_recall
    avg_metrics_report[f"nDCG@{k}"] = avg_ndcg
    
    print(f"\nMetrics for K={k}:")
    print(f"  Avg Precision@{k}: {avg_precision:.4f}")
    print(f"  Avg Recall@{k}:    {avg_recall:.4f}")
    print(f"  Avg nDCG@{k}:      {avg_ndcg:.4f}")

print("\n--- Evaluation on Small Matrix Complete ---")


--- Ranking Evaluation (Per User, then Averaged) ---
Using relevance threshold: watch_ratio > 1.0
Evaluating ranking for 1411 users...
Processed 200/1411 users for ranking metrics...
Processed 400/1411 users for ranking metrics...
Processed 600/1411 users for ranking metrics...
Processed 800/1411 users for ranking metrics...
Processed 1000/1411 users for ranking metrics...
Processed 1200/1411 users for ranking metrics...
Processed 1400/1411 users for ranking metrics...
Calculating average ranking metrics...

Metrics for K=5:
  Avg Precision@5: 0.9137
  Avg Recall@5:    0.0048
  Avg nDCG@5:      0.9146

Metrics for K=10:
  Avg Precision@10: 0.9117
  Avg Recall@10:    0.0095
  Avg nDCG@10:      0.9128

Metrics for K=20:
  Avg Precision@20: 0.9019
  Avg Recall@20:    0.0188
  Avg nDCG@20:      0.9057

Metrics for K=50:
  Avg Precision@50: 0.8571
  Avg Recall@50:    0.0443
  Avg nDCG@50:      0.8706

Metrics for K=100:
  Avg Precision@100: 0.8234
  Avg Recall@100:    0.0848
  Avg nDCG@100