# 4. Evaluation & Submission

**Goal:**
1.  Load the trained LightGBM ranking model and associated components (encoders, embeddings, feature list).
2.  Load the test user-item pairs (`interactions_test.csv`) and the necessary feature data.
3.  Prepare the feature set for the test pairs, ensuring consistency with the training feature engineering.
4.  Generate prediction scores using the LightGBM model.
5.  Evaluate the model's ranking performance using standard metrics (Precision@K, Recall@K, NDCG@K) against the ground truth.
6.  Generate the final `submission.csv` file.

## Setup
Import libraries, define paths, and load necessary evaluation functions.

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
import pickle
from tqdm.notebook import tqdm # Progress bar
import ast
import os
import sys
import warnings

warnings.filterwarnings('ignore')

# Add scripts directory to path to import utils
sys.path.append('../scripts/')
try:
    from utils import precision_at_k, recall_at_k, ndcg_at_k
except ImportError:
    print("ERROR: Could not import from scripts/utils.py. Ensure the file exists.")

    # Inline fallback metrics, used only when scripts/utils.py is unavailable.
    # Each one accepts the ground truth as any iterable of hashable item ids
    # (set, list, tuple, ...) and converts it to a set internally, so a
    # list-valued ground truth no longer fails on `set & list`.

    def precision_at_k(y_true_items, y_pred_items, k):
        """Return the fraction of the top-k predictions that are relevant.

        Args:
            y_true_items: Iterable of relevant item ids.
            y_pred_items: Sequence of predicted item ids, best first.
            k: Cut-off rank.

        Returns:
            Number of distinct relevant items in the top k divided by k,
            or 0 when k is not positive.
        """
        if k <= 0:
            return 0
        relevant = set(y_true_items)
        hits = len(set(y_pred_items[:k]) & relevant)
        return hits / k

    def recall_at_k(y_true_items, y_pred_items, k):
        """Return the fraction of relevant items found in the top-k predictions.

        Args:
            y_true_items: Iterable of relevant item ids.
            y_pred_items: Sequence of predicted item ids, best first.
            k: Cut-off rank.

        Returns:
            Number of distinct relevant items in the top k divided by the
            number of distinct relevant items, or 0 when there are none.
        """
        relevant = set(y_true_items)
        if not relevant:
            return 0
        hits = len(set(y_pred_items[:k]) & relevant)
        return hits / len(relevant)

    def ndcg_at_k(y_true_items, y_pred_items, k):
        """Return binary-relevance NDCG over the top-k predictions.

        Args:
            y_true_items: Iterable of relevant item ids.
            y_pred_items: Sequence of predicted item ids, best first.
            k: Cut-off rank.

        Returns:
            DCG of the top k divided by the ideal DCG (all relevant items
            ranked first), or 0 when the ideal DCG is zero.
        """
        if k <= 0:
            return 0
        relevant = set(y_true_items)
        # A hit at 0-based rank i is discounted by log2(i + 2).
        dcg = 0.0
        for rank, item in enumerate(y_pred_items[:k]):
            if item in relevant:
                dcg += 1.0 / np.log2(rank + 2)
        # Ideal ordering places min(|relevant|, k) hits at the top ranks.
        idcg = 0.0
        for rank in range(min(len(relevant), k)):
            idcg += 1.0 / np.log2(rank + 2)
        return dcg / idcg if idcg > 0 else 0

# --- Configuration ---
# Root directories; every input and artifact path below is derived from these.
DATA_DIR = "../data/"
RAW_DATA_DIR = "../raw_data/KuaiRec/data/"
MODEL_DIR = "../models/"

# Output location of the final submission file.
SUBMISSION_PATH = "../submission.csv"

# Inputs read from the processed data directory.
INTERACTIONS_TEST_PATH = os.path.join(DATA_DIR, "interactions_test.csv")
INTERACTIONS_TRAIN_PATH = os.path.join(DATA_DIR, "interactions_train.csv")  # source of interaction counts
GROUND_TRUTH_PATH = os.path.join(DATA_DIR, "test_user_item_map.pkl")
VIDEO_METADATA_PATH = os.path.join(DATA_DIR, "video_metadata.csv")

# Input read from the raw KuaiRec data directory.
USER_FEATURES_PATH = os.path.join(RAW_DATA_DIR, "user_features.csv")

# Trained model and the artifacts saved alongside it.
LGBM_MODEL_PATH = os.path.join(MODEL_DIR, "lgbm_ranker_model.joblib")
LGBM_FEATURES_PATH = os.path.join(MODEL_DIR, "lgbm_feature_cols.pkl")
USER_ENCODER_PATH = os.path.join(MODEL_DIR, "user_encoder.pkl")
ITEM_ENCODER_PATH = os.path.join(MODEL_DIR, "item_encoder.pkl")
USER_EMBEDDINGS_PATH = os.path.join(MODEL_DIR, "user_embeddings.npy")
ITEM_EMBEDDINGS_PATH = os.path.join(MODEL_DIR, "item_embeddings.npy")

## Load Trained Models and Supporting Data
Load the ranker, feature list, encoders, embeddings, test pairs, and ground truth.

In [None]:
# Load every artifact the evaluation needs: the trained model, its feature
# list, the id encoders, the ALS embeddings, the test pairs and the ground truth.
print("Loading models and supporting data...")
try:
    # NOTE(security): joblib.load and pickle.load execute code embedded in the
    # file being read; only load artifacts produced by this project's notebooks.
    lgbm_ranker = joblib.load(LGBM_MODEL_PATH)
    with open(LGBM_FEATURES_PATH, 'rb') as f:
        feature_cols = pickle.load(f)
    print(f"Loaded LightGBM model and {len(feature_cols)} feature columns.")

    with open(USER_ENCODER_PATH, 'rb') as f:
        user_encoder = pickle.load(f)
    with open(ITEM_ENCODER_PATH, 'rb') as f:
        item_encoder = pickle.load(f)
    user_embeddings = np.load(USER_EMBEDDINGS_PATH)
    item_embeddings = np.load(ITEM_EMBEDDINGS_PATH)
    # Embedding width is read from the user matrix; the user/item counts come
    # from the encoders' learned classes.
    factors = user_embeddings.shape[1]
    n_users = len(user_encoder.classes_)
    n_items = len(item_encoder.classes_)
    print(f"Loaded encoders (n_users={n_users}, n_items={n_items}) and ALS embeddings (factors={factors}).")

    test_interactions = pd.read_csv(INTERACTIONS_TEST_PATH)
    print(f"Loaded {len(test_interactions)} test user-item pairs.")

    with open(GROUND_TRUTH_PATH, 'rb') as f:
        test_user_item_map = pickle.load(f)
    print(f"Loaded ground truth map for {len(test_user_item_map)} users.")

except FileNotFoundError as e:
    print(f"Error loading file: {e}")
    print("Ensure notebooks 1 and 3 were run successfully and files are in correct locations.")
    # Re-raise instead of calling exit(): a bare exit() reports a success status
    # when run as a script and ends the interactive session in a notebook.
    # Propagating the error stops execution with the real cause visible.
    raise

### Load Feature Data for Merging

In [None]:
print("Loading feature source data...")
try:
    user_features = pd.read_csv(USER_FEATURES_PATH)
except FileNotFoundError:
    # No user-feature file: continue without user features and leave both
    # column lists empty so later steps have nothing to impute or convert.
    print("User features file not found. Features relying on it will be imputed.")
    user_features = None
    user_cat_cols, user_num_cols = [], []
else:
    user_features = user_features.set_index('user_id')
    # Categorical columns: the activity degree plus onehot_feat0..onehot_feat17.
    user_cat_cols = ['user_active_degree']
    user_cat_cols.extend(f'onehot_feat{i}' for i in range(18))
    user_num_cols = [
        'is_lowactive_period',
        'is_live_streamer',
        'is_video_author',
        'follow_user_num',
        'fans_user_num',
        'friend_user_num',
        'register_days',
    ]
    user_feature_cols = user_cat_cols + user_num_cols
    # Retain only the wanted columns that the file actually provides, which
    # also reduces memory usage.
    user_features = user_features[
        [name for name in user_feature_cols if name in user_features.columns]
    ]
    print("Loaded user features.")

# Raw video metadata; the per-item features are derived from it afterwards.
item_metadata = pd.read_csv(VIDEO_METADATA_PATH)
def parse_list_string(s):
    """Parse a stringified Python list such as ``"[1, 2]"`` into a list.

    Args:
        s: Value to parse; anything that is not a ``str`` (e.g. NaN or None)
            is treated as missing.

    Returns:
        The parsed items as a list. An empty list is returned when ``s`` is
        not a string, is not a valid Python literal, or evaluates to something
        other than a list, tuple or set, so callers can always take ``len()``.
    """
    if not isinstance(s, str):
        return []
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval is documented to raise on malformed
        # input; anything else (e.g. KeyboardInterrupt) should propagate.
        return []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    # A scalar literal such as "5" is not a category list.
    return []
# Reduce the video metadata to one feature per item: the number of entries in
# its parsed `feat` list, indexed by item_id.
item_metadata['feat_list'] = item_metadata['feat'].apply(parse_list_string)
item_metadata = (
    item_metadata
    .assign(num_categories=item_metadata['feat_list'].apply(len))
    .set_index('item_id')[['num_categories']]
)
print("Loaded and processed item metadata.")

# Interaction counts are taken from the *training* interactions, not the test pairs.
print("Calculating interaction counts from training data...")
train_interactions_full = pd.read_csv(INTERACTIONS_TRAIN_PATH)
rows_per_user = train_interactions_full.groupby('user_id').size()
rows_per_item = train_interactions_full.groupby('item_id').size()
user_counts = rows_per_user.rename('user_interaction_count')
item_counts = rows_per_item.rename('item_interaction_count')
# The full training frame is no longer needed once the counts exist.
del train_interactions_full, rows_per_user, rows_per_item
print("Calculated interaction counts.")

## Prepare Features for Test Set
Apply the *exact same* feature engineering steps used for the training data in Notebook 3.

In [None]:
# Build the model input for the test pairs: merge feature sources, attach
# embeddings, impute gaps, then select the training feature columns in order.
print("Preparing features for the test set...")
test_data_for_ranker = test_interactions.copy()

# --- 1. Merge Features ---
# Left merges keep every test pair; pairs without a match get NaN and are
# imputed in step 3.
if user_features is not None:
    test_data_for_ranker = test_data_for_ranker.merge(user_features, on='user_id', how='left')
test_data_for_ranker = test_data_for_ranker.merge(item_metadata, on='item_id', how='left')
test_data_for_ranker = test_data_for_ranker.merge(user_counts, on='user_id', how='left')
test_data_for_ranker = test_data_for_ranker.merge(item_counts, on='item_id', how='left')

# --- 2. Add Embeddings ---
# Map raw ids to the encoder's internal indices. Ids the encoders never saw
# become -1, which matches no embedding row, so their embedding columns come
# out of the merge as NaN and are zero-filled in step 3.
user_map = {id_: idx for idx, id_ in enumerate(user_encoder.classes_)}
item_map = {id_: idx for idx, id_ in enumerate(item_encoder.classes_)}
test_data_for_ranker['user_idx'] = test_data_for_ranker['user_id'].map(user_map).fillna(-1).astype(int)
test_data_for_ranker['item_idx'] = test_data_for_ranker['item_id'].map(item_map).fillna(-1).astype(int)

# Embedding frames indexed 0..n-1 so they join on the internal indices.
user_emb_df = pd.DataFrame(user_embeddings, index=pd.RangeIndex(n_users))
item_emb_df = pd.DataFrame(item_embeddings, index=pd.RangeIndex(n_items))
user_emb_df.columns = [f'user_emb_{i}' for i in range(factors)]
item_emb_df.columns = [f'item_emb_{i}' for i in range(factors)]

test_data_for_ranker = test_data_for_ranker.merge(user_emb_df, left_on='user_idx', right_index=True, how='left')
test_data_for_ranker = test_data_for_ranker.merge(item_emb_df, left_on='item_idx', right_index=True, how='left')

# --- 3. Impute Missing Values ---
# Filled columns are assigned back rather than using
# `df[col].fillna(..., inplace=True)`: that chained form is deprecated in
# pandas 2.x and leaves the frame unchanged when Copy-on-Write is enabled.
num_cols_to_impute = ['num_categories', 'user_interaction_count', 'item_interaction_count'] + user_num_cols
for col in num_cols_to_impute:
    if col in test_data_for_ranker.columns:
        test_data_for_ranker[col] = test_data_for_ranker[col].fillna(0)

# Categorical user features use -1 as the "missing" value.
if user_features is not None:
    for col in user_cat_cols:
        if col in test_data_for_ranker.columns:
            test_data_for_ranker[col] = test_data_for_ranker[col].fillna(-1)

# Users/items not seen in training get an all-zero embedding.
emb_cols = [f'user_emb_{i}' for i in range(factors)] + [f'item_emb_{i}' for i in range(factors)]
test_data_for_ranker[emb_cols] = test_data_for_ranker[emb_cols].fillna(0)

print(f"NaN count after test imputation: {test_data_for_ranker.isnull().sum().sum()}")

# --- 4. Select Features and Convert Types ---
# Every training feature must be present; selecting by `feature_cols` also
# reproduces the training column order.
missing_features = [f for f in feature_cols if f not in test_data_for_ranker.columns]
if missing_features:
    print("ERROR: Test data missing required feature columns!")
    print(f"Missing: {missing_features}")
    # Raise instead of exit() so the failure is not reported as a clean exit
    # and an interactive session is not shut down.
    raise KeyError(f"Test data missing required feature columns: {missing_features}")
# Explicit copy: the dtype conversions below must not write into a slice of
# test_data_for_ranker.
X_test = test_data_for_ranker[feature_cols].copy()

# Convert categorical features to category dtype (matching training).
# NOTE(review): categories are inferred from the test values alone; confirm
# they line up with the categories the model was trained on.
if user_features is not None:
    for col in user_cat_cols:
        if col in X_test.columns:
            X_test[col] = X_test[col].astype('category')

print(f"Test data feature preparation complete. Shape for prediction: {X_test.shape}")

## Generate Predictions
Use the trained LightGBM model to predict the probability of a positive interaction for each test pair; this probability is the score used for ranking and for the submission.

In [None]:
print("Generating predictions using LightGBM ranker...")
# Column 1 of predict_proba is taken as the positive-class (class 1) probability.
class_probabilities = lgbm_ranker.predict_proba(X_test)
predictions = class_probabilities[:, 1]

# One row per test pair: the two ids plus the model score.
results_df = test_data_for_ranker[['user_id', 'item_id']].assign(score=predictions)

print(f"Generated {len(results_df)} predictions.")
print(results_df.head())

## Evaluate Model Performance
Calculate ranking metrics (Precision@K, Recall@K, NDCG@K) by comparing predicted rankings against the ground truth.

In [None]:
if test_user_item_map:
    print("Evaluating model ranking performance...")

    # Sort once by descending score; rows keep that order inside each user's
    # group, so a group's item_id column is that user's ranked list.
    user_predictions_grouped = results_df.sort_values('score', ascending=False).groupby('user_id')
    users_with_predictions = user_predictions_grouped.groups

    K_values = [10, 20, 50]
    precisions = {k: [] for k in K_values}
    recalls = {k: [] for k in K_values}
    ndcgs = {k: [] for k in K_values}
    # Pair each per-K result store with the metric that fills it.
    metric_table = (
        (precisions, precision_at_k),
        (recalls, recall_at_k),
        (ndcgs, ndcg_at_k),
    )

    evaluated_users_count = 0
    for user_id, true_items in tqdm(test_user_item_map.items(), desc="Evaluating Users"):
        # Skip users that have no scored pairs, and users whose ground truth
        # is empty; neither can contribute a meaningful metric.
        if user_id not in users_with_predictions or not true_items:
            continue

        recommended_items = user_predictions_grouped.get_group(user_id)['item_id'].tolist()

        for k in K_values:
            for store, metric in metric_table:
                store[k].append(metric(true_items, recommended_items, k))

        evaluated_users_count += 1

    # --- Print Average Metrics ---
    print(f"\n--- Evaluation Results (on {evaluated_users_count} users with ground truth) ---")
    if evaluated_users_count == 0:
        print("No users could be evaluated. Check ground truth data and filtering steps.")
    else:
        for k in K_values:
            # Metrics are averaged over the evaluated users.
            avg_precision = np.mean(precisions[k])
            avg_recall = np.mean(recalls[k])
            avg_ndcg = np.mean(ndcgs[k])
            print(f"@{k}:")
            print(f"  Avg Precision@{k}: {avg_precision:.4f}")
            print(f"  Avg Recall@{k}   : {avg_recall:.4f}")
            print(f"  Avg NDCG@{k}     : {avg_ndcg:.4f}")
else:
    print("Skipping evaluation as ground truth map is not available.")

## Generate Final Submission File
Save the predictions in the required `user_id,item_id,score` format.

In [None]:
# results_df already holds everything the submission needs; select the three
# columns explicitly so the output column order is fixed.
submission_columns = ['user_id', 'item_id', 'score']
submission_df = results_df[submission_columns]

print(f"Saving final submission file with {len(submission_df)} predictions to: {SUBMISSION_PATH}")
# index=False keeps the row index out of the CSV.
submission_df.to_csv(SUBMISSION_PATH, index=False)
print("--- Submission file created successfully. --- ")