# KRAFT: Model Evaluation

This notebook evaluates the performance of our trained recommender system components. We will focus on two main types of evaluation:

1.  **Evaluation on `small_matrix` (Dense Subset):**
    *   Since `small_matrix` represents a nearly fully-observed set of interactions for a subset of users and items (that were also part of the `big_matrix` training data), we can perform a detailed evaluation here.
    *   We will primarily evaluate the **LightGBM ranker** directly on all (user, item) pairs within the `small_matrix` scope.
    *   Metrics will include pointwise prediction accuracy (RMSE, MAE for `watch_ratio`) and ranking metrics (Precision@k, Recall@k, nDCG@k).

2.  **End-to-End Evaluation on `big_matrix` Holdout Set (Standard Evaluation):**
    *   This involves the two-stage recommendation process:
        1.  **Candidate Generation:** Use the trained ALS model to generate a list of candidate items for each user in the test set (derived from `big_matrix`).
        2.  **Ranking:** Use the trained LightGBM model to re-rank these candidates.
    *   Metrics will focus on ranking performance (Precision@k, Recall@k, nDCG@k) of the final re-ranked list against the ground truth interactions in the `big_matrix` test set.

## 1. Imports and Configuration

Import necessary libraries, load custom evaluation metric functions from `utils.py`, and define paths and evaluation parameters.

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import implicit # For ALS model loading, though joblib is used for saving/loading
import joblib
import gc
import os
import json
from tqdm import tqdm # For progress bars
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Add scripts directory to sys.path to import utils
import sys
# Resolve the project root: when the working directory is a folder named
# 'notebooks' the root is its parent, otherwise it is the working directory.
current_dir = os.getcwd()
base_dir = (
    os.path.dirname(current_dir)
    if os.path.basename(current_dir) == 'notebooks'
    else current_dir
)

# Register <root>/scripts on the import path, at most once, so that modules
# living there can be imported.
scripts_path = os.path.join(base_dir, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from utils import precision_at_k, recall_at_k, ndcg_at_k

# --- Path Definitions & Configuration ---
# Relative locations of the prepared datasets and of the serialized models.
PROCESSED_DATA_PATH = "../data/"
MODELS_PATH = "../models/"

# Name of the column holding the regression target.
TARGET_COL = 'watch_ratio'

# Ranking-evaluation settings.
RANKING_RELEVANCE_THRESHOLD = 1.0  # an item is relevant when watch_ratio > threshold
K_VALUES_FOR_RANKING = [5, 10, 20, 50, 100]  # cut-offs k for Precision/Recall/nDCG@k

# Number of candidates to generate per user from ALS.
ALS_NUM_CANDIDATES = 250

## 2. Evaluation on `small_matrix` (Dense Subset)

Here, we evaluate the LightGBM ranker's performance on the `small_matrix` data. Since this subset is nearly fully observed for its specific users and items, we can directly assess pointwise prediction accuracy and ranking quality without a separate candidate generation step.

In [None]:
print("\n--- Evaluating LightGBM Ranker on Small Matrix ---")

# --- Load LightGBM Model ---
# The booster is restored from the saved model file. A LightGBM load error is
# reported with a hint and then re-raised so the notebook stops here.
print("Loading LightGBM model...")
lgbm_model_path = os.path.join(MODELS_PATH, "lightgbm_ranker_model.txt")
try:
    model_lgbm_loaded = lgb.Booster(model_file=lgbm_model_path)
except lgb.basic.LightGBMError as e:
    print(f"Error loading LightGBM model: {e}. Ensure the model file exists and is valid.")
    raise
else:
    print("LightGBM model loaded.")

# --- Load Small Matrix Evaluation Data ---
# Read the small-matrix evaluation table (features plus target). A missing
# file is reported with a pointer to the preparation step and re-raised.
print("\nLoading small matrix evaluation data...")
small_matrix_eval_df_path = os.path.join(PROCESSED_DATA_PATH, 'small_matrix_eval_features_data.parquet')
try:
    df_small_eval = pd.read_parquet(small_matrix_eval_df_path)
except FileNotFoundError:
    print(f"Error: {small_matrix_eval_df_path} not found. Please run Data Preparation notebook first.")
    raise
else:
    print(f"Loaded small matrix evaluation data: {df_small_eval.shape}")

# --- Prepare Features and Target for Small Matrix Evaluation ---
# Every column except the target is passed to the model as a feature.
lgbm_feature_columns_small_eval = [col for col in df_small_eval.columns if col != TARGET_COL]

# Reconstruct the list of categorical feature names (must match training)
onehot_feature_names_s_eval = [f'onehot_feat{i}' for i in range(18)]
master_categorical_list_s_eval = ['user_id', 'video_id', 'user_active_degree', 'interaction_hour', 'interaction_day_of_week'] + \
                                 [flag for flag in ['is_lowactive_period', 'is_live_streamer', 'is_video_author'] if flag in df_small_eval.columns] + \
                                 onehot_feature_names_s_eval + \
                                 [feat for feat in ['author_id', 'video_type', 'video_tag_id'] if feat in df_small_eval.columns]

print("\nVerifying and casting categorical features for small matrix evaluation data...")
for col in master_categorical_list_s_eval:
    # Columns that are absent or already categorical need no work.
    if col not in df_small_eval.columns or df_small_eval[col].dtype.name == 'category':
        continue

    missing_mask = df_small_eval[col].isnull()
    if missing_mask.any():
        if col in ['user_active_degree', 'video_type']:
            # Label missing entries explicitly. The mask is taken BEFORE the
            # string cast: casting first can turn a missing value into literal
            # text such as 'nan', after which fillna() has nothing left to
            # fill and the placeholder is never applied.
            # NOTE(review): confirm that training-time preprocessing treats
            # missing values in these columns compatibly with this label.
            df_small_eval[col] = df_small_eval[col].astype(str).mask(missing_mask, "Unknown_Eval_NA")
        else:
            # Remaining categorical columns use -1 as the missing marker.
            df_small_eval[col] = df_small_eval[col].fillna(-1)
    df_small_eval[col] = df_small_eval[col].astype('category')

# Model inputs and the ground-truth target used by the metrics below.
X_small_eval_features = df_small_eval[lgbm_feature_columns_small_eval]
actual_watch_ratios_small = df_small_eval[TARGET_COL]

# --- Make Predictions ---
# Score every row and keep the scores both as a standalone array and as a
# new column on the evaluation frame (the ranking step sorts by it).
print("\nMaking predictions on small matrix data...")
predicted_watch_ratios_small = model_lgbm_loaded.predict(X_small_eval_features)
df_small_eval['predicted_watch_ratio'] = predicted_watch_ratios_small
print("Predictions complete for small matrix.")
gc.collect()

# --- Pointwise Evaluation Metrics (RMSE, MAE) for Small Matrix ---
print("\n--- Pointwise Evaluation on Small Matrix (Overall) ---")
mse_small_overall = mean_squared_error(actual_watch_ratios_small, predicted_watch_ratios_small)
rmse_small_overall = np.sqrt(mse_small_overall)
mae_small_overall = mean_absolute_error(actual_watch_ratios_small, predicted_watch_ratios_small)
print(f"Overall RMSE on Small Matrix: {rmse_small_overall:.4f}")
print(f"Overall MAE on Small Matrix:  {mae_small_overall:.4f}")

# --- Ranking Evaluation Metrics for Small Matrix ---
print("\n--- Ranking Evaluation on Small Matrix (Per User, then Averaged) ---")
print(f"Using relevance threshold: watch_ratio > {RANKING_RELEVANCE_THRESHOLD}")

# One list of per-user scores for every (k, metric) combination.
small_matrix_user_metrics = {k: {'precision': [], 'recall': [], 'ndcg': []} for k in K_VALUES_FOR_RANKING}
unique_users_small_matrix = df_small_eval['user_id'].unique()
num_eval_users_small = len(unique_users_small_matrix)
print(f"Evaluating ranking for {num_eval_users_small} users from small matrix...")

# observed=True restricts a categorical grouping key to the users that
# actually occur in the data.
grouped_small_eval = df_small_eval.groupby('user_id', observed=True)

# Metric name -> scoring function, in the order the scores are recorded.
small_matrix_metric_fns = (
    ('precision', precision_at_k),
    ('recall', recall_at_k),
    ('ndcg', ndcg_at_k),
)

for user_id, user_data in tqdm(grouped_small_eval, desc="Small Matrix User Ranking Eval"):
    # Items the user truly found relevant (target strictly above the threshold).
    relevant_rows = user_data.loc[user_data[TARGET_COL] > RANKING_RELEVANCE_THRESHOLD]
    user_true_relevant_items = set(relevant_rows['video_id'].tolist())

    # Users without any relevant item contribute nothing to the averages.
    if not user_true_relevant_items:
        continue

    # The user's items ordered from highest to lowest predicted score.
    ranked_rows = user_data.sort_values(by='predicted_watch_ratio', ascending=False)
    user_pred_items_sorted = ranked_rows['video_id'].tolist()

    for k_val in K_VALUES_FOR_RANKING:
        for metric_name, metric_fn in small_matrix_metric_fns:
            score = metric_fn(user_true_relevant_items, user_pred_items_sorted, k_val)
            small_matrix_user_metrics[k_val][metric_name].append(score)

print("\nCalculating average ranking metrics for Small Matrix...")
avg_metrics_report_small = {}
for k_val in K_VALUES_FOR_RANKING:
    per_user_scores = small_matrix_user_metrics[k_val]
    # Mean over the evaluated users; 0 when no user was evaluated at all.
    avg_precision, avg_recall, avg_ndcg = (
        np.mean(per_user_scores[metric_name]) if per_user_scores[metric_name] else 0
        for metric_name in ('precision', 'recall', 'ndcg')
    )

    avg_metrics_report_small.update({
        f"Precision@{k_val}": avg_precision,
        f"Recall@{k_val}": avg_recall,
        f"nDCG@{k_val}": avg_ndcg,
    })

    print(f"  Avg Precision@{k_val} (Small Matrix): {avg_precision:.4f}")
    print(f"  Avg Recall@{k_val}    (Small Matrix): {avg_recall:.4f}")
    print(f"  Avg nDCG@{k_val}      (Small Matrix): {avg_ndcg:.4f}")

# Drop the small-matrix frames and arrays before the next section loads its data.
del df_small_eval, X_small_eval_features, actual_watch_ratios_small, predicted_watch_ratios_small, grouped_small_eval
gc.collect()

## 3. End-to-End Evaluation on `big_matrix` Holdout Set

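This is the standard evaluation for the two-stage recommender system. The full pipeline is outlined below. Note that the code in this section currently implements a simplified variant: LightGBM ranks only the items already present in each user's `big_matrix` test interactions, and ALS-based candidate generation with feature construction for arbitrary candidates (steps 2a–2c) is left as a placeholder. The full pipeline involves: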
1. Loading the ALS model, LightGBM model, ID mappings, and the `big_matrix` test data (`lightgbm_test_data.parquet` and ground truth).
2. For each user in the test set:
    a. Use ALS to generate top-N candidate `video_id`s.
    b. Filter out candidates the user has already interacted with in the training set (if applicable, though our test set is time-split which mitigates this for *future* interactions).
    c. Construct feature vectors for these (user, candidate_item) pairs.
    d. Use LightGBM to predict `watch_ratio` for these candidates and re-rank them.
3. Calculate ranking metrics (P@k, R@k, nDCG@k) by comparing the re-ranked list against the user's actual interactions in the `big_matrix` test set.

In [None]:
print("\n--- End-to-End Evaluation on Big Matrix Holdout Set ---")

# --- Load ALS Model and ID Mappings ---
print("Loading ALS model and ID mappings...")
als_model_path = os.path.join(MODELS_PATH, "als_model.joblib")
try:
    # NOTE: joblib.load unpickles the file, so only load model files you trust.
    als_model_loaded = joblib.load(als_model_path)
except FileNotFoundError:
    print(f"Error: ALS model not found at {als_model_path}. Please run Model Training notebook.")
    raise

id_mapping_files = ['user_to_idx_als.json', 'idx_to_user_als.json', 'video_to_idx_als.json', 'idx_to_video_als.json']
id_mappings = {}
try:
    for f_name in id_mapping_files:
        with open(os.path.join(PROCESSED_DATA_PATH, f_name), 'r', encoding='utf-8') as f:
            mapping = json.load(f)
        # JSON object keys are always strings. Both mapping directions are
        # keyed by integers (indices for idx_to_*, original IDs for *_to_idx),
        # so the same int() conversion applies to every file; the previous
        # if/else had two identical branches.
        mapping_name = os.path.splitext(f_name)[0]
        id_mappings[mapping_name] = {int(k): v for k, v in mapping.items()}
    print("ALS ID mappings loaded.")
except FileNotFoundError:
    print("Error: ALS ID mapping files not found. Please run Data Preparation notebook.")
    raise

# Expose each mapping under its own name.
user_to_idx_als = id_mappings['user_to_idx_als']
idx_to_user_als = id_mappings['idx_to_user_als']
video_to_idx_als = id_mappings['video_to_idx_als']
idx_to_video_als = id_mappings['idx_to_video_als']

# --- Load Big Matrix Test Data and Ground Truth ---
# Both inputs come from the processed-data folder; a missing file is reported
# and the error re-raised so the notebook stops here.
test_lgbm_parquet_path = os.path.join(PROCESSED_DATA_PATH, 'lightgbm_test_data.parquet')
ground_truth_big_path = os.path.join(PROCESSED_DATA_PATH, 'ground_truth_test_big_matrix.csv')

print("\nLoading LightGBM test data (big matrix holdout)...")
try:
    # Features + target
    df_test_lgbm = pd.read_parquet(test_lgbm_parquet_path)
except FileNotFoundError:
    print(f"Error: {test_lgbm_parquet_path} not found. Please run Data Preparation.")
    raise

print("Loading ground truth for big matrix test set...")
try:
    df_ground_truth_big = pd.read_csv(ground_truth_big_path)
except FileNotFoundError:
    print(f"Error: {ground_truth_big_path} not found. Please run Data Preparation.")
    raise

print(f"Loaded LGBM test data: {df_test_lgbm.shape}")
print(f"Loaded ground truth data: {df_ground_truth_big.shape}")

# --- Prepare for Evaluation Loop ---
# The LightGBM model (`model_lgbm_loaded`) loaded for the small matrix
# evaluation is reused here.

# Ground-truth lookup: user_id -> set of video_ids whose target value is
# strictly above the relevance threshold.
relevant_ground_truth_big = df_ground_truth_big.loc[
    df_ground_truth_big[TARGET_COL] > RANKING_RELEVANCE_THRESHOLD
]
relevant_videos_by_user = relevant_ground_truth_big.groupby('user_id')['video_id']
user_true_items_big_test = relevant_videos_by_user.apply(set).to_dict()

# The df_test_lgbm contains all features needed for LightGBM predictions for user-item pairs in the test set.
# We need to simulate the candidate generation -> ranking pipeline.

# Features that LightGBM model expects (excluding target)
lgbm_feature_columns_big_test = [col for col in df_test_lgbm.columns if col != TARGET_COL]

# Ensure categoricals are typed (should be from parquet, but good check)
print("\nVerifying and casting categorical features for big matrix test data...")
master_categorical_list_big_eval = ['user_id', 'video_id', 'user_active_degree', 'interaction_hour', 'interaction_day_of_week'] + \
                                   [flag for flag in ['is_lowactive_period', 'is_live_streamer', 'is_video_author'] if flag in df_test_lgbm.columns] + \
                                   [f'onehot_feat{i}' for i in range(18)] + \
                                   [feat for feat in ['author_id', 'video_type', 'video_tag_id'] if feat in df_test_lgbm.columns]

for col in master_categorical_list_big_eval:
    # Columns that are absent or already categorical need no work.
    if col not in df_test_lgbm.columns or df_test_lgbm[col].dtype.name == 'category':
        continue

    missing_mask = df_test_lgbm[col].isnull()
    if missing_mask.any():
        if col in ['user_active_degree', 'video_type']:
            # Label missing entries explicitly. The mask is taken BEFORE the
            # string cast: casting first can turn a missing value into literal
            # text such as 'nan', after which fillna() has nothing left to
            # fill and the placeholder is never applied.
            # NOTE(review): confirm that training-time preprocessing treats
            # missing values in these columns compatibly with this label.
            df_test_lgbm[col] = df_test_lgbm[col].astype(str).mask(missing_mask, "Unknown_Eval_NA")
        else:
            # Remaining categorical columns use -1 as the missing marker.
            df_test_lgbm[col] = df_test_lgbm[col].fillna(-1)
    df_test_lgbm[col] = df_test_lgbm[col].astype('category')

# Model inputs for the whole test set.
X_test_lgbm_features_all = df_test_lgbm[lgbm_feature_columns_big_test]

# Scope of this evaluation (simplification):
# A complete pipeline would build a feature row for any (user, candidate_item)
# pair proposed by ALS, which requires separate access to user and item
# features. df_test_lgbm only holds interaction-level merged features, so
# candidates outside a user's test interactions have no feature row here.
# For simplicity of feature access we therefore rank only items ALREADY IN
# THE TEST SET for each user. ALS is consequently not used for candidate
# *generation* in this cell; the evaluation mirrors the small_matrix one.

print("Evaluating end-to-end ranking for users in big matrix test set...")
test_users_big_matrix = df_test_lgbm['user_id'].unique()
# One list of per-user scores for every (k, metric) combination.
big_matrix_e2e_metrics = {
    k: {metric_name: [] for metric_name in ('precision', 'recall', 'ndcg')}
    for k in K_VALUES_FOR_RANKING
}

print("Evaluating LightGBM ranking on items present in the Big Matrix test set (similar to small matrix eval)...")
# Score the test rows unless a prediction column is already present.
already_scored = 'predicted_watch_ratio' in df_test_lgbm.columns
if not already_scored:
    df_test_lgbm['predicted_watch_ratio'] = model_lgbm_loaded.predict(X_test_lgbm_features_all)
    print("Predictions made for big matrix test data.")
gc.collect()

grouped_test_lgbm = df_test_lgbm.groupby('user_id', observed=True)

# Metric name -> scoring function, in the order the scores are recorded.
big_matrix_metric_fns = (
    ('precision', precision_at_k),
    ('recall', recall_at_k),
    ('ndcg', ndcg_at_k),
)

for user_id, user_data in tqdm(grouped_test_lgbm, desc="Big Matrix Test User Ranking Eval"):
    # Relevant items come from the precomputed ground-truth lookup.
    true_relevant_for_user = user_true_items_big_test.get(user_id, set())

    # Users without any relevant item contribute nothing to the averages.
    if not true_relevant_for_user:
        continue

    # The user's test items ordered from highest to lowest predicted score.
    ranked_rows = user_data.sort_values(by='predicted_watch_ratio', ascending=False)
    user_pred_items_sorted = ranked_rows['video_id'].tolist()

    for k_val in K_VALUES_FOR_RANKING:
        for metric_name, metric_fn in big_matrix_metric_fns:
            score = metric_fn(true_relevant_for_user, user_pred_items_sorted, k_val)
            big_matrix_e2e_metrics[k_val][metric_name].append(score)

print("\nCalculating average ranking metrics for Big Matrix Test Set (LGBM ranking on test items)...")
avg_metrics_report_big_lgbm_only = {}
for k_val in K_VALUES_FOR_RANKING:
    per_user_scores = big_matrix_e2e_metrics[k_val]
    # Mean over the evaluated users; 0 when no user was evaluated at all.
    avg_precision, avg_recall, avg_ndcg = (
        np.mean(per_user_scores[metric_name]) if per_user_scores[metric_name] else 0
        for metric_name in ('precision', 'recall', 'ndcg')
    )

    avg_metrics_report_big_lgbm_only.update({
        f"Precision@{k_val}": avg_precision,
        f"Recall@{k_val}": avg_recall,
        f"nDCG@{k_val}": avg_ndcg,
    })

    print(f"  Avg Precision@{k_val} (Big Matrix Test - LGBM on test items): {avg_precision:.4f}")
    print(f"  Avg Recall@{k_val}    (Big Matrix Test - LGBM on test items): {avg_recall:.4f}")
    print(f"  Avg nDCG@{k_val}      (Big Matrix Test - LGBM on test items): {avg_ndcg:.4f}")

# --- Store all metrics ---
# Collect the results of every evaluation above into one JSON document.
small_matrix_pointwise_report = {
    "rmse": rmse_small_overall,
    "mae": mae_small_overall,
}
final_evaluation_report = {
    "small_matrix_pointwise": small_matrix_pointwise_report,
    "small_matrix_ranking": avg_metrics_report_small,
    "big_matrix_test_lgbm_rank_on_test_items": avg_metrics_report_big_lgbm_only,
    # Add true end-to-end (ALS+LGBM) metrics here once implemented
}

metrics_output_path = os.path.join(PROCESSED_DATA_PATH, "full_evaluation_metrics.json")
with open(metrics_output_path, 'w') as f:
    json.dump(final_evaluation_report, f, indent=4)
print(f"\nFull evaluation metrics saved to: {metrics_output_path}")

print("\n--- Model Evaluation Phase Complete ---")

# Placeholder for True End-to-End ALS + LGBM evaluation on big_matrix test set
placeholder_notice = (
    "\n--- Placeholder: True End-to-End ALS + LGBM Evaluation on Big Matrix Test ---",
    "This section requires generating candidates with ALS for each test user, ",
    "then building features for (user, candidate_item) pairs and ranking with LightGBM.",
    "This is more involved due to feature construction for arbitrary candidates and will be implemented separately if needed.",
)
for notice_line in placeholder_notice:
    print(notice_line)