In [102]:
import os
import nltk
import re
import pandas as pd
import numpy as np

from scripts.simulation_imports import *

from openai import OpenAI
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk import word_tokenize

In [103]:
# Data root: $RSYS_DATA (default "rsys_data/rsys_2025"), resolved under the home directory.
DATA_PATH = Path.home().joinpath(os.environ.get("RSYS_DATA", "rsys_data/rsys_2025"))
# Sub-directory of the data root that holds the generated slate files.
gen_slates_dir = DATA_PATH.joinpath("gen_slates")

In [104]:
# Load the slate table stored in wp_llm_slates.feather.
feather_file_path_wp = gen_slates_dir.joinpath("wp_llm_slates.feather")
df = pd.read_feather(feather_file_path_wp)



In [105]:
# NOTE: this rebinds DATA_PATH (set earlier from $RSYS_DATA) to the directory named by $DATA_PATH.
# Indexing os.environ instead of calling .get() makes a missing variable fail with
# KeyError('DATA_PATH') rather than an opaque TypeError raised by Path(None).
DATA_PATH = Path.home() / Path(os.environ["DATA_PATH"])
dataset_interaction_path = DATA_PATH / Path("MINDlarge_train/test_50.feather")
interaction_data = pd.read_feather(dataset_interaction_path)
# NOTE(review): dataset_path points at the same file as dataset_interaction_path, so
# category_data starts out as an independent second copy of interaction_data — confirm
# that no other file was intended here.
dataset_path = DATA_PATH / Path("MINDlarge_train/test_50.feather")
category_data = pd.read_feather(dataset_path)

In [106]:
# Indexing os.environ makes a missing DATA_PATH fail with KeyError('DATA_PATH') instead of
# an opaque TypeError raised by Path(None).
base_path = Path.home() / Path(os.environ["DATA_PATH"])
# News table; this notebook reads its itemId, embedding, title, category and subcategory columns.
news_df = pd.read_feather(
    base_path / Path("MINDlarge_train/news_glove_embed_50.feather")
)
# itemId -> embedding vector.
embedding_dict = dict(zip(news_df["itemId"], news_df["embedding"]))
# Reverse map: embedding (as a hashable tuple) -> itemId. Items without an embedding are
# skipped; when several items share an identical embedding, a later entry overwrites an
# earlier one, so the reverse lookup can only return one of them.
embedding_lookup = {
    tuple(item_embedding): item_id
    for item_id, item_embedding in embedding_dict.items()
    if item_embedding is not None
}

Wolpertinger+LLM Slates

In [107]:
# Per row: item ID of every candidate embedding ("Not Found" when the embedding is unknown).
candidate_ids = [
    [embedding_lookup.get(tuple(vector), "Not Found") for vector in row_candidates]
    for row_candidates in df["candidate_docs"]
]

# Per row: item ID of every embedding in the slate, using the same reverse lookup.
slate_item_ids = [
    [embedding_lookup.get(tuple(vector), "Not Found") for vector in row_slate]
    for row_slate in df["slate_docs_feature"]
]

In [108]:
def get_item_ids_and_titles(item_ids, news_df):
    """
    Pair each item ID with its title.

    Args:
        item_ids: Iterable of item IDs to look up.
        news_df: Table exposing "itemId" and "title" columns of equal length.

    Returns:
        List of (item_id, title) tuples in the order of item_ids; IDs that are
        absent from news_df get the title "Title not found".
    """
    # The itemId -> title map is rebuilt on every call.
    title_by_id = {}
    for news_id, news_title in zip(news_df["itemId"], news_df["title"]):
        title_by_id[news_id] = news_title

    pairs = []
    for item_id in item_ids:
        pairs.append((item_id, title_by_id.get(item_id, "Title not found")))
    return pairs

In [109]:
# Step 1: keep only rows with a non-empty LLM slate and add a hashable (tuple) version
# of the initial user state to group on.
df = df[df['llm_slate'].apply(lambda slate: len(slate) > 0)].copy()
df['initial_user_state_tuple'] = df['initial_user_state'].apply(tuple)

# Step 2: mean of 'hit' for each distinct initial user state.
grouped_means_rl_llm = (
    df.groupby('initial_user_state_tuple')['hit']
    .mean()
    .reset_index()
    .rename(columns={'hit': 'group_mean_hit'})
)

# Step 3: average of the group means (every user state weighs the same).
overall_mean = grouped_means_rl_llm['group_mean_hit'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_rl_llm, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  group_mean_hit
0   (-0.0259791798889637, 0.5851805210113525, 0.00...        0.250000
1   (0.049241334199905396, 0.23491184413433075, 0....        0.333333
2   (0.06917192041873932, 0.22154483199119568, 0.0...        0.000000
3   (0.0808219462633133, 0.280564546585083, -0.035...        0.142857
4   (0.11326862126588821, 0.38108861446380615, -0....        0.142857
..                                                ...             ...
95  (0.3616601228713989, 0.1373804360628128, 0.021...        0.000000
96  (0.3617928624153137, 0.11457429826259613, 0.01...        0.000000
97  (0.36578845977783203, 0.042487733066082, 0.081...        0.500000
98  (0.39353927969932556, 0.0698390007019043, -0.0...        0.000000
99  (0.3967207372188568, 0.18588663637638092, -0.0...        0.000000

[100 rows x 2 columns]

Overall average:
0.1825281385281385


In [110]:
# Item IDs of each row's slate, recovered from the slate embeddings
# ("Not Found" marks an embedding missing from embedding_lookup).
df['rl_slates'] = [
    [embedding_lookup.get(tuple(vector), "Not Found") for vector in row_slate]
    for row_slate in df['slate_docs_feature']
]


In [111]:
# 1 when the originally clicked item is among the slate's item IDs, otherwise 0.
df['rl_hit'] = df.apply(
    lambda row: int(row['original_click'] in row['rl_slates']),
    axis=1,
)

In [112]:


# Mean of 'rl_hit' for each distinct initial user state.
grouped_means_rl = (
    df.groupby('initial_user_state_tuple')['rl_hit']
    .mean()
    .reset_index()
    .rename(columns={'rl_hit': 'group_mean_rl_hit'})
)

# Average of the group means (every user state weighs the same).
overall_mean = grouped_means_rl['group_mean_rl_hit'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_rl, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  group_mean_rl_hit
0   (-0.0259791798889637, 0.5851805210113525, 0.00...           0.000000
1   (0.049241334199905396, 0.23491184413433075, 0....           0.000000
2   (0.06917192041873932, 0.22154483199119568, 0.0...           0.000000
3   (0.0808219462633133, 0.280564546585083, -0.035...           0.000000
4   (0.11326862126588821, 0.38108861446380615, -0....           0.142857
..                                                ...                ...
95  (0.3616601228713989, 0.1373804360628128, 0.021...           0.000000
96  (0.3617928624153137, 0.11457429826259613, 0.01...           0.000000
97  (0.36578845977783203, 0.042487733066082, 0.081...           0.000000
98  (0.39353927969932556, 0.0698390007019043, -0.0...           0.000000
99  (0.3967207372188568, 0.18588663637638092, -0.0...           0.000000

[100 rows x 2 columns]

Overall average:
0.047432900432900424


In [113]:


# df_filtered = df_filtered.reset_index(drop=True)

def hybrid_slate_optimization(row, n_replace=9, lambda_weight=1.0):
    """
    Replace the least relevant slate items with the best-scoring candidates, using a
    hybrid BM25 (title text) + cosine similarity (embedding) score.

    Every (slate item, candidate) pair is scored as
    ``lambda_weight * bm25 + (1 - lambda_weight) * cosine``; each of the two score
    matrices is min-max normalised over the whole matrix first.

    Args:
        row: A row from the DataFrame (passed via df.apply). Reads
            ``row["slate_docs_feature"]`` and ``row["candidate_docs"]``, each a
            sequence of embedding vectors.
        n_replace: Maximum number of slate items to replace. Defaults to 9, the value
            the code previously hard-coded (while its docstring and comments said 3).
        lambda_weight: Weight of the BM25 term; the cosine term gets
            ``1 - lambda_weight``. With the default of 1.0 only BM25 affects the result.

    Returns:
        Updated slate as a list of item IDs (a new list with the slate's length).

    Note:
        Uses the notebook-level ``embedding_lookup`` and ``news_df``. Embeddings missing
        from ``embedding_lookup`` become the ID "Not Found". Candidates are not checked
        against the current slate, so an inserted ID may duplicate one already present.
    """

    # Retrieve item IDs using the embedding lookup
    slate_item_ids = [embedding_lookup.get(tuple(embedding), "Not Found") for embedding in row["slate_docs_feature"]]
    candidate_item_ids = [embedding_lookup.get(tuple(embedding), "Not Found") for embedding in row["candidate_docs"]]

    # Nothing to replace: return an unchanged copy. This also avoids the [-0:] slice
    # below, which would select every candidate instead of none.
    if n_replace <= 0:
        return slate_item_ids[:]

    # Get titles for slate and candidate items
    slate_titles = [title for _, title in get_item_ids_and_titles(slate_item_ids, news_df)]
    candidate_titles = [title for _, title in get_item_ids_and_titles(candidate_item_ids, news_df)]

    # Whitespace-tokenize titles for BM25; the candidates form the corpus and each
    # slate title is used as a query against it.
    slate_tokens = [text.split() for text in slate_titles]
    candidate_tokens = [text.split() for text in candidate_titles]

    bm25 = BM25Okapi(candidate_tokens)
    bm25_scores = np.array([bm25.get_scores(tokens) for tokens in slate_tokens])  # (n_slate, n_candidates)

    # Cosine similarity between every slate embedding and every candidate embedding
    candidate_docs_matrix = np.vstack(row["candidate_docs"])
    slate_docs_feature_matrix = np.vstack(row["slate_docs_feature"])
    similarity_matrix = cosine_similarity(slate_docs_feature_matrix, candidate_docs_matrix)  # (n_slate, n_candidates)

    # Min-max normalize each matrix (epsilon guards against a constant matrix), then blend
    bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min() + 1e-6)
    cosine_norm = (similarity_matrix - similarity_matrix.min()) / (similarity_matrix.max() - similarity_matrix.min() + 1e-6)
    final_scores = lambda_weight * bm25_norm + (1 - lambda_weight) * cosine_norm  # (n_slate, n_candidates)

    # Slate items with the lowest average score across all candidates (weakest first)
    avg_slate_relevance = final_scores.mean(axis=1)
    least_relevant_indices = np.argsort(avg_slate_relevance)[:n_replace]

    # Candidates with the highest best-match score against any slate item
    # (ascending order, so the strongest candidate comes last)
    best_candidate_indices = np.argsort(final_scores.max(axis=0))[-n_replace:]

    # Replace the least relevant slate items with the selected candidates; zip stops at
    # the shorter sequence, so short slates or candidate lists are handled.
    updated_slate_item_ids = slate_item_ids[:]
    for slate_idx, candidate_idx in zip(least_relevant_indices, best_candidate_indices):
        updated_slate_item_ids[slate_idx] = candidate_item_ids[candidate_idx]

    return updated_slate_item_ids

# Apply function to DataFrame

df["slate_reranked"] = df.apply(hybrid_slate_optimization, axis=1)


# # Save the updated DataFrame
# df.to_csv("updated_slate_data.csv", index=False)





In [114]:
# 1 when the originally clicked item is among the re-ranked slate's item IDs, otherwise 0.
df['reranked_hit'] = df.apply(
    lambda row: int(row['original_click'] in row['slate_reranked']),
    axis=1,
)

In [115]:


# Mean of 'reranked_hit' for each distinct initial user state.
grouped_means_bm25 = (
    df.groupby('initial_user_state_tuple')['reranked_hit']
    .mean()
    .reset_index()
    .rename(columns={'reranked_hit': 'group_mean_reranked_hit'})
)

# Average of the group means (every user state weighs the same).
overall_mean = grouped_means_bm25['group_mean_reranked_hit'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_bm25, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  group_mean_reranked_hit
0   (-0.0259791798889637, 0.5851805210113525, 0.00...                 0.000000
1   (0.049241334199905396, 0.23491184413433075, 0....                 0.000000
2   (0.06917192041873932, 0.22154483199119568, 0.0...                 0.000000
3   (0.0808219462633133, 0.280564546585083, -0.035...                 0.000000
4   (0.11326862126588821, 0.38108861446380615, -0....                 0.142857
..                                                ...                      ...
95  (0.3616601228713989, 0.1373804360628128, 0.021...                 0.000000
96  (0.3617928624153137, 0.11457429826259613, 0.01...                 0.000000
97  (0.36578845977783203, 0.042487733066082, 0.081...                 0.000000
98  (0.39353927969932556, 0.0698390007019043, -0.0...                 0.000000
99  (0.3967207372188568, 0.18588663637638092, -0.0...                 0.000000

[100 rows x 2 columns]

Overa

In [116]:
# itemId -> category and itemId -> subcategory lookup tables.
item_to_category = {
    item_id: category
    for item_id, category in zip(news_df['itemId'], news_df['category'])
}
item_to_subcategory = {
    item_id: subcategory
    for item_id, subcategory in zip(news_df['itemId'], news_df['subcategory'])
}

# Number of distinct categories / subcategories in the whole news table.
total_categories = news_df['category'].nunique(dropna=True)
total_subcategories = news_df['subcategory'].nunique(dropna=True)

# Function to calculate diversity metrics for a given slate
def calculate_diversity(slate, item_to_category, item_to_subcategory):
    """
    Count the distinct categories and subcategories covered by a slate.

    Args:
        slate: Sequence of item IDs.
        item_to_category: Mapping item ID -> category.
        item_to_subcategory: Mapping item ID -> subcategory.

    Returns:
        Tuple (number of distinct categories, number of distinct subcategories).
        Items missing from a mapping are ignored for that mapping.
    """
    seen_categories = {item_to_category[item] for item in slate if item in item_to_category}
    seen_subcategories = {item_to_subcategory[item] for item in slate if item in item_to_subcategory}
    return len(seen_categories), len(seen_subcategories)

# Function to calculate S-Recall as a ratio
def calculate_s_recall(df, column, item_to_category, item_to_subcategory, total_categories, total_subcategories):
    """
    Compute per-slate diversity ratios for one slate column.

    For each slate the ratios are (distinct categories / slate length) and
    (distinct subcategories / slate length).

    Args:
        df: DataFrame holding the slates.
        column: Name of the column whose cells are slates (sequences of item IDs).
        item_to_category: Mapping item ID -> category.
        item_to_subcategory: Mapping item ID -> subcategory.
        total_categories: Not used by the computation; kept so existing calls still work.
        total_subcategories: Not used by the computation; kept so existing calls still work.

    Returns:
        List with one (category_ratio, subcategory_ratio) tuple per row, in row order.
        An empty slate yields (0.0, 0.0) instead of raising ZeroDivisionError.
    """
    results = []

    for slate in df[column]:
        slate_size = len(slate)
        if slate_size == 0:
            # Nothing is covered by an empty slate, and there is no length to divide by.
            results.append((0.0, 0.0))
            continue

        category_diversity, subcategory_diversity = calculate_diversity(slate, item_to_category, item_to_subcategory)

        # Ratios are relative to the slate length, not to the catalogue totals.
        s_recall_category = category_diversity / slate_size
        s_recall_subcategory = subcategory_diversity / slate_size

        results.append((s_recall_category, s_recall_subcategory))

    return results

# Per-slate diversity ratios for the LLM, RL and re-ranked slate columns.
llm_s_recall = calculate_s_recall(
    df,
    'llm_slate',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    total_categories=total_categories,
    total_subcategories=total_subcategories,
)
rl_s_recall = calculate_s_recall(
    df,
    'rl_slates',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    total_categories=total_categories,
    total_subcategories=total_subcategories,
)
slate_reranked_recall = calculate_s_recall(
    df,
    'slate_reranked',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    total_categories=total_categories,
    total_subcategories=total_subcategories,
)

# Calculate average S-Recall for each column
def calculate_average_s_recall(s_recall_results):
    """
    Average the per-slate ratios.

    Args:
        s_recall_results: Non-empty sequence of (category_ratio, subcategory_ratio) pairs.

    Returns:
        Tuple (mean category ratio, mean subcategory ratio).
    """
    category_total = sum(pair[0] for pair in s_recall_results)
    n_category = len(s_recall_results)
    subcategory_total = sum(pair[1] for pair in s_recall_results)
    n_subcategory = len(s_recall_results)
    return category_total / n_category, subcategory_total / n_subcategory

# Average the per-slate ratios of each slate variant.
llm_avg_category, llm_avg_subcategory = calculate_average_s_recall(s_recall_results=llm_s_recall)
rl_avg_category, rl_avg_subcategory = calculate_average_s_recall(s_recall_results=rl_s_recall)
slate_avg_category, slate_avg_subcategory = calculate_average_s_recall(s_recall_results=slate_reranked_recall)

# Report the averages, one paragraph per slate variant.
print(
    "Average S-Recall for llm_slate:",
    f"Category Level: {llm_avg_category:.4f}, Subcategory Level: {llm_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for rl_slates:",
    f"Category Level: {rl_avg_category:.4f}, Subcategory Level: {rl_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for slate_reranked:",
    f"Category Level: {slate_avg_category:.4f}, Subcategory Level: {slate_avg_subcategory:.4f}",
    sep="\n",
)

# Reference table: number of distinct subcategories under each category.
subcategory_count = (
    news_df.groupby('category')['subcategory']
    .nunique()
    .reset_index(name='subcategory_count')
)

print("\nNumber of subcategories for each category:", subcategory_count, sep="\n")

Average S-Recall for llm_slate:
Category Level: 0.3851, Subcategory Level: 0.6452

Average S-Recall for rl_slates:
Category Level: 0.5696, Subcategory Level: 0.8658

Average S-Recall for slate_reranked:
Category Level: 0.5266, Subcategory Level: 0.7889

Number of subcategories for each category:
         category  subcategory_count
0           autos                 25
1   entertainment                 14
2         finance                 33
3    foodanddrink                 16
4           games                  1
5          health                 23
6            kids                  6
7       lifestyle                 53
8      middleeast                  1
9          movies                  7
10          music                 11
11           news                 38
12   northamerica                  1
13         sports                 34
14         travel                 16
15             tv                 10
16          video                 15
17        weather                  3


In [117]:

# category -> array of the distinct subcategories found under it in news_df.
category_to_subcategories = (
    news_df.groupby('category')['subcategory']
    .unique()
    .to_dict()
)

# Function to calculate diversity metrics for a given slate
def calculate_diversity(slate, item_to_category, item_to_subcategory):
    """
    Collect the distinct categories and subcategories covered by a slate.

    Unlike the earlier definition of the same name, this one returns the sets
    themselves rather than their sizes.

    Args:
        slate: Sequence of item IDs.
        item_to_category: Mapping item ID -> category.
        item_to_subcategory: Mapping item ID -> subcategory.

    Returns:
        Tuple (set of categories, set of subcategories). Items missing from a
        mapping are ignored for that mapping.
    """
    seen_categories = {item_to_category[item] for item in slate if item in item_to_category}
    seen_subcategories = {item_to_subcategory[item] for item in slate if item in item_to_subcategory}
    return seen_categories, seen_subcategories

# Function to calculate S-Recall as a ratio
def calculate_s_recall(df, column, item_to_category, item_to_subcategory, category_to_subcategories):
    """
    Compute per-slate diversity ratios, with the subcategory ratio made relative to
    the categories present in the slate.

    Category ratio: distinct categories in the slate / slate length.
    Subcategory ratio: distinct subcategories in the slate / number of distinct
    subcategories that exist (per category_to_subcategories) under the slate's categories.

    Args:
        df: DataFrame holding the slates.
        column: Name of the column whose cells are slates (sequences of item IDs).
        item_to_category: Mapping item ID -> category.
        item_to_subcategory: Mapping item ID -> subcategory.
        category_to_subcategories: Dict category -> iterable of its subcategories.

    Returns:
        List with one (category_ratio, subcategory_ratio) tuple per row, in row order.
        A ratio whose denominator would be zero (empty slate, or no item of the slate
        has a known category) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    results = []

    for slate in df[column]:
        categories_in_slate, subcategories_in_slate = calculate_diversity(slate, item_to_category, item_to_subcategory)

        # Category level: relative to the slate length.
        slate_size = len(slate)
        s_recall_category = len(categories_in_slate) / slate_size if slate_size else 0.0

        # Subcategory level: relative to every subcategory available under the slate's
        # categories. .get() tolerates a category missing from the mapping.
        total_subcategories_in_categories = set()
        for category in categories_in_slate:
            total_subcategories_in_categories.update(category_to_subcategories.get(category, ()))

        if total_subcategories_in_categories:
            s_recall_subcategory = len(subcategories_in_slate) / len(total_subcategories_in_categories)
        else:
            s_recall_subcategory = 0.0

        results.append((s_recall_category, s_recall_subcategory))

    return results

# Per-slate diversity ratios (subcategory ratio relative to the slate's categories)
# for the LLM, RL and re-ranked slate columns.
llm_s_recall = calculate_s_recall(
    df,
    'llm_slate',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    category_to_subcategories=category_to_subcategories,
)
rl_s_recall = calculate_s_recall(
    df,
    'rl_slates',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    category_to_subcategories=category_to_subcategories,
)
slate_reranked_recall = calculate_s_recall(
    df,
    'slate_reranked',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    category_to_subcategories=category_to_subcategories,
)

# Calculate average S-Recall for each column
def calculate_average_s_recall(s_recall_results):
    """
    Average the per-slate ratios.

    Args:
        s_recall_results: Non-empty sequence of (category_ratio, subcategory_ratio) pairs.

    Returns:
        Tuple (mean category ratio, mean subcategory ratio).
    """
    category_total = sum(pair[0] for pair in s_recall_results)
    n_category = len(s_recall_results)
    subcategory_total = sum(pair[1] for pair in s_recall_results)
    n_subcategory = len(s_recall_results)
    return category_total / n_category, subcategory_total / n_subcategory

# Average the per-slate ratios of each slate variant.
llm_avg_category, llm_avg_subcategory = calculate_average_s_recall(s_recall_results=llm_s_recall)
rl_avg_category, rl_avg_subcategory = calculate_average_s_recall(s_recall_results=rl_s_recall)
slate_avg_category, slate_avg_subcategory = calculate_average_s_recall(s_recall_results=slate_reranked_recall)

# Report the averages, one paragraph per slate variant.
print(
    "Average S-Recall for llm_slate:",
    f"Category Level: {llm_avg_category:.4f}, Subcategory Level: {llm_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for rl_slates:",
    f"Category Level: {rl_avg_category:.4f}, Subcategory Level: {rl_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for slate_reranked:",
    f"Category Level: {slate_avg_category:.4f}, Subcategory Level: {slate_avg_subcategory:.4f}",
    sep="\n",
)

# Reference table: number of distinct subcategories under each category.
subcategory_count = (
    news_df.groupby('category')['subcategory']
    .nunique()
    .reset_index(name='subcategory_count')
)

print("\nNumber of subcategories for each category:", subcategory_count, sep="\n")

Average S-Recall for llm_slate:
Category Level: 0.3851, Subcategory Level: 0.0700

Average S-Recall for rl_slates:
Category Level: 0.5696, Subcategory Level: 0.0598

Average S-Recall for slate_reranked:
Category Level: 0.5266, Subcategory Level: 0.0577

Number of subcategories for each category:
         category  subcategory_count
0           autos                 25
1   entertainment                 14
2         finance                 33
3    foodanddrink                 16
4           games                  1
5          health                 23
6            kids                  6
7       lifestyle                 53
8      middleeast                  1
9          movies                  7
10          music                 11
11           news                 38
12   northamerica                  1
13         sports                 34
14         travel                 16
15             tv                 10
16          video                 15
17        weather                  3


In [118]:
# Make every observed state a hashable tuple (an empty tuple when the state is None).
category_data["observed_state"] = category_data["observed_state"].apply(
    lambda state: () if state is None else tuple(state)
)

In [119]:
# Join the interaction log onto the generated-slate rows, matching on the clicked item
# and the user state.
clicked_data_user_history = pd.merge(
    category_data,
    df,
    how='right',  # 'right' keeps every row of df; use 'inner' to keep only matching rows
    left_on=['click', 'observed_state'],
    right_on=['original_click', 'initial_user_state_tuple'],
)

In [120]:
# Function to extract titles from the list of tuples
def extract_titles(item_tuples):
    """Return the title (second element) of each (item_id, title) pair, in order."""
    titles = []
    for _item_id, title in item_tuples:
        titles.append(title)
    return titles

# Function to compute BLEU score between two lists of titles
def compute_bleu_score(reference, candidate):
    """
    Sentence-level BLEU of the first candidate title against all reference titles.

    Args:
        reference: Sequence of reference titles; each one is tokenized and used as a
            separate BLEU reference.
        candidate: Sequence of candidate titles.
            NOTE(review): only ``candidate[0]`` is scored and the remaining titles are
            ignored — confirm that this is the intended metric.

    Returns:
        The value returned by ``sentence_bleu`` (smoothing method 1), or 0.0 when
        either sequence is empty.
    """
    # An empty candidate list used to raise IndexError on candidate[0]; with nothing
    # to compare there is no overlap to report.
    if len(reference) == 0 or len(candidate) == 0:
        return 0.0

    reference_tokens = [word_tokenize(str(title)) for title in reference]
    candidate_tokens = word_tokenize(str(candidate[0]))  # a single tokenized sentence

    # Compute BLEU score
    smoothing = SmoothingFunction().method1
    return sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)
# BLEU of each generated slate against the titles of the presented slate, row by row.
# Reference = titles of 'presented_slate'; candidate = titles of the slate being scored.
clicked_data_user_history['bleu_rl_vs_presented'] = clicked_data_user_history.apply(
    lambda row: compute_bleu_score(
        reference=extract_titles(get_item_ids_and_titles(row['presented_slate'], news_df)),
        candidate=extract_titles(get_item_ids_and_titles(row['rl_slates'], news_df)),
    ),
    axis=1,
)

clicked_data_user_history['bleu_llm_vs_presented'] = clicked_data_user_history.apply(
    lambda row: compute_bleu_score(
        reference=extract_titles(get_item_ids_and_titles(row['presented_slate'], news_df)),
        candidate=extract_titles(get_item_ids_and_titles(row['llm_slate'], news_df)),
    ),
    axis=1,
)

clicked_data_user_history['bleu_reranked_vs_presented'] = clicked_data_user_history.apply(
    lambda row: compute_bleu_score(
        reference=extract_titles(get_item_ids_and_titles(row['presented_slate'], news_df)),
        candidate=extract_titles(get_item_ids_and_titles(row['slate_reranked'], news_df)),
    ),
    axis=1,
)

In [121]:
# Mean of 'bleu_rl_vs_presented' for each distinct initial user state.
grouped_means_bleu_rl = (
    clicked_data_user_history.groupby('initial_user_state_tuple')['bleu_rl_vs_presented']
    .mean()
    .reset_index()
    .rename(columns={'bleu_rl_vs_presented': 'group_mean_bleu_rl_vs_presented'})
)

# Average of the group means (every user state weighs the same).
overall_mean = grouped_means_bleu_rl['group_mean_bleu_rl_vs_presented'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_bleu_rl, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  \
0   (-0.0259791798889637, 0.5851805210113525, 0.00...   
1   (0.049241334199905396, 0.23491184413433075, 0....   
2   (0.06917192041873932, 0.22154483199119568, 0.0...   
3   (0.0808219462633133, 0.280564546585083, -0.035...   
4   (0.11326862126588821, 0.38108861446380615, -0....   
..                                                ...   
95  (0.3616601228713989, 0.1373804360628128, 0.021...   
96  (0.3617928624153137, 0.11457429826259613, 0.01...   
97  (0.36578845977783203, 0.042487733066082, 0.081...   
98  (0.39353927969932556, 0.0698390007019043, -0.0...   
99  (0.3967207372188568, 0.18588663637638092, -0.0...   

    group_mean_bleu_rl_vs_presented  
0                          1.000000  
1                          0.348927  
2                          0.020448  
3                          0.027184  
4                          0.109868  
..                              ...  
95                         

In [122]:
# Mean of 'bleu_reranked_vs_presented' for each distinct initial user state.
grouped_means_bleu_bm25 = (
    clicked_data_user_history.groupby('initial_user_state_tuple')['bleu_reranked_vs_presented']
    .mean()
    .reset_index()
    .rename(columns={'bleu_reranked_vs_presented': 'group_mean_bleu_reranked_vs_presented'})
)

# Average of the group means (every user state weighs the same).
overall_mean = grouped_means_bleu_bm25['group_mean_bleu_reranked_vs_presented'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_bleu_bm25, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  \
0   (-0.0259791798889637, 0.5851805210113525, 0.00...   
1   (0.049241334199905396, 0.23491184413433075, 0....   
2   (0.06917192041873932, 0.22154483199119568, 0.0...   
3   (0.0808219462633133, 0.280564546585083, -0.035...   
4   (0.11326862126588821, 0.38108861446380615, -0....   
..                                                ...   
95  (0.3616601228713989, 0.1373804360628128, 0.021...   
96  (0.3617928624153137, 0.11457429826259613, 0.01...   
97  (0.36578845977783203, 0.042487733066082, 0.081...   
98  (0.39353927969932556, 0.0698390007019043, -0.0...   
99  (0.3967207372188568, 0.18588663637638092, -0.0...   

    group_mean_bleu_reranked_vs_presented  
0                                0.274076  
1                                0.348927  
2                                0.019641  
3                                0.168887  
4                                0.308249  
..                           

In [123]:
# Mean of 'bleu_llm_vs_presented' for each distinct initial user state.
grouped_means_bleu_llm = (
    clicked_data_user_history.groupby('initial_user_state_tuple')['bleu_llm_vs_presented']
    .mean()
    .reset_index()
    .rename(columns={'bleu_llm_vs_presented': 'group_mean_bleu_llm_vs_presented'})
)

# Average of the group means (every user state weighs the same).
overall_mean = grouped_means_bleu_llm['group_mean_bleu_llm_vs_presented'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_bleu_llm, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  \
0   (-0.0259791798889637, 0.5851805210113525, 0.00...   
1   (0.049241334199905396, 0.23491184413433075, 0....   
2   (0.06917192041873932, 0.22154483199119568, 0.0...   
3   (0.0808219462633133, 0.280564546585083, -0.035...   
4   (0.11326862126588821, 0.38108861446380615, -0....   
..                                                ...   
95  (0.3616601228713989, 0.1373804360628128, 0.021...   
96  (0.3617928624153137, 0.11457429826259613, 0.01...   
97  (0.36578845977783203, 0.042487733066082, 0.081...   
98  (0.39353927969932556, 0.0698390007019043, -0.0...   
99  (0.3967207372188568, 0.18588663637638092, -0.0...   

    group_mean_bleu_llm_vs_presented  
0                           1.000000  
1                           0.021973  
2                           0.024448  
3                           0.041316  
4                           0.603001  
..                               ...  
95                  

In [124]:
# Load the slate table stored in slateq_llm_slates.feather.
feather_file_path_slateq = gen_slates_dir.joinpath("slateq_llm_slates.feather")
df_slateq = pd.read_feather(feather_file_path_slateq)


In [125]:
# Step 1: keep only rows with a non-empty LLM slate and add a hashable (tuple) version
# of the initial user state to group on.
df_slateq = df_slateq[df_slateq['llm_slateq_slate'].apply(lambda slate: len(slate) > 0)].copy()
df_slateq['initial_user_state_tuple'] = df_slateq['initial_user_state'].apply(tuple)

# Step 2: mean of 'slateq_hit' for each distinct initial user state.
grouped_means_slateq_llm = (
    df_slateq.groupby('initial_user_state_tuple')['slateq_hit']
    .mean()
    .reset_index()
    .rename(columns={'slateq_hit': 'group_mean_hit'})
)

# Step 3: average of the group means (every user state weighs the same).
overall_mean = grouped_means_slateq_llm['group_mean_hit'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_slateq_llm, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  group_mean_hit
0   (0.13400940597057343, 0.2015896886587143, -0.0...            0.00
1   (0.15157544612884521, 0.33397215604782104, -0....            1.00
2   (0.15727820992469788, 0.2477571815252304, -0.1...            0.00
3   (0.1577419489622116, 0.21108318865299225, -0.0...            1.00
4   (0.15791304409503937, 0.29125070571899414, -0....            0.00
..                                                ...             ...
95  (0.2903972268104553, 0.15165531635284424, -0.0...            0.25
96  (0.2935115098953247, 0.13266007602214813, -0.1...            0.00
97  (0.29664674401283264, 0.14605002105236053, 0.0...            0.00
98  (0.29784202575683594, 0.2651831805706024, 0.01...            0.00
99  (0.2985374331474304, 0.1253967434167862, 0.048...            0.00

[100 rows x 2 columns]

Overall average:
0.1322950558213716


In [126]:
# Item IDs of each row's slate, recovered from the slate embeddings
# ("Not Found" marks an embedding missing from embedding_lookup).
df_slateq['slateq_slates'] = [
    [embedding_lookup.get(tuple(vector), "Not Found") for vector in row_slate]
    for row_slate in df_slateq['slate_docs_feature']
]

In [127]:
# 1 when the originally clicked item is among the slate's item IDs, otherwise 0.
df_slateq['actual_slateq_hit'] = df_slateq.apply(
    lambda row: int(row['original_click'] in row['slateq_slates']),
    axis=1,
)

In [128]:
# Mean of 'actual_slateq_hit' for each distinct initial user state.
grouped_means_slateq = (
    df_slateq.groupby('initial_user_state_tuple')['actual_slateq_hit']
    .mean()
    .reset_index()
    .rename(columns={'actual_slateq_hit': 'group_mean_actual_slateq_hit'})
)

# Average of the group means (every user state weighs the same).
overall_mean = grouped_means_slateq['group_mean_actual_slateq_hit'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_slateq, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  \
0   (0.13400940597057343, 0.2015896886587143, -0.0...   
1   (0.15157544612884521, 0.33397215604782104, -0....   
2   (0.15727820992469788, 0.2477571815252304, -0.1...   
3   (0.1577419489622116, 0.21108318865299225, -0.0...   
4   (0.15791304409503937, 0.29125070571899414, -0....   
..                                                ...   
95  (0.2903972268104553, 0.15165531635284424, -0.0...   
96  (0.2935115098953247, 0.13266007602214813, -0.1...   
97  (0.29664674401283264, 0.14605002105236053, 0.0...   
98  (0.29784202575683594, 0.2651831805706024, 0.01...   
99  (0.2985374331474304, 0.1253967434167862, 0.048...   

    group_mean_actual_slateq_hit  
0                            0.0  
1                            0.0  
2                            0.5  
3                            0.0  
4                            0.0  
..                           ...  
95                           0.0  
96           

In [129]:


# df_filtered = df_filtered.reset_index(drop=True)

def hybrid_slate_optimization(row, n_replace=9, lambda_weight=1.0):
    """
    Replace the least relevant slate items with the best-scoring candidates, using a
    hybrid BM25 (title text) + cosine similarity (embedding) score.

    Every (slate item, candidate) pair is scored as
    ``lambda_weight * bm25 + (1 - lambda_weight) * cosine``; each of the two score
    matrices is min-max normalised over the whole matrix first.

    Args:
        row: A row from the DataFrame (passed via df.apply). Reads
            ``row["slate_docs_feature"]`` and ``row["candidate_docs"]``, each a
            sequence of embedding vectors.
        n_replace: Maximum number of slate items to replace. Defaults to 9, the value
            the code previously hard-coded (while its docstring and comments said 3).
        lambda_weight: Weight of the BM25 term; the cosine term gets
            ``1 - lambda_weight``. With the default of 1.0 only BM25 affects the result.

    Returns:
        Updated slate as a list of item IDs (a new list with the slate's length).

    Note:
        Uses the notebook-level ``embedding_lookup`` and ``news_df``. Embeddings missing
        from ``embedding_lookup`` become the ID "Not Found". Candidates are not checked
        against the current slate, so an inserted ID may duplicate one already present.
    """

    # Retrieve item IDs using the embedding lookup
    slate_item_ids = [embedding_lookup.get(tuple(embedding), "Not Found") for embedding in row["slate_docs_feature"]]
    candidate_item_ids = [embedding_lookup.get(tuple(embedding), "Not Found") for embedding in row["candidate_docs"]]

    # Nothing to replace: return an unchanged copy. This also avoids the [-0:] slice
    # below, which would select every candidate instead of none.
    if n_replace <= 0:
        return slate_item_ids[:]

    # Get titles for slate and candidate items
    slate_titles = [title for _, title in get_item_ids_and_titles(slate_item_ids, news_df)]
    candidate_titles = [title for _, title in get_item_ids_and_titles(candidate_item_ids, news_df)]

    # Whitespace-tokenize titles for BM25; the candidates form the corpus and each
    # slate title is used as a query against it.
    slate_tokens = [text.split() for text in slate_titles]
    candidate_tokens = [text.split() for text in candidate_titles]

    bm25 = BM25Okapi(candidate_tokens)
    bm25_scores = np.array([bm25.get_scores(tokens) for tokens in slate_tokens])  # (n_slate, n_candidates)

    # Cosine similarity between every slate embedding and every candidate embedding
    candidate_docs_matrix = np.vstack(row["candidate_docs"])
    slate_docs_feature_matrix = np.vstack(row["slate_docs_feature"])
    similarity_matrix = cosine_similarity(slate_docs_feature_matrix, candidate_docs_matrix)  # (n_slate, n_candidates)

    # Min-max normalize each matrix (epsilon guards against a constant matrix), then blend
    bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min() + 1e-6)
    cosine_norm = (similarity_matrix - similarity_matrix.min()) / (similarity_matrix.max() - similarity_matrix.min() + 1e-6)
    final_scores = lambda_weight * bm25_norm + (1 - lambda_weight) * cosine_norm  # (n_slate, n_candidates)

    # Slate items with the lowest average score across all candidates (weakest first)
    avg_slate_relevance = final_scores.mean(axis=1)
    least_relevant_indices = np.argsort(avg_slate_relevance)[:n_replace]

    # Candidates with the highest best-match score against any slate item
    # (ascending order, so the strongest candidate comes last)
    best_candidate_indices = np.argsort(final_scores.max(axis=0))[-n_replace:]

    # Replace the least relevant slate items with the selected candidates; zip stops at
    # the shorter sequence, so short slates or candidate lists are handled.
    updated_slate_item_ids = slate_item_ids[:]
    for slate_idx, candidate_idx in zip(least_relevant_indices, best_candidate_indices):
        updated_slate_item_ids[slate_idx] = candidate_item_ids[candidate_idx]

    return updated_slate_item_ids

# Apply function to DataFrame

df_slateq["slateq_reranked"] = df_slateq.apply(hybrid_slate_optimization, axis=1)


# # Save the updated DataFrame
# df.to_csv("updated_slate_data.csv", index=False)





In [130]:
# 1 when the originally clicked item is among the re-ranked slate's item IDs, otherwise 0.
df_slateq['slateq_reranked_hit'] = df_slateq.apply(
    lambda row: int(row['original_click'] in row['slateq_reranked']),
    axis=1,
)

In [131]:
# Mean of 'slateq_reranked_hit' for each distinct initial user state.
grouped_means_bm25_slateq = (
    df_slateq.groupby('initial_user_state_tuple')['slateq_reranked_hit']
    .mean()
    .reset_index()
    .rename(columns={'slateq_reranked_hit': 'group_mean_slateq_reranked_hit'})
)

# Average of the group means (every user state weighs the same).
overall_mean = grouped_means_bm25_slateq['group_mean_slateq_reranked_hit'].mean()

# Show the per-state table, then the overall figure.
print("Group-level averages:", grouped_means_bm25_slateq, sep="\n")
print("\nOverall average:", overall_mean, sep="\n")

Group-level averages:
                             initial_user_state_tuple  \
0   (0.13400940597057343, 0.2015896886587143, -0.0...   
1   (0.15157544612884521, 0.33397215604782104, -0....   
2   (0.15727820992469788, 0.2477571815252304, -0.1...   
3   (0.1577419489622116, 0.21108318865299225, -0.0...   
4   (0.15791304409503937, 0.29125070571899414, -0....   
..                                                ...   
95  (0.2903972268104553, 0.15165531635284424, -0.0...   
96  (0.2935115098953247, 0.13266007602214813, -0.1...   
97  (0.29664674401283264, 0.14605002105236053, 0.0...   
98  (0.29784202575683594, 0.2651831805706024, 0.01...   
99  (0.2985374331474304, 0.1253967434167862, 0.048...   

    group_mean_slateq_reranked_hit  
0                              0.0  
1                              0.0  
2                              0.5  
3                              0.0  
4                              0.0  
..                             ...  
95                             0.0

In [132]:
# itemId -> category and itemId -> subcategory lookup tables.
item_to_category = {
    item_id: category
    for item_id, category in zip(news_df['itemId'], news_df['category'])
}
item_to_subcategory = {
    item_id: subcategory
    for item_id, subcategory in zip(news_df['itemId'], news_df['subcategory'])
}

# Number of distinct categories / subcategories in the whole news table.
total_categories = news_df['category'].nunique(dropna=True)
total_subcategories = news_df['subcategory'].nunique(dropna=True)

def calculate_diversity(slate, item_to_category, item_to_subcategory):
    """Count the distinct categories and subcategories covered by a slate.

    Items that are missing from a mapping are ignored for that mapping.

    Args:
        slate: Iterable of item ids.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.

    Returns:
        Tuple ``(n_categories, n_subcategories)`` of distinct counts.
    """
    slate_items = list(slate)
    seen_categories = {
        item_to_category[item_id]
        for item_id in slate_items
        if item_id in item_to_category
    }
    seen_subcategories = {
        item_to_subcategory[item_id]
        for item_id in slate_items
        if item_id in item_to_subcategory
    }
    return len(seen_categories), len(seen_subcategories)

def calculate_s_recall(df, column, item_to_category, item_to_subcategory, total_categories, total_subcategories):
    """Compute per-slate category and subcategory diversity ratios.

    For every slate in ``df[column]`` the number of distinct categories (and
    subcategories) found by ``calculate_diversity`` is divided by the slate
    length. The ratio is therefore normalised by slate size, not by the total
    number of categories in the catalogue.

    Args:
        df: Table-like object where ``df[column]`` iterates over slates.
        column: Name of the column holding the slates.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.
        total_categories: Unused; kept so existing calls keep working.
        total_subcategories: Unused; kept so existing calls keep working.

    Returns:
        List of ``(category_ratio, subcategory_ratio)`` tuples, one per slate,
        in input order. An empty slate yields ``(0.0, 0.0)``.
    """
    results = []

    for slate in df[column]:
        slate_size = len(slate)

        # An empty slate covers nothing; report zero instead of dividing by 0.
        # len() is used rather than truthiness so array-like slates work too.
        if slate_size == 0:
            results.append((0.0, 0.0))
            continue

        category_diversity, subcategory_diversity = calculate_diversity(
            slate, item_to_category, item_to_subcategory
        )
        results.append(
            (category_diversity / slate_size, subcategory_diversity / slate_size)
        )

    return results

# Per-slate diversity ratios for each of the three slate columns of df_slateq.
_s_recall_inputs = (item_to_category, item_to_subcategory, total_categories, total_subcategories)
llm_s_recall = calculate_s_recall(df_slateq, 'llm_slateq_slate', *_s_recall_inputs)
rl_s_recall = calculate_s_recall(df_slateq, 'slateq_slates', *_s_recall_inputs)
slate_reranked_recall = calculate_s_recall(df_slateq, 'slateq_reranked', *_s_recall_inputs)

def calculate_average_s_recall(s_recall_results):
    """Average the per-slate (category, subcategory) ratios.

    Args:
        s_recall_results: Sequence of pairs whose first element is the
            category-level ratio and whose second is the subcategory-level
            ratio.

    Returns:
        Tuple ``(avg_category, avg_subcategory)``. When the input is empty
        the mean is undefined and ``(nan, nan)`` is returned instead of
        raising ZeroDivisionError.
    """
    n_slates = len(s_recall_results)
    if n_slates == 0:
        return float("nan"), float("nan")

    avg_category = sum(pair[0] for pair in s_recall_results) / n_slates
    avg_subcategory = sum(pair[1] for pair in s_recall_results) / n_slates
    return avg_category, avg_subcategory

# Average the per-slate ratios of each slate column.
(
    (llm_avg_category, llm_avg_subcategory),
    (rl_avg_category, rl_avg_subcategory),
    (slate_avg_category, slate_avg_subcategory),
) = map(calculate_average_s_recall, (llm_s_recall, rl_s_recall, slate_reranked_recall))

# Report one header line plus one value line per slate column.
print(
    "Average S-Recall for llm_slate:",
    f"Category Level: {llm_avg_category:.4f}, Subcategory Level: {llm_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for rl_slates:",
    f"Category Level: {rl_avg_category:.4f}, Subcategory Level: {rl_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for slate_reranked:",
    f"Category Level: {slate_avg_category:.4f}, Subcategory Level: {slate_avg_subcategory:.4f}",
    sep="\n",
)


Average S-Recall for llm_slate:
Category Level: 0.5199, Subcategory Level: 0.7714

Average S-Recall for rl_slates:
Category Level: 0.6162, Subcategory Level: 0.8601

Average S-Recall for slate_reranked:
Category Level: 0.5789, Subcategory Level: 0.7809


In [146]:

# category -> array of the distinct subcategories that occur under it in news_df.
_subcategories_by_category = news_df.groupby('category')['subcategory'].unique()
category_to_subcategories = _subcategories_by_category.to_dict()

def calculate_diversity(slate, item_to_category, item_to_subcategory):
    """Collect the distinct categories and subcategories covered by a slate.

    Items that are missing from a mapping are ignored for that mapping.

    Args:
        slate: Iterable of item ids.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.

    Returns:
        Tuple ``(categories, subcategories)`` of sets.
    """
    slate_items = list(slate)
    seen_categories = {
        item_to_category[item_id]
        for item_id in slate_items
        if item_id in item_to_category
    }
    seen_subcategories = {
        item_to_subcategory[item_id]
        for item_id in slate_items
        if item_id in item_to_subcategory
    }
    return seen_categories, seen_subcategories

def calculate_s_recall(df, column, item_to_category, item_to_subcategory, category_to_subcategories):
    """Compute per-slate category and contextual subcategory diversity ratios.

    Category level: distinct categories in the slate divided by slate length.
    Subcategory level: distinct subcategories in the slate divided by the
    number of subcategories that exist (per ``category_to_subcategories``)
    under the categories present in the slate.

    Args:
        df: Table-like object where ``df[column]`` iterates over slates.
        column: Name of the column holding the slates.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.
        category_to_subcategories: Mapping from category to an iterable of
            its subcategories.

    Returns:
        List of ``(category_ratio, subcategory_ratio)`` tuples, one per slate,
        in input order. A ratio whose denominator would be zero (empty slate,
        or no slate item with a known category) is reported as ``0.0``.

    Raises:
        KeyError: If a category found in the slate is missing from
            ``category_to_subcategories``.
    """
    results = []

    for slate in df[column]:
        slate_size = len(slate)

        # An empty slate covers nothing; report zero instead of dividing by 0.
        # len() is used rather than truthiness so array-like slates work too.
        if slate_size == 0:
            results.append((0.0, 0.0))
            continue

        categories_in_slate, subcategories_in_slate = calculate_diversity(
            slate, item_to_category, item_to_subcategory
        )

        s_recall_category = len(categories_in_slate) / slate_size

        # Subcategories that could have been shown given the slate's categories.
        reachable_subcategories = set()
        for category in categories_in_slate:
            reachable_subcategories.update(category_to_subcategories[category])

        # No item had a known category (e.g. unresolved ids): nothing reachable.
        if reachable_subcategories:
            s_recall_subcategory = len(subcategories_in_slate) / len(reachable_subcategories)
        else:
            s_recall_subcategory = 0.0

        results.append((s_recall_category, s_recall_subcategory))

    return results

# Per-slate diversity ratios for each of the three slate columns of df_slateq.
_s_recall_inputs = (item_to_category, item_to_subcategory, category_to_subcategories)
llm_s_recall = calculate_s_recall(df_slateq, 'llm_slateq_slate', *_s_recall_inputs)
rl_s_recall = calculate_s_recall(df_slateq, 'slateq_slates', *_s_recall_inputs)
slate_reranked_recall = calculate_s_recall(df_slateq, 'slateq_reranked', *_s_recall_inputs)

def calculate_average_s_recall(s_recall_results):
    """Average the per-slate (category, subcategory) ratios.

    Args:
        s_recall_results: Sequence of pairs whose first element is the
            category-level ratio and whose second is the subcategory-level
            ratio.

    Returns:
        Tuple ``(avg_category, avg_subcategory)``. When the input is empty
        the mean is undefined and ``(nan, nan)`` is returned instead of
        raising ZeroDivisionError.
    """
    n_slates = len(s_recall_results)
    if n_slates == 0:
        return float("nan"), float("nan")

    avg_category = sum(pair[0] for pair in s_recall_results) / n_slates
    avg_subcategory = sum(pair[1] for pair in s_recall_results) / n_slates
    return avg_category, avg_subcategory

# Average the per-slate ratios of each slate column.
(
    (llm_avg_category, llm_avg_subcategory),
    (rl_avg_category, rl_avg_subcategory),
    (slate_avg_category, slate_avg_subcategory),
) = map(calculate_average_s_recall, (llm_s_recall, rl_s_recall, slate_reranked_recall))

# Report one header line plus one value line per slate column.
print(
    "Average S-Recall for llm_slate:",
    f"Category Level: {llm_avg_category:.4f}, Subcategory Level: {llm_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for rl_slates:",
    f"Category Level: {rl_avg_category:.4f}, Subcategory Level: {rl_avg_subcategory:.4f}",
    sep="\n",
)
print(
    "\nAverage S-Recall for slate_reranked:",
    f"Category Level: {slate_avg_category:.4f}, Subcategory Level: {slate_avg_subcategory:.4f}",
    sep="\n",
)

# Distinct subcategory count per category, as context for the ratios above.
subcategory_count = (
    news_df
    .groupby('category')['subcategory']
    .nunique()
    .rename('subcategory_count')
    .reset_index()
)

print("\nNumber of subcategories for each category:")
print(subcategory_count)

Average S-Recall for llm_slate:
Category Level: 0.5199, Subcategory Level: 0.0638

Average S-Recall for rl_slates:
Category Level: 0.6162, Subcategory Level: 0.0602

Average S-Recall for slate_reranked:
Category Level: 0.5789, Subcategory Level: 0.0575

Number of subcategories for each category:
         category  subcategory_count
0           autos                 25
1   entertainment                 14
2         finance                 33
3    foodanddrink                 16
4           games                  1
5          health                 23
6            kids                  6
7       lifestyle                 53
8      middleeast                  1
9          movies                  7
10          music                 11
11           news                 38
12   northamerica                  1
13         sports                 34
14         travel                 16
15             tv                 10
16          video                 15
17        weather                  3


In [134]:
# Join category_data onto df_slateq by (click, user state).
# how='right' keeps every df_slateq row, including rows with no match in
# category_data; switch to 'inner' to keep only matching rows.
clicked_data_user_history = pd.merge(
    left=category_data,
    right=df_slateq,
    how='right',
    left_on=['click', 'observed_state'],
    right_on=['original_click', 'initial_user_state_tuple'],
)

In [135]:
def extract_titles(item_tuples):
    """Return the second element (the title) of every two-element tuple, in order.

    Args:
        item_tuples: Iterable of ``(item_id, title)`` pairs.

    Returns:
        List of titles.
    """
    titles = []
    for _item_id, title in item_tuples:
        titles.append(title)
    return titles

def compute_bleu_score(reference, candidate):
    """Smoothed sentence-level BLEU of the first candidate title.

    Every title in ``reference`` is tokenized and used as one reference
    sentence. Only ``candidate[0]`` is scored as the hypothesis; any further
    candidate titles are ignored.

    Args:
        reference: Sequence of reference titles.
        candidate: Sequence of candidate titles.

    Returns:
        The value of ``sentence_bleu`` with ``SmoothingFunction().method1``,
        or ``0.0`` when either sequence is empty.
    """
    # Nothing to compare: avoids IndexError on candidate[0] and scoring
    # against an empty reference set.
    if len(candidate) == 0 or len(reference) == 0:
        return 0.0

    reference_tokens = [word_tokenize(str(title)) for title in reference]
    candidate_tokens = word_tokenize(str(candidate[0]))  # hypothesis is a single tokenized sentence

    smoothing = SmoothingFunction().method1
    return sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)
# Compute BLEU scores for each row: every generated slate is scored against
# the titles of the slate that was presented.
# Output column -> column holding the generated slate.
_bleu_slate_columns = {
    'bleu_rl_vs_presented': 'slateq_slates',
    'bleu_llm_vs_presented': 'llm_slateq_slate',
    'bleu_reranked_vs_presented': 'slateq_reranked',
}

for _bleu_column, _slate_column in _bleu_slate_columns.items():
    # The default argument binds the current slate column to this lambda.
    clicked_data_user_history[_bleu_column] = clicked_data_user_history.apply(
        lambda row, slate_column=_slate_column: compute_bleu_score(
            extract_titles(get_item_ids_and_titles(row['presented_slate'], news_df)),
            extract_titles(get_item_ids_and_titles(row[slate_column], news_df)),
        ),
        axis=1,
    )

In [136]:
# Average the RL-vs-presented BLEU score within each initial user state.
grouped_means_bleu_rl = (
    clicked_data_user_history
    .groupby('initial_user_state_tuple')['bleu_rl_vs_presented']
    .mean()
    .rename('group_mean_bleu_rl_vs_presented')
    .reset_index()
)

# Mean of the per-state means, so every user state weighs the same.
overall_mean = grouped_means_bleu_rl['group_mean_bleu_rl_vs_presented'].mean()

# Show the per-state table followed by the single summary number.
print("Group-level averages:")
print(grouped_means_bleu_rl)
print("\nOverall average:")
print(overall_mean)

Group-level averages:
                             initial_user_state_tuple  \
0   (0.13400940597057343, 0.2015896886587143, -0.0...   
1   (0.15157544612884521, 0.33397215604782104, -0....   
2   (0.15727820992469788, 0.2477571815252304, -0.1...   
3   (0.1577419489622116, 0.21108318865299225, -0.0...   
4   (0.15791304409503937, 0.29125070571899414, -0....   
..                                                ...   
95  (0.2903972268104553, 0.15165531635284424, -0.0...   
96  (0.2935115098953247, 0.13266007602214813, -0.1...   
97  (0.29664674401283264, 0.14605002105236053, 0.0...   
98  (0.29784202575683594, 0.2651831805706024, 0.01...   
99  (0.2985374331474304, 0.1253967434167862, 0.048...   

    group_mean_bleu_rl_vs_presented  
0                          0.016792  
1                          0.021105  
2                          0.500000  
3                          1.000000  
4                          0.021016  
..                              ...  
95                         

In [137]:
# Average the reranked-vs-presented BLEU score within each initial user state.
grouped_means_bleu_bm25 = (
    clicked_data_user_history
    .groupby('initial_user_state_tuple')['bleu_reranked_vs_presented']
    .mean()
    .rename('group_mean_bleu_reranked_vs_presented')
    .reset_index()
)

# Mean of the per-state means, so every user state weighs the same.
overall_mean = grouped_means_bleu_bm25['group_mean_bleu_reranked_vs_presented'].mean()

# Show the per-state table followed by the single summary number.
print("Group-level averages:")
print(grouped_means_bleu_bm25)
print("\nOverall average:")
print(overall_mean)

Group-level averages:
                             initial_user_state_tuple  \
0   (0.13400940597057343, 0.2015896886587143, -0.0...   
1   (0.15157544612884521, 0.33397215604782104, -0....   
2   (0.15727820992469788, 0.2477571815252304, -0.1...   
3   (0.1577419489622116, 0.21108318865299225, -0.0...   
4   (0.15791304409503937, 0.29125070571899414, -0....   
..                                                ...   
95  (0.2903972268104553, 0.15165531635284424, -0.0...   
96  (0.2935115098953247, 0.13266007602214813, -0.1...   
97  (0.29664674401283264, 0.14605002105236053, 0.0...   
98  (0.29784202575683594, 0.2651831805706024, 0.01...   
99  (0.2985374331474304, 0.1253967434167862, 0.048...   

    group_mean_bleu_reranked_vs_presented  
0                                0.509399  
1                                0.681157  
2                                0.025915  
3                                0.021359  
4                                0.017476  
..                           

In [138]:
# Average the LLM-vs-presented BLEU score within each initial user state.
grouped_means_bleu_llm = (
    clicked_data_user_history
    .groupby('initial_user_state_tuple')['bleu_llm_vs_presented']
    .mean()
    .rename('group_mean_bleu_llm_vs_presented')
    .reset_index()
)

# Mean of the per-state means, so every user state weighs the same.
overall_mean = grouped_means_bleu_llm['group_mean_bleu_llm_vs_presented'].mean()

# Show the per-state table followed by the single summary number.
print("Group-level averages:")
print(grouped_means_bleu_llm)
print("\nOverall average:")
print(overall_mean)

Group-level averages:
                             initial_user_state_tuple  \
0   (0.13400940597057343, 0.2015896886587143, -0.0...   
1   (0.15157544612884521, 0.33397215604782104, -0....   
2   (0.15727820992469788, 0.2477571815252304, -0.1...   
3   (0.1577419489622116, 0.21108318865299225, -0.0...   
4   (0.15791304409503937, 0.29125070571899414, -0....   
..                                                ...   
95  (0.2903972268104553, 0.15165531635284424, -0.0...   
96  (0.2935115098953247, 0.13266007602214813, -0.1...   
97  (0.29664674401283264, 0.14605002105236053, 0.0...   
98  (0.29784202575683594, 0.2651831805706024, 0.01...   
99  (0.2985374331474304, 0.1253967434167862, 0.048...   

    group_mean_bleu_llm_vs_presented  
0                           0.022188  
1                           0.672187  
2                           0.026315  
3                           1.000000  
4                           0.021973  
..                               ...  
95                  

In [139]:
# Load the slates stored in llm_slates.feather under the generated-slates directory.
feather_file_path_llm = gen_slates_dir.joinpath("llm_slates.feather")
df_llm = pd.read_feather(path=feather_file_path_llm)



In [140]:
# Step 1: Keep only rows with a non-empty LLM slate, then add a tuple version
# of the user state so it can serve as a group key.
_has_llm_slate = df_llm['llm_gen_slate'].apply(lambda slate: len(slate) > 0)
df_llm = df_llm.loc[_has_llm_slate].copy()
df_llm['initial_user_state_tuple'] = df_llm['initial_user_state'].apply(tuple)

# Step 2: Average the LLM hit column within each initial user state.
grouped_means_rl_llm = (
    df_llm
    .groupby('initial_user_state_tuple')['llm_hit']
    .mean()
    .rename('group_mean_hit')
    .reset_index()
)

# Step 3: Mean of the per-state means, so every user state weighs the same.
overall_mean = grouped_means_rl_llm['group_mean_hit'].mean()

# Show the per-state table followed by the single summary number.
print("Group-level averages:")
print(grouped_means_rl_llm)
print("\nOverall average:")
print(overall_mean)

Group-level averages:
                             initial_user_state_tuple  group_mean_hit
0   (-0.0259791798889637, 0.5851805210113525, 0.00...        0.500000
1   (0.049241334199905396, 0.23491184413433075, 0....        0.333333
2   (0.06917192041873932, 0.22154483199119568, 0.0...        0.000000
3   (0.0808219462633133, 0.280564546585083, -0.035...        0.142857
4   (0.11326862126588821, 0.38108861446380615, -0....        0.285714
..                                                ...             ...
95  (0.3616601228713989, 0.1373804360628128, 0.021...        0.000000
96  (0.3617928624153137, 0.11457429826259613, 0.01...        0.000000
97  (0.36578845977783203, 0.042487733066082, 0.081...        0.500000
98  (0.39353927969932556, 0.0698390007019043, -0.0...        0.000000
99  (0.3967207372188568, 0.18588663637638092, -0.0...        0.000000

[100 rows x 2 columns]

Overall average:
0.16706782106782106


In [141]:
# Lookup tables keyed by item id (on duplicate ids the last row wins).
item_to_category = {
    item_id: category
    for item_id, category in zip(news_df['itemId'], news_df['category'])
}
item_to_subcategory = {
    item_id: subcategory
    for item_id, subcategory in zip(news_df['itemId'], news_df['subcategory'])
}

# Number of distinct categories / subcategories present in news_df.
total_categories = news_df['category'].nunique()
total_subcategories = news_df['subcategory'].nunique()

def calculate_diversity(slate, item_to_category, item_to_subcategory):
    """Count the distinct categories and subcategories covered by a slate.

    Items that are missing from a mapping are ignored for that mapping.

    Args:
        slate: Iterable of item ids.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.

    Returns:
        Tuple ``(n_categories, n_subcategories)`` of distinct counts.
    """
    slate_items = list(slate)
    seen_categories = {
        item_to_category[item_id]
        for item_id in slate_items
        if item_id in item_to_category
    }
    seen_subcategories = {
        item_to_subcategory[item_id]
        for item_id in slate_items
        if item_id in item_to_subcategory
    }
    return len(seen_categories), len(seen_subcategories)

def calculate_s_recall(df, column, item_to_category, item_to_subcategory, total_categories, total_subcategories):
    """Compute per-slate category and subcategory diversity ratios.

    For every slate in ``df[column]`` the number of distinct categories (and
    subcategories) found by ``calculate_diversity`` is divided by the slate
    length. The ratio is therefore normalised by slate size, not by the total
    number of categories in the catalogue.

    Args:
        df: Table-like object where ``df[column]`` iterates over slates.
        column: Name of the column holding the slates.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.
        total_categories: Unused; kept so existing calls keep working.
        total_subcategories: Unused; kept so existing calls keep working.

    Returns:
        List of ``(category_ratio, subcategory_ratio)`` tuples, one per slate,
        in input order. An empty slate yields ``(0.0, 0.0)``.
    """
    results = []

    for slate in df[column]:
        slate_size = len(slate)

        # An empty slate covers nothing; report zero instead of dividing by 0.
        # len() is used rather than truthiness so array-like slates work too.
        if slate_size == 0:
            results.append((0.0, 0.0))
            continue

        category_diversity, subcategory_diversity = calculate_diversity(
            slate, item_to_category, item_to_subcategory
        )
        results.append(
            (category_diversity / slate_size, subcategory_diversity / slate_size)
        )

    return results

# Per-slate diversity ratios for the LLM-generated slates in df_llm.
llm_s_recall = calculate_s_recall(
    df=df_llm,
    column='llm_gen_slate',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    total_categories=total_categories,
    total_subcategories=total_subcategories,
)

def calculate_average_s_recall(s_recall_results):
    """Average the per-slate (category, subcategory) ratios.

    Args:
        s_recall_results: Sequence of pairs whose first element is the
            category-level ratio and whose second is the subcategory-level
            ratio.

    Returns:
        Tuple ``(avg_category, avg_subcategory)``. When the input is empty
        the mean is undefined and ``(nan, nan)`` is returned instead of
        raising ZeroDivisionError.
    """
    n_slates = len(s_recall_results)
    if n_slates == 0:
        return float("nan"), float("nan")

    avg_category = sum(pair[0] for pair in s_recall_results) / n_slates
    avg_subcategory = sum(pair[1] for pair in s_recall_results) / n_slates
    return avg_category, avg_subcategory

# Average the per-slate ratios of the LLM slates and report them.
llm_avg_category, llm_avg_subcategory = calculate_average_s_recall(llm_s_recall)

print(
    "Average S-Recall for llm_slate:",
    f"Category Level: {llm_avg_category:.4f}, Subcategory Level: {llm_avg_subcategory:.4f}",
    sep="\n",
)



Average S-Recall for llm_slate:
Category Level: 0.3645, Subcategory Level: 0.6131


In [142]:

# category -> array of the distinct subcategories that occur under it in news_df.
_subcategories_by_category = news_df.groupby('category')['subcategory'].unique()
category_to_subcategories = _subcategories_by_category.to_dict()

def calculate_diversity(slate, item_to_category, item_to_subcategory):
    """Collect the distinct categories and subcategories covered by a slate.

    Items that are missing from a mapping are ignored for that mapping.

    Args:
        slate: Iterable of item ids.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.

    Returns:
        Tuple ``(categories, subcategories)`` of sets.
    """
    slate_items = list(slate)
    seen_categories = {
        item_to_category[item_id]
        for item_id in slate_items
        if item_id in item_to_category
    }
    seen_subcategories = {
        item_to_subcategory[item_id]
        for item_id in slate_items
        if item_id in item_to_subcategory
    }
    return seen_categories, seen_subcategories

def calculate_s_recall(df, column, item_to_category, item_to_subcategory, category_to_subcategories):
    """Compute per-slate category and contextual subcategory diversity ratios.

    Category level: distinct categories in the slate divided by slate length.
    Subcategory level: distinct subcategories in the slate divided by the
    number of subcategories that exist (per ``category_to_subcategories``)
    under the categories present in the slate.

    Args:
        df: Table-like object where ``df[column]`` iterates over slates.
        column: Name of the column holding the slates.
        item_to_category: Mapping from item id to category.
        item_to_subcategory: Mapping from item id to subcategory.
        category_to_subcategories: Mapping from category to an iterable of
            its subcategories.

    Returns:
        List of ``(category_ratio, subcategory_ratio)`` tuples, one per slate,
        in input order. A ratio whose denominator would be zero (empty slate,
        or no slate item with a known category) is reported as ``0.0``.

    Raises:
        KeyError: If a category found in the slate is missing from
            ``category_to_subcategories``.
    """
    results = []

    for slate in df[column]:
        slate_size = len(slate)

        # An empty slate covers nothing; report zero instead of dividing by 0.
        # len() is used rather than truthiness so array-like slates work too.
        if slate_size == 0:
            results.append((0.0, 0.0))
            continue

        categories_in_slate, subcategories_in_slate = calculate_diversity(
            slate, item_to_category, item_to_subcategory
        )

        s_recall_category = len(categories_in_slate) / slate_size

        # Subcategories that could have been shown given the slate's categories.
        reachable_subcategories = set()
        for category in categories_in_slate:
            reachable_subcategories.update(category_to_subcategories[category])

        # No item had a known category (e.g. unresolved ids): nothing reachable.
        if reachable_subcategories:
            s_recall_subcategory = len(subcategories_in_slate) / len(reachable_subcategories)
        else:
            s_recall_subcategory = 0.0

        results.append((s_recall_category, s_recall_subcategory))

    return results

# Per-slate diversity ratios for the LLM-generated slates in df_llm.
llm_s_recall = calculate_s_recall(
    df=df_llm,
    column='llm_gen_slate',
    item_to_category=item_to_category,
    item_to_subcategory=item_to_subcategory,
    category_to_subcategories=category_to_subcategories,
)

def calculate_average_s_recall(s_recall_results):
    """Average the per-slate (category, subcategory) ratios.

    Args:
        s_recall_results: Sequence of pairs whose first element is the
            category-level ratio and whose second is the subcategory-level
            ratio.

    Returns:
        Tuple ``(avg_category, avg_subcategory)``. When the input is empty
        the mean is undefined and ``(nan, nan)`` is returned instead of
        raising ZeroDivisionError.
    """
    n_slates = len(s_recall_results)
    if n_slates == 0:
        return float("nan"), float("nan")

    avg_category = sum(pair[0] for pair in s_recall_results) / n_slates
    avg_subcategory = sum(pair[1] for pair in s_recall_results) / n_slates
    return avg_category, avg_subcategory

# Average the per-slate ratios of the LLM slates.
# This cell only analyses df_llm, so it no longer recomputes the rl_* and
# slate_* averages from rl_s_recall / slate_reranked_recall, which are
# leftovers of the SlateQ cells and were never used here.
llm_avg_category, llm_avg_subcategory = calculate_average_s_recall(llm_s_recall)

# Print average S-Recall for the LLM slates
print("Average S-Recall for llm_slate:")
print(f"Category Level: {llm_avg_category:.4f}, Subcategory Level: {llm_avg_subcategory:.4f}")



Average S-Recall for llm_slate:
Category Level: 0.3645, Subcategory Level: 0.0676


In [143]:
# Join category_data onto df_llm by (click, user state).
# how='right' keeps every df_llm row, including rows with no match in
# category_data; switch to 'inner' to keep only matching rows.
clicked_data_user_history = pd.merge(
    left=category_data,
    right=df_llm,
    how='right',
    left_on=['click', 'observed_state'],
    right_on=['original_click', 'initial_user_state_tuple'],
)

In [144]:
def extract_titles(item_tuples):
    """Return the second element (the title) of every two-element tuple, in order.

    Args:
        item_tuples: Iterable of ``(item_id, title)`` pairs.

    Returns:
        List of titles.
    """
    titles = []
    for _item_id, title in item_tuples:
        titles.append(title)
    return titles

def compute_bleu_score(reference, candidate):
    """Smoothed sentence-level BLEU of the first candidate title.

    Every title in ``reference`` is tokenized and used as one reference
    sentence. Only ``candidate[0]`` is scored as the hypothesis; any further
    candidate titles are ignored.

    Args:
        reference: Sequence of reference titles.
        candidate: Sequence of candidate titles.

    Returns:
        The value of ``sentence_bleu`` with ``SmoothingFunction().method1``,
        or ``0.0`` when either sequence is empty.
    """
    # Nothing to compare: avoids IndexError on candidate[0] and scoring
    # against an empty reference set.
    if len(candidate) == 0 or len(reference) == 0:
        return 0.0

    reference_tokens = [word_tokenize(str(title)) for title in reference]
    candidate_tokens = word_tokenize(str(candidate[0]))  # hypothesis is a single tokenized sentence

    smoothing = SmoothingFunction().method1
    return sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)


def _bleu_llm_vs_presented(row):
    """Score the row's LLM slate against the titles of its presented slate."""
    presented_titles = extract_titles(get_item_ids_and_titles(row['presented_slate'], news_df))
    llm_titles = extract_titles(get_item_ids_and_titles(row['llm_gen_slate'], news_df))
    return compute_bleu_score(presented_titles, llm_titles)


# One BLEU value per merged row.
clicked_data_user_history['bleu_llm_vs_presented'] = clicked_data_user_history.apply(
    _bleu_llm_vs_presented, axis=1
)


In [145]:
# Average the LLM-vs-presented BLEU score within each initial user state.
grouped_means_bleu_llm = (
    clicked_data_user_history
    .groupby('initial_user_state_tuple')['bleu_llm_vs_presented']
    .mean()
    .rename('group_mean_bleu_llm_vs_presented')
    .reset_index()
)

# Mean of the per-state means, so every user state weighs the same.
overall_mean = grouped_means_bleu_llm['group_mean_bleu_llm_vs_presented'].mean()

# Show the per-state table followed by the single summary number.
print("Group-level averages:")
print(grouped_means_bleu_llm)
print("\nOverall average:")
print(overall_mean)

Group-level averages:
                             initial_user_state_tuple  \
0   (-0.0259791798889637, 0.5851805210113525, 0.00...   
1   (0.049241334199905396, 0.23491184413433075, 0....   
2   (0.06917192041873932, 0.22154483199119568, 0.0...   
3   (0.0808219462633133, 0.280564546585083, -0.035...   
4   (0.11326862126588821, 0.38108861446380615, -0....   
..                                                ...   
95  (0.3616601228713989, 0.1373804360628128, 0.021...   
96  (0.3617928624153137, 0.11457429826259613, 0.01...   
97  (0.36578845977783203, 0.042487733066082, 0.081...   
98  (0.39353927969932556, 0.0698390007019043, -0.0...   
99  (0.3967207372188568, 0.18588663637638092, -0.0...   

    group_mean_bleu_llm_vs_presented  
0                           1.000000  
1                           0.021973  
2                           0.018409  
3                           0.041316  
4                           0.585378  
..                               ...  
95                  