In [None]:
from pathlib import Path
import os
from dotenv import load_dotenv
import pandas as pd
import math
# Load settings from a local .env file (if one exists) into the environment.
load_dotenv()

# DATA_PATH is joined onto the user's home directory. Fail early with an
# actionable message: Path(None) would otherwise raise an opaque TypeError.
data_path_setting = os.environ.get("DATA_PATH")
if data_path_setting is None:
    raise RuntimeError(
        "DATA_PATH is not set. Define it in the environment or in a .env file."
    )
base_path = Path.home() / Path(data_path_setting)
print(base_path)

Load the behavior tsv file

In [None]:
# Explicit column names are supplied, so read_csv treats the first line of the
# file as data rather than as a header.
behaviors_path = base_path / Path("behaviors.tsv")
df = pd.read_csv(
    behaviors_path,
    sep="\t",
    names=[
        "impressionId",
        "userId",
        "timestamp",
        "click_history",
        "impressions",
    ],
)

print(f"The dataset originally consist of {len(df)} number of interactions.")

df.head()

In [None]:
# Explicit column names are supplied, so read_csv treats the first line of the
# file as data rather than as a header.
news_path = base_path / Path("news.tsv")
news = pd.read_csv(
    news_path,
    sep="\t",
    names=[
        "itemId",
        "category",
        "subcategory",
        "title",
        "abstract",
        "url",
        "title_entities",
        "abstract_entities",
    ],
)
print(f"The article data consist in total of {len(news)} number of articles.")
news

In [None]:
# Distinct category labels, in order of first appearance.
categories = news['category'].unique()
# Integer code per category label (factorize numbers labels in order of first
# appearance).
news["num_category"] = news["category"].factorize()[0]
# Lookup table: article id -> integer category code.
article_category = news.set_index("itemId")["num_category"].to_dict()

In [None]:
df['impressions'].dtype

Split the impressions into two columns: the clicked items and the presented slate

In [None]:
def process_impression(impression_list):
    """Split a raw impression string into clicked and presented article ids.

    Args:
        impression_list: Whitespace-separated tokens of the form
            ``<id>-<label>``.

    Returns:
        A ``(click, presented)`` tuple of lists: the ids whose label is
        ``'1'``, followed by every id, both in token order.
    """
    clicked_ids = []
    shown_ids = []
    for token in impression_list.split():
        parts = token.split('-')
        # parts[0] is the article id, parts[1] the click label.
        if parts[1] == '1':
            clicked_ids.append(parts[0])
        shown_ids.append(parts[0])
    return clicked_ids, shown_ids


# Run process_impression on every impression string; zip(*...) transposes the
# per-row (click, presented) tuples into two column-length sequences.
df['click'], df['presented_slate'] = zip(*df['impressions'].map(process_impression))
df

In [None]:
# news = pd.read_csv(
#     base_path/ Path("news.tsv"), 
#     sep="\t",
#     names=["itemId","category","subcategory","title","abstract","url","title_entities","abstract_entities"])
# print(f"The article data consist in total of {len(news)} number of articles.")
# news

Only keep the valid article ids from the news.tsv file

In [None]:
valid_article_ids = set(news['itemId'])
print(len(news['itemId']))
def filter_click_list(click_list):
    """Return the ids from ``click_list`` that are in ``valid_article_ids``.

    Order and duplicates are preserved. ``valid_article_ids`` is the
    module-level set and is looked up at call time.
    """
    kept = []
    for article_id in click_list:
        if article_id in valid_article_ids:
            kept.append(article_id)
    return kept
def filter_click_history_list(click_list):
    """Return the valid article ids from a row's ``click_history``.

    Args:
        click_list: A row (anything indexable by ``'click_history'``). The
            value may be a whitespace-separated string of ids, a missing
            value (NaN/None), or an already-parsed list of ids.

    Returns:
        A list of the ids that are in the module-level ``valid_article_ids``
        set, in original order. Missing histories give an empty list.
    """
    history = click_list['click_history']
    if pd.api.types.is_list_like(history):
        # Already parsed (e.g. the cell was run before): filter it again
        # instead of calling pd.notna on a list, which is array-valued and
        # cannot be used as an `if` condition.
        candidates = history
    elif pd.notna(history):
        candidates = history.split()
    else:
        return []
    return [item for item in candidates if item in valid_article_ids]
# NOTE: this binds a second name to the same DataFrame object (no copy), so the
# column assignments below also change `df`.
behaviors_df=df

# Remove ids that are not in valid_article_ids from each list-valued column;
# click_history is parsed from its raw string form at the same time.
behaviors_df['click'] = behaviors_df['click'].apply(filter_click_list)
behaviors_df['presented_slate'] = behaviors_df['presented_slate'].apply(filter_click_list)
behaviors_df['click_history'] = behaviors_df.apply(filter_click_history_list,axis=1)
# behaviors_df[behaviors_df['click'].apply(lambda x: any(item in valid_article_ids for item in x))]
# filtered_behaviors_df = filtered_behaviors_df[filtered_behaviors_df['presented_slate'].apply(lambda x: any(item in valid_article_ids for item in x))]
# Keep only the impressions that still contain at least one click.
filtered_behaviors_df = behaviors_df[behaviors_df['click'].apply(lambda x: len(x) > 0)]
filtered_behaviors_df


In [None]:
news[news['itemId'] == 'N82719']

Get the dataframe for the user choice model. Only keep impressions with a single click, as they don't induce a session

In [None]:
# Impressions with exactly one click. The mask is built from
# filtered_behaviors_df itself so that it shares that frame's index; a mask
# taken from `df` has a different index, which makes pandas re-align it and
# emit a "Boolean Series key will be reindexed" warning.
# Zero-click rows need no special case: they were removed when
# filtered_behaviors_df was built.
single_click_mask = filtered_behaviors_df['click'].apply(len) == 1
# reset_index() keeps the previous row labels in an 'index' column.
filtered_df_choice_data = filtered_behaviors_df[single_click_mask].reset_index()

In [None]:
filtered_df_choice_data

Keep this for the RL algorithm

In [None]:
# Impressions whose click count is not exactly one. The mask is built from
# filtered_behaviors_df itself so that it shares that frame's index; a mask
# taken from `df` has a different index, which makes pandas re-align it and
# emit a "Boolean Series key will be reindexed" warning.
# The former "or empty list" clause was redundant: a length of 0 already
# satisfies `!= 1`.
multi_click_mask = filtered_behaviors_df['click'].apply(len) != 1
# reset_index() keeps the previous row labels in an 'index' column.
filtered_df_2 = filtered_behaviors_df[multi_click_mask].reset_index()

In [None]:
filtered_df_2

Separate the test data

<!-- Interaction contains impression of size between 5 and 10 while interaction_all between 5 and greater. -->

In [None]:
# Hold out a seeded random sample of 100000 rows as test data.
test_data = filtered_df_2.sample(n=100000, random_state=42)
# Remove the held-out rows (matched by index label) from the remaining data.
# NOTE(review): this rebinds filtered_df_2, so running the cell a second time
# samples from the already-reduced frame and removes a further 100000 rows.
filtered_df_2 = filtered_df_2.drop(test_data.index)


Use `entropy_based_diversity` to calculate the diversity of each user's click history in the test data

In [None]:
import numpy as np
def click_history_diversity(row):
    """Shannon entropy (base 2) of the categories in a row's click history.

    Args:
        row: Mapping/row with a ``'click_history'`` entry holding an iterable
            of article ids.

    Returns:
        The entropy of the category distribution over codes 0..17. Ids that
        are missing from ``article_category`` are counted as category 0.
        Returns ``0.0`` when no click is counted (e.g. an empty history).
    """
    items_hist = row["click_history"]
    categories = [
        article_category.get(article_id, 0) for article_id in items_hist
    ]
    category_counts = [categories.count(i) for i in range(0, 18)]
    total_count = sum(category_counts)
    if total_count == 0:
        # Nothing to normalise by; dividing would raise ZeroDivisionError.
        return 0.0

    entropy = 0
    for count in category_counts:
        if count > 0:
            prob = count / total_count
            entropy -= prob * math.log2(prob)
    return entropy

def entropy_based_diversity(row, num_categories=18, floor=1e-10):
    """Normalised category entropy of a row's click history.

    Args:
        row: Mapping/row with a ``'click_history'`` entry holding an iterable
            of article ids. Ids missing from ``article_category`` are counted
            as category 0.
        num_categories: Number of category codes (``0 .. num_categories-1``)
            to count. Must be at least 2, because the score is divided by
            ``log2(num_categories)``.
        floor: Value substituted for every zero probability before taking the
            logarithm.

    Returns:
        Entropy (base 2) of the floored category probabilities divided by
        ``log2(num_categories)``.

        Because of the floor the score is never exactly 0. With the defaults,
        a history confined to a single category scores about 1.35e-8 and a
        history with no counted clicks scores about 1.43e-8.
    """
    items_hist = row["click_history"]
    categories = [
        article_category.get(article_id, 0) for article_id in items_hist
    ]
    category_counts = np.array(
        [categories.count(i) for i in range(num_categories)], dtype=float
    )
    total = category_counts.sum()

    if total > 0:
        probs = category_counts / total
    else:
        # No counted clicks: use all-zero probabilities directly instead of
        # computing 0/0, which produced NaNs and a RuntimeWarning. Every entry
        # is floored below either way, so the score is unchanged.
        probs = np.zeros_like(category_counts)

    # Handle zero probabilities (avoid log of zero)
    probs = np.where(probs > 0, probs, floor)

    # Calculate entropy
    entropy = -np.sum(probs * np.log2(probs))

    # Normalise by the maximum possible entropy
    diversity_score = entropy / np.log2(len(category_counts))

    return diversity_score
test_data['diversity_score']=test_data.apply(entropy_based_diversity,axis=1)

In [None]:
test_data

In [None]:
test_data['diversity_score'].hist(bins=30)

In [None]:
test_data['diversity_score'].min()

Cold start users

In [None]:
# Score that entropy_based_diversity yields when every counted click falls in
# one category: the other 17 probabilities are floored to 1e-10, which gives
# 17 * 1e-10 * log2(1e10) / log2(18).
SINGLE_CATEGORY_SCORE = 1.3542876093347509e-08

# The score is a sum of floating-point terms, so exact `==` is brittle; match
# with a tight relative tolerance instead (atol=0 because the value is tiny).
# NOTE(review): a row with an empty click_history scores about 1.43e-8 (all 18
# probabilities floored) and is therefore NOT selected here — confirm that
# this is the intended definition of the group.
zero_scores = test_data[
    np.isclose(
        test_data['diversity_score'], SINGLE_CATEGORY_SCORE, rtol=1e-9, atol=0.0
    )
].reset_index(drop=True)
# feather_path_test=base_path/ Path("MINDlarge_train/entropy_coldstart_test_50.feather")
# zero_scores.to_feather(feather_path_test)
zero_scores

In [None]:
# Rows whose score is above 1.3542876093347509e-07, i.e. ten times the value
# used as the "zero" score in the previous cell.
# NOTE(review): a row with an empty click_history scores about 1.43e-8 under
# entropy_based_diversity (all 18 probabilities floored to 1e-10); it is below
# this threshold and not equal to the "zero" score, so it ends up in neither
# group — confirm that this is intended.
non_zero_scores = test_data[test_data['diversity_score'] > 1.3542876093347509e-07]
non_zero_scores

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
non_zero_scores['diversity_score'].hist(bins=60)
plt.xlabel('Categorical entropy')
plt.ylabel('Frequency')

In [None]:
q1 = non_zero_scores['diversity_score'].quantile(0.25)
q1

Specialist users: diversity score below the first quartile (25th percentile)

In [None]:
# Rows whose score is strictly below the first-quartile cut-off q1,
# renumbered from 0.
below_q1 = non_zero_scores['diversity_score'] < q1
specialists = non_zero_scores[below_q1].reset_index(drop=True)
# feather_path_test=base_path/ Path("MINDlarge_train/entropy_specialist_test_50.feather")
# specialists.to_feather(feather_path_test)
specialists

Generalist users: the remaining users, whose diversity score lies above the first quartile

In [None]:
# Rows at or above the first-quartile cut-off q1, renumbered from 0.
# `>=` (not `>`) so that a row whose score equals q1 is not dropped: such a row
# fails the specialists' `< q1` test and previously failed `> q1` as well.
at_or_above_q1 = non_zero_scores['diversity_score'] >= q1
generalists = non_zero_scores[at_or_above_q1].reset_index(drop=True)
# feather_path_test=base_path/ Path("MINDlarge_train/entropy_generalist_test_50.feather")
# generalists.to_feather(feather_path_test)
generalists

In [None]:
# Read the per-article embeddings and index them by article id.
embeddings_path = base_path / Path("MINDlarge_train/news_glove_embed_50.feather")
news_df = pd.read_feather(embeddings_path)
embedding_dict = dict(zip(news_df["itemId"], news_df["embedding"]))

In [None]:
# # # Diversity Dissimilarity training data
# import torch
# from torch._tensor import Tensor
# def diversity_dissimilarity(items_hist):
#     item_list_hist = [
#             embedding_dict.get(key, [])
#             for key in items_hist
#             if embedding_dict.get(key, []) is not None
#             and len(embedding_dict.get(key, [])) > 0
#         ]
#     item_tensor = [
#         torch.tensor(array, dtype=torch.float) for array in item_list_hist
#     ]
#     if len(item_tensor) >= 2:
#         tensors = item_tensor
#         n = len(tensors)
#         similarity_matrix = torch.zeros(n, n)

#         # Compute similarity matrix
#         for i in range(n):
#             for j in range(i + 1, n):
#                 similarity_matrix[i, j] = torch.dot(tensors[i], tensors[j]) / (
#                     torch.norm(tensors[i]) * torch.norm(tensors[j])
#                 )
#                 similarity_matrix[j, i] = similarity_matrix[
#                     i, j
#                 ]  # Similarity matrix is symmetric

#         # Calculate diversity
#         total_diversity = (
#             torch.sum(1 - similarity_matrix) - n
#         )  # Exclude diagonal elements
#         diversity_measure = total_diversity / ((n / 2) * (n - 1))
#     else:
#         diversity_measure = torch.tensor(0.0)

#     return diversity_measure

filtered_df_2['diversity_score'] = filtered_df_2.apply(entropy_based_diversity,axis=1)

In [None]:
filtered_df_2

In [None]:
filtered_df_2_3=filtered_df_2
filtered_df_2_3

In [None]:

# filtered_df_2_3['diversity_score'] = filtered_df_2_3['diversity_score'].astype(float)

In [None]:
filtered_df_2_3

In [None]:
feather_path=base_path/ Path("MINDlarge_train/div_entropy_50.feather")
filtered_df_2_3.to_feather(feather_path)

In [None]:
data_div=pd.read_feather(feather_path)
data_div

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(data_div['diversity_score'])
plt.show()

In [None]:
threshold = 0.5  # Replace with your desired threshold value
count_above_threshold = (data_div['diversity_score'] > threshold).sum()

print(f"Number of rows with diversity_score greater than {threshold}:", count_above_threshold)

In [None]:
data_div['diversity_score'].min()

In [None]:
data_div1 = data_div[data_div['diversity_score'] > 1.3542876093347509e-07]

In [None]:
data_div1['diversity_score'].hist(bins=60)
plt.xlabel('Categorical entropy')
plt.ylabel('Frequency')


In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Min-max scaler applied to the 'diversity_score' column of data_div.
scaler = MinMaxScaler(feature_range=(0, 2.2))  # Rescales to the range 0 to 2.2

# Fit the scaler to the 'diversity_score' column
# scaler = StandardScaler()
scaler.fit(data_div[['diversity_score']])


# Transform the data and store it in a new column named 'scaled_diversity'
# NOTE: a later cell assigns data_div['scaled_diversity'] again from
# custom_scaling, which replaces the values written here.
data_div['scaled_diversity'] = scaler.transform(data_div[['diversity_score']])
data_div

In [None]:
def custom_scaling(value, threshold=0.25, new_max=1, old_max=1):
    """Piecewise-linear rescaling of the part of the range above ``threshold``.

    Values at or below ``threshold`` are returned unchanged. Values above it
    are mapped linearly so that ``threshold`` stays fixed and ``old_max``
    lands on ``new_max``; the result is capped at ``new_max``.

    With the defaults (``old_max == new_max == 1``) the slope is exactly 1, so
    the function leaves values unchanged apart from capping them at 1.

    Args:
        value: The value to be scaled.
        threshold: Values at or below this are returned as-is.
        new_max: Target for ``old_max``; also the upper cap of the result.
        old_max: Value of the input range that is mapped onto ``new_max``
            (defaults to 1).

    Returns:
        The scaled value.

    Raises:
        ZeroDivisionError: If ``value > threshold`` and
            ``old_max == threshold``, because the slope is undefined.
    """
    if value <= threshold:
        return value  # No change for values at or below the threshold

    if old_max == threshold:
        raise ZeroDivisionError(
            "custom_scaling: old_max must differ from threshold"
        )

    # Linear map through (threshold, threshold) and (old_max, new_max)
    slope = (new_max - threshold) / (old_max - threshold)
    scaled_value = slope * (value - threshold) + threshold
    return min(scaled_value, new_max)  # Ensures no value exceeds new_max

# Assuming 'data_div' is your DataFrame
data_div['scaled_diversity'] = data_div['diversity_score'].apply(custom_scaling)

In [None]:
print(data_div['scaled_diversity'].max())
data_div['scaled_diversity'].hist(bins=60)
plt.xlabel('Values in column')
plt.ylabel('Frequency')
plt.title('Diversity score based on user history')

In [None]:
# One row per clicked article, renumbered from 0; the 'index' column (old row
# labels kept by an earlier reset_index()) is removed.
filtered_df_2_1 = (
    filtered_df_2
    .explode('click', ignore_index=True)
    .drop(columns=['index'])
)
filtered_df_2_1
# feather_path=base_path/ Path("MINDlarge_train/interaction_all.feather")
# filtered_df_2_1.to_feather(feather_path)

In [None]:
# One row per clicked article, renumbered from 0.
# errors='ignore' keeps the cell re-runnable: zero_scores is rebound here, so
# on a second run the 'index' column is already gone and a plain drop would
# raise KeyError.
zero_scores = (
    zero_scores
    .explode('click', ignore_index=True)
    .drop(columns=['index'], errors='ignore')
)
zero_scores

In [None]:
# One row per clicked article, renumbered from 0.
# errors='ignore' keeps the cell re-runnable: generalists is rebound here, so
# on a second run the 'index' column is already gone and a plain drop would
# raise KeyError.
generalists = (
    generalists
    .explode('click', ignore_index=True)
    .drop(columns=['index'], errors='ignore')
)
generalists

In [None]:
# One row per clicked article, renumbered from 0.
# errors='ignore' keeps the cell re-runnable: specialists is rebound here, so
# on a second run the 'index' column is already gone and a plain drop would
# raise KeyError.
specialists = (
    specialists
    .explode('click', ignore_index=True)
    .drop(columns=['index'], errors='ignore')
)
specialists

In [None]:
filtered_df_2_1

In [None]:
# news['num_category'] = news['category'].factorize()[0]
# article_category_map = news.set_index('itemId')['num_category'].to_dict()

# def get_clicked_category_list(row):
#     # if pd.notna(row['click_history']):
#         # Use a list comprehension to create the list of categories
#     categories = [article_category_map.get(article_id, 0) for article_id in row['click']]
#     count_categories = [categories.count(i) for i in range(0, 18)]
#     # else:
#     #     # If 'click_history' is NaN, assign a list of zeros
#     #     count_categories = []
#     return count_categories
# def get_clicked_hist_category_list(row):
#     # if pd.notna(row['click_history']):
#         # Use a list comprehension to create the list of categories
#     categories = [article_category_map.get(article_id, 0) for article_id in row['click_history']]
#     count_categories = [categories.count(i) for i in range(0, 18)]
#     # else:
#     #     # If 'click_history' is NaN, assign a list of zeros
#     #     count_categories = []
#     return count_categories

# # Apply the function to create the new column
# filtered_df_2['clicked_category_list'] = filtered_df_2.apply(get_clicked_category_list, axis=1)
# filtered_df_2['clicked_hist_category_list'] = filtered_df_2.apply(get_clicked_hist_category_list, axis=1)
# filtered_df_2

In [None]:
# # correlation_matrix = filtered_df_2['clicked_category_list'].apply(pd.Series).corrwith(filtered_df_2['clicked_hist_category_list'].apply(pd.Series))
# correlation_matrix = filtered_df_2.apply(lambda x: pd.Series(x['clicked_category_list']).corr(pd.Series(x['clicked_hist_category_list'])), axis=1)
# # Print the correlation matrix
# print("Correlation Matrix:")
# print(correlation_matrix)

In [None]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# sns.heatmap([correlation_matrix], annot=True, cmap='coolwarm', fmt=".2f", linewidths=.2)
# plt.title('Correlation Matrix')
# plt.show()

In [None]:
news[news['itemId'] == 'N25587']

In [None]:
# grouped_filtered_df_2 = filtered_df_2_1.sort_values(by='timestamp').groupby('userId').agg({
#     'impressions': lambda x: list(x),
#     'timestamp': lambda x: list(x),
#     'click_history': lambda x: list(x),
#     'click': lambda x: list(x),
#     'presented_slate': lambda x: list(x),   
# }).reset_index()
# grouped_filtered_df_2

In [None]:
# max_length = grouped_filtered_df_2['click'].apply(len).max()
# mean_length = grouped_filtered_df_2['click'].apply(len).mean()
# min_length = grouped_filtered_df_2['click'].apply(len).min()
# mode_length = grouped_filtered_df_2['click'].apply(len).mode().iloc[0]
# median_length = grouped_filtered_df_2['click'].apply(len).median()

# # Print the results
# print("Max length:", max_length)
# print("Mean length:", mean_length)
# print("Min length:", min_length)
# print("Mode length:", mode_length)
# print("Median length:", median_length)

In [None]:
# grouped_filtered_df_2_t = filtered_df_2_1.sort_values(by='timestamp').groupby('timestamp').agg({
#     'impressions': lambda x: list(x),
#     'click_history': lambda x: list(x),
#     'click': lambda x: list(x),
#     'presented_slate': lambda x: list(x),   
# }).reset_index()
# grouped_filtered_df_2_t

In [None]:
# max_length = grouped_filtered_df_2_t['timestamp'].apply(len).max()
# mean_length = grouped_filtered_df_2_t['timestamp'].apply(len).mean()
# min_length = grouped_filtered_df_2_t['timestamp'].apply(len).min()
# mode_length = grouped_filtered_df_2_t['timestamp'].apply(len).mode().iloc[0]
# median_length = grouped_filtered_df_2_t['timestamp'].apply(len).median()

# # Print the results
# print("Max length:", max_length)
# print("Mean length:", mean_length)
# print("Min length:", min_length)
# print("Mode length:", mode_length)
# print("Median length:", median_length)

In [None]:
# NOTE: these three statements repeat, unchanged, the category mapping that is
# already built near the top of the notebook.
# Distinct category labels, in order of first appearance.
categories = news['category'].unique()
# Integer code per category label.
news["num_category"] = news["category"].factorize()[0]
# Lookup table: article id -> integer category code.
article_category = news.set_index("itemId")["num_category"].to_dict()

In [None]:
# category_num_mapping = {category: num for num, category in enumerate(news['category'].unique())}

# # Create a new column 'num' in the news DataFrame based on the mapping
# news['num_category'] = news['category'].map(category_num_mapping)

In [None]:
# category_num_mapping

In [None]:
# for r in filtered_df_2_1[0]:
#     print(r)

In [None]:
# news['num_category'] = news['category'].factorize()[0]
# article_category_map = news.set_index('itemId')['num_category'].to_dict()
# # def get_category_list(row):
# #     if pd.notna(row['click_history']):
# #         # Use a list comprehension to create the list of categories
# #         categories=[0] * 18    
# #         for article_id in row['click_history'].split():
            
# #             i=article_category_map.get(article_id, -1)
# #             if i!=-1:
# #                 categories[i]+=1
# #     else:
# #         # If 'click_history' is NaN, assign an empty list
# #         categories = []
# #     return categories
# def get_category_list(row):
#     if pd.notna(row['click_history']):
#         # Use a list comprehension to create the list of categories
#         categories = [article_category_map.get(article_id, 0) for article_id in row['click_history'].split()]
#         count_categories = [categories.count(i) for i in range(0, 18)]
#     else:
#         # If 'click_history' is NaN, assign a list of zeros
#         count_categories = []
#     return count_categories

# # Apply the function to create the new column
# filtered_df_2_1['category_list'] = filtered_df_2_1.apply(get_category_list, axis=1)


In [None]:
filtered_df_2_1

In [None]:
# article_category_map.get('N45706', 0)

In [None]:
# feather_path=base_path/ Path("MINDlarge_train/category.feather")
# filtered_df_2_1.to_feather(feather_path)

In [None]:
# print(article_category_map.get('N88753', 0))
# article_category_map
# for article_id in filtered_df_2_1['click_history']:
#     print(article_id)
#     print(type(article_id))
#     print(article_category_map.get(article_id, 0))
#     break

In [None]:
filtered_df_2

In [None]:
# import torch
# def sampled_user(count_list):
#     probabilities_tensor = torch.tensor(count_list, dtype=torch.float32)
#     probabilities_normalized = probabilities_tensor / probabilities_tensor.sum()

#     # Sample a value based on the probabilities
#     sampled_value = torch.multinomial(probabilities_normalized, 1).item()

#     # Create the sampled tensor
#     sampled_list_tensor = torch.zeros_like(probabilities_tensor, dtype=torch.float32)
#     sampled_list_tensor[sampled_value] = 1
#     return sampled_list_tensor


In [None]:
# sampled_user(filtered_df_2_1['category_list'].loc[0])

In [None]:
# user_state=torch.Tensor(filtered_df_2_1['category_list'].loc[0])
# user_state.shape
# print(user_state)


In [None]:
items=filtered_df_2_1['presented_slate'].loc[0]
items

In [None]:
# import random
# item_list = [article_category_map.get(key, []) for key in items]

# remaining_items = 30 - len(item_list)
# additional_values = random.choices(list(set(range(18)) - set(item_list)), k=remaining_items)

# item_list.extend(additional_values)

# random.shuffle(item_list)


In [None]:
# set(range(18))

In [None]:
# length_of_list = len(item_list)

# # Create a list of tensors
# tensor_list = [torch.eye(19)[value] for value in item_list]

# # Print the result
# for i, tensor in enumerate(tensor_list):
#     print(f"Tensor for value {item_list[i]}:\n{tensor}")

In [None]:
user_id = filtered_df_2_1['userId'].loc[5]
timestamp = filtered_df_2_1['timestamp'].loc[5]
clicked_items = filtered_df_2_1[(filtered_df_2_1['userId'] == user_id) & (filtered_df_2_1['timestamp'] == timestamp)]['click'].tolist()
print(clicked_items)



In [None]:
# import numpy as np
# numpy_array = np.copy(
#            filtered_df_2_1["category_list"].loc[20176]
#         )
# user_state = torch.Tensor(numpy_array)
# print(user_state)

In [None]:
filtered_df_2_1.loc[35683]

Load the GloVe embeddings from the feather file saved in article_data.ipynb

In [None]:
# NOTE: this repeats the embedding load performed in an earlier cell; it reads
# the same file and rebuilds the same article id -> embedding mapping.
news_df = pd.read_feather(
            base_path / Path("MINDlarge_train/news_glove_embed_50.feather")
        )
# Article id -> embedding, taken from the 'itemId' and 'embedding' columns.
embedding_dict = dict(zip(news_df["itemId"], news_df["embedding"]))


In [None]:
filtered_df_2_1

In [None]:
def user_embedding(row):
    """Average the embeddings of the articles in ``row['click_history']``.

    Ids that are absent from ``embedding_dict``, or whose stored embedding is
    ``None``, are skipped.

    Returns:
        The sum of the remaining embeddings divided by their count, or
        ``None`` when no embedding is left.
    """
    vectors = []
    for item_id in row['click_history']:
        if item_id in embedding_dict:
            vector = embedding_dict[item_id]
            if vector is not None:
                vectors.append(vector)

    if not vectors:
        return None
    return sum(vectors) / len(vectors)

# Apply the function to create the new column
# filtered_df_2_1['observed_state'] = filtered_df_2_1.apply(user_embedding, axis=1)
zero_scores['observed_state'] = zero_scores.apply(user_embedding, axis=1)
generalists['observed_state'] = generalists.apply(user_embedding, axis=1)
specialists['observed_state'] = specialists.apply(user_embedding, axis=1)

In [None]:
filtered_df_2_1

In [None]:
filtered_df_2_1

In [None]:
# True when at least one slate holds fewer than two items (this includes empty
# slates). The messages below now describe this check; they previously spoke
# only of empty lists although the condition is `len(x) < 2`.
has_empty_list = filtered_df_2_1['presented_slate'].apply(len).lt(2).any()

if has_empty_list:
    print("The 'presented_slate' column contains at least one slate with fewer than 2 items.")
else:
    print("Every slate in the 'presented_slate' column has at least 2 items.")

In [None]:
# feather_path=base_path/ Path("MINDlarge_train/interaction_all_50.feather")
# filtered_df_2_1.to_feather(feather_path)

In [None]:
import numpy as np
num_rows = len(filtered_df_2_1)
# Seeded generator so that the same row is inspected on every run; the upper
# bound is exclusive, as it was with np.random.randint.
rng = np.random.default_rng(42)
random_index = int(rng.integers(0, num_rows))
items = filtered_df_2_1["presented_slate"].loc[random_index]

# Embedding for each slate item; ids without an entry map to an empty list.
item_list = [embedding_dict.get(key, []) for key in items]

In [None]:
len(item_list)

In [None]:
k = 100

# Every stored embedding, and every slate embedding, as NumPy arrays.
all_vectors = [np.array(vector) for vector in embedding_dict.values()]
item_list_arrays = [np.array(vector) for vector in item_list]

# Keep the embeddings that are not element-wise equal to any slate embedding.
available_vectors = []
for vector in all_vectors:
    in_slate = False
    for item in item_list_arrays:
        if np.array_equal(vector, item):
            in_slate = True
            break
    if not in_slate:
        available_vectors.append(vector)

# Fewer than k candidates left: stop rather than continue with too few.
if len(available_vectors) < k:
    raise ValueError("Not enough available vectors to select k random vectors.")


In [None]:
filtered_df_2_1

In [None]:
column_name = 'click_history'
empty_values_mask = filtered_df_2_1[column_name].apply(lambda x: isinstance(x, list) and len(x) == 0)


# Display rows with empty or None values in the specified column
rows_with_empty_values = filtered_df_2_1[empty_values_mask]
print(rows_with_empty_values)

In [None]:
embedding_dict_1 = {key: value for key, value in embedding_dict.items() if value is not None}
has_none_values = any(value is None for value in embedding_dict_1.values())
print("Dictionary has None values:", has_none_values)

In [None]:
# NOTE: this rebinds the module-level valid_article_ids (previously the ids
# from news.tsv) to the ids that have a non-None embedding. Functions that
# read valid_article_ids look it up at call time, so they use the new set from
# here on.
valid_article_ids = set(embedding_dict_1.keys())

# NOTE: same name and body as the filter_click_list defined earlier in the
# notebook; this definition replaces it.
def filter_click_list(click_list):
    """Return the items of ``click_list`` that are in ``valid_article_ids``."""
    return [item for item in click_list if item in valid_article_ids]




# filtered_df_2_1['presented_slate'] = filtered_df_2_1['presented_slate'].apply(filter_click_list)
# filtered_df_2_1['click_history'] = filtered_df_2_1['click_history'].apply(filter_click_list)
# Drop ids without an embedding from the slate and history of each test group.
zero_scores['presented_slate'] = zero_scores['presented_slate'].apply(filter_click_list)
zero_scores['click_history'] = zero_scores['click_history'].apply(filter_click_list)
generalists['presented_slate'] = generalists['presented_slate'].apply(filter_click_list)
generalists['click_history'] = generalists['click_history'].apply(filter_click_list)
specialists['presented_slate'] = specialists['presented_slate'].apply(filter_click_list)
specialists['click_history'] = specialists['click_history'].apply(filter_click_list)

# filtered_df_2_1[filtered_df_2_1['click'].apply(lambda x: any(item in valid_article_ids for item in x))]
# filtered_filtered_df_2_1 = filtered_filtered_df_2_1[filtered_filtered_df_2_1['presented_slate'].apply(lambda x: any(item in valid_article_ids for item in x))]
# filtered_filtered_df_2_1 = filtered_df_2_1[filtered_df_2_1['click'].apply(lambda x: len(x) > 0)]
# filtered_df_2_1

In [None]:
filtered_df_2_1['click'].dtype

In [None]:
# filtered_df_2_2 = filtered_df_2_1[filtered_df_2_1['click'].isin(valid_article_ids)]
zero_scores1 = zero_scores[zero_scores['click'].isin(valid_article_ids)]
generalists1 = generalists[generalists['click'].isin(valid_article_ids)]
specialists1 = specialists[specialists['click'].isin(valid_article_ids)]

In [None]:
zero_scores1.reset_index(inplace=True, drop=True)
generalists1.reset_index(inplace=True, drop=True)
specialists1.reset_index(inplace=True, drop=True)
# filtered_df_2_2.reset_index(inplace=True, drop=True)

In [None]:
zero_scores1

In [None]:
feather_path_test=base_path/ Path("MINDlarge_train/entropy_generalist_test_50.feather")
generalists1.to_feather(feather_path_test)

In [None]:
feather_path_test=base_path/ Path("MINDlarge_train/entropy_specialist_test_50.feather")
specialists1.to_feather(feather_path_test)

In [None]:
feather_path_test=base_path/ Path("MINDlarge_train/entropy_coldstart_test_50.feather")
zero_scores1.to_feather(feather_path_test)

In [None]:
specialists1

In [None]:
# test_data1.loc[30862]

In [None]:
# generalist_test = test_data1[test_data1['diversity_score'] > 0.4]

# # Filter rows where 'diversity_score' is less than or equal to 0.4
# specialist_test = test_data1[test_data1['diversity_score'] <= 0.4]

In [None]:
generalists.reset_index(inplace=True, drop=True)
specialists.reset_index(inplace=True, drop=True)

In [None]:

# feather_path=base_path/ Path("MINDlarge_train/interaction_all_50.feather")
# filtered_df_2_2.to_feather(feather_path)
# feather_path_test=base_path/ Path("MINDlarge_train/generalist_test_50.feather")
# generalist_test.to_feather(feather_path_test)
# feather_path_test1=base_path/ Path("MINDlarge_train/specialist_test_50.feather")
# specialist_test.to_feather(feather_path_test1)

In [None]:
# feather_path=base_path/ Path("MINDlarge_train/interaction_all_50.feather")
# filtered_df_2_2.to_feather(feather_path)

In [None]:
training_data=filtered_df_choice_data.loc[:, ["userId", "click_history"]]

In [None]:
embedding_dict_1

In [None]:
training_data = training_data.drop_duplicates(subset=["userId"])
training_data = training_data.reset_index(drop=True)
training_data

In [None]:
# import random

# all_items=set(embedding_dict_1.keys())
# # Reset the index

# def get_negative_items(click_history):
#     click_set = set(click_history)
#     return np.random.choice(list(all_items - click_set), size=10, replace=False)

# # Apply the function to the click_history column
# training_data['negative_items'] = training_data['click_history'].apply(get_negative_items)



# Drop the original click_history column
# training_data = training_data.drop(columns=["click_history"])

In [None]:
training_data

In [None]:
def split_click_history(click_history):
    """Separate the first entry of a click history from the rest.

    Returns:
        ``(first, rest)`` where ``first`` is ``click_history[0]`` and ``rest``
        is ``click_history[1:]``; ``(None, [])`` for an empty history.
    """
    if len(click_history) == 0:
        return None, []
    return click_history[0], click_history[1:]

# Apply the function to the click_history column
training_data[["user_embedding", "click_history"]] = training_data["click_history"].apply(split_click_history).apply(pd.Series)

In [None]:
training_data

In [None]:
training_data=training_data.explode('click_history', ignore_index=True)


In [None]:
training_data

In [None]:
training_data['click'] = 1

In [None]:
training_sample=training_data.sample(n=80000, random_state=1)

In [None]:
embedding_dict_1

In [None]:
num_users = len(set(u for u in training_sample['userId']))
num_items = len(set(i for i in training_sample['click_history']))

In [None]:
training_sample

In [None]:
interactions=training_sample[['user_embedding','click_history']].reset_index(drop=True)
interactions['click'] = 1

In [None]:
interactions

In [None]:
print(interactions['user_embedding'].nunique())

In [None]:
# import random
# all_items=set(embedding_dict_1.keys())
# additional_item_ids = all_items - set(interactions.loc[interactions['click'] == 1, 'click_history'])
# additional_rows = []

# # For each unique user ID, randomly select k item IDs from additional item IDs
# k = 10  # Example value of k
# for user_id in interactions['user_embedding'].unique():
#     additional_item_ids_for_user = random.sample(list(additional_item_ids), k)
#     for additional_item_id in additional_item_ids_for_user:
#         additional_rows.append({'user_embedding': user_id, 'click_history': additional_item_id, 'click': 0})

# # Concatenate original DataFrame and additional DataFrame
# additional_df = pd.DataFrame(additional_rows)
# interactions1 = pd.concat([interactions, additional_df], ignore_index=True)


In [None]:
# interactions2 = interactions1.sample(frac=1).reset_index(drop=True)
# interactions2
# user_item_interactions = list(interactions.itertuples(index=False, name=None))

In [None]:
# user_item_interactions

In [None]:
# positive_interactions = {}
# for user, item in user_item_interactions:
#     if user not in positive_interactions:
#         positive_interactions[user] = []
#     positive_interactions[user].append(item)

In [None]:
all_items=set(embedding_dict_1.keys())

In [None]:
# user_item_pairs = []
# for user, pos_items in positive_interactions.items():
#     for pos_item in pos_items:
#         # Positive interaction
#         # user_item_pairs.append((user, pos_item))
        
#         # Negative interactions (randomly sampled)
#         neg_items = all_items - set(pos_items)  # All items not interacted with
#         neg_item = np.random.choice(list(neg_items))  # Randomly select a negative item
#         user_item_pairs.append((user, pos_item, neg_item))

In [None]:
# user_item_pairs

In [None]:
# user_item_pairs_list = []
# embedding_size=50
# default_embedding = np.zeros(embedding_size)
# for triple in user_item_pairs:
#     triple_with_values = tuple(embedding_dict_1.get(item_id,default_embedding) for item_id in triple)
#     user_item_pairs_list.append(triple_with_values)

In [None]:
# user_item_pairs_list

In [None]:
# import numpy as np
# import pandas as pd
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from torch.utils.data import DataLoader, Dataset
# from torch.utils.data import Dataset

# # Define the Neural Collaborative Filtering (NCF) model
# class NCF(nn.Module):
#     def __init__(self, num_users, num_items, embedding_dim=64, hidden_dim=64):
#         super(NCF, self).__init__()
#         self.user_embedding = nn.Embedding(num_users, embedding_dim)
#         self.item_embedding = nn.Embedding(num_items, embedding_dim)
#         self.fc_layers = nn.Sequential(
#             nn.Linear(2 * embedding_dim, hidden_dim),
#             nn.ReLU(),
#             nn.Linear(hidden_dim, 1)
#         )

#     def forward(self, user_ids, item_ids,train=True):
#         # print(user_ids)
#         user_embeds = user_ids #torch.tensor([embedding_dict_1[item_id] for item_id in user_ids])
#         item_embeds = item_ids#torch.tensor([embedding_dict_1[item_id] for item_id in item_ids])  # Retrieve item embeddings
#         if train:
#             concat_embeds = torch.cat([user_embeds, item_embeds], dim=1)
#         else:
#             concat_embeds = torch.cat([user_embeds, item_embeds], dim=0)
#         output = self.fc_layers(concat_embeds)
#         return output.squeeze()

# # Preprocess data
# # user_encoder = LabelEncoder()
# # item_encoder = LabelEncoder()
# # training_sample['user_embedding'] = user_encoder.fit_transform(training_sample['user_embedding'])
# # training_sample['item_id'] = item_encoder.fit_transform(training_sample['click_history'])
# num_users = len(set(interactions['user_embedding']))
# num_items = len(set(interactions['click_history']))

# # Split data into train and test sets
# # train_df, test_df = train_test_split(interactions2, test_size=0.2, random_state=42)


# class DataFrameDataset(Dataset):
#     def __init__(self, dataframe):
#         self.data = dataframe

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         row = self.data.iloc[idx]
#         # print(row['user_embedding'])
#         default_embedding = torch.zeros(50, dtype=torch.float32)
#         user_id = torch.tensor(embedding_dict_1.get(row['user_embedding'],default_embedding))
#         item_id = torch.tensor(embedding_dict_1.get(row['click_history'],default_embedding))
#         rating = torch.tensor(row['click']).to(torch.float32)
#         return {'user_id': user_id, 'item_id': item_id, 'rating': rating}
    
# dataset=DataFrameDataset(interactions2)
# train_loader = DataLoader(dataset, batch_size=64, shuffle=True)




# # Instantiate the NCF model
# model = NCF(num_users=num_users, num_items=num_items, embedding_dim=50, hidden_dim=50)

# # Define loss function and optimizer
# criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss for binary classification
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop
# # for batch in train_loader:
# #     print(batch)
# num_epochs = 1
# for epoch in range(num_epochs):
#     model.train()
#     for batch in train_loader:
      
#         user_ids = batch['user_id']
        
#         item_ids = batch['item_id']
#         ratings = batch['rating']
        
#         optimizer.zero_grad()
#         outputs = model(user_ids, item_ids)
#         # print(outputs.dtype)
#         # print(ratings.dtype)
#         loss = criterion(outputs, ratings)
#         loss.backward()
#         optimizer.step()
#     print(f"epoch: {epoch}, loss: {loss}")

# # Example of making predictions
# model.eval()
# with torch.no_grad():
    
#     user_embedding = torch.tensor(embedding_dict_1.get(interactions.loc[656,'user_embedding'])) # Example user ID
#     item_id =torch.tensor(embedding_dict_1.get(interactions.loc[659,'click_history']))  # Example item ID
#     rating_prediction = torch.sigmoid(model(user_embedding, item_id,train=False))
#     print(rating_prediction)



In [None]:
# interactions2

In [None]:
# num_ones = (interactions2['click'] == 1).sum()

# print("Number of 1's in the 'click' column:", num_ones)

In [None]:
# feather_path=base_path/ Path("MINDlarge_train/choice_model_data.feather")
# interactions2.to_feather(feather_path)