# Simulation of clicks 

For each LOINC query, the system simulates user interaction by generating a series of clicks on the ranked results. The simulation assumes that users are more likely to click on documents with higher similarity to the query, reflecting realistic behavior where relevant results attract more attention. Each click is drawn at random with probability proportional to the document's similarity score, so more similar (and therefore higher-ranked) documents have a greater chance of being selected. The cumulative click counts for each document are then used to adjust its relevance score, allowing the ranking model to incorporate implicit feedback and better reflect potential user preferences over time.

Possible queries:
- glucose in blood
- bilirubin in plasma
- White blood cells count

In [37]:
query = "White blood cells count"

import os
import pandas as pd
import numpy as np
import random

# Load the starting ranking for the selected query from the "data" folder.
# The code below relies on the workbook having a 'similarity' column.
query_filename = f"starting ranking {query}.xlsx"
file_path = os.path.join("data", query_filename)

ranking_df = pd.read_excel(file_path)

print(ranking_df.head())


# Make sure similarity values are non-negative: negative values are clipped
# to zero because they cannot be used as sampling weights.
similarities = ranking_df['similarity'].clip(lower=0)

# Without at least one positive similarity the normalization below would
# divide by zero (or operate on an empty ranking), so fail with a clear error.
total_similarity = similarities.sum()
if not total_similarity > 0:
    raise ValueError(
        f"Cannot simulate clicks for {file_path!r}: the ranking is empty "
        "or has no positive similarity values."
    )

# Normalize similarity to sum to 1 for probabilities
probabilities = similarities / total_similarity

# Number of total simulated clicks
num_clicks = 1000


# Fixed seed so the simulated clicks are reproducible between runs
random.seed(2)
# Simulate clicks: choose row positions according to probabilities
chosen_indices = random.choices(
    population=range(len(ranking_df)),
    weights=probabilities.tolist(),
    k=num_clicks
)

# Count clicks for each document; minlength keeps a zero entry for documents
# that were never chosen so the array lines up with the DataFrame rows
click_counts = np.bincount(chosen_indices, minlength=len(ranking_df))

# Assign the simulated clicks to the DataFrame
ranking_df['simulated_clicks'] = click_counts

# Check result
print(ranking_df)



                                    long_common_name loinc_num  similarity
0  ABO group [Type] in Blood from Blood product u...   14578-9    0.664988
1  Blood group antibody screen [Presence] in Seru...     890-4    0.663357
2  Blood product unit ID [#] Blood product unit I...     934-0    0.588803
3    Blood product type Blood product type ^BPU Type     933-2    0.569553
4  Blood product disposition [Type] Blood product...     925-8    0.558776
                                     long_common_name loinc_num  similarity  \
0   ABO group [Type] in Blood from Blood product u...   14578-9    0.664988   
1   Blood group antibody screen [Presence] in Seru...     890-4    0.663357   
2   Blood product unit ID [#] Blood product unit I...     934-0    0.588803   
3     Blood product type Blood product type ^BPU Type     933-2    0.569553   
4   Blood product disposition [Type] Blood product...     925-8    0.558776   
..                                                ...       ...         ... 

In [38]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the simulated click counts so they are comparable with similarity
scaler = MinMaxScaler()
clicks_normalized = scaler.fit_transform(ranking_df[['simulated_clicks']])

# Blend the two signals: similarity contributes 70%, normalized clicks 30%
similarity_part = 0.7 * ranking_df['similarity']
click_part = 0.3 * clicks_normalized.flatten()
ranking_df['new_relevance'] = similarity_part + click_part

# Re-rank: highest blended relevance first, then renumber the rows from 0
ranking_df = ranking_df.sort_values(by='new_relevance', ascending=False)
ranking_df = ranking_df.reset_index(drop=True)

# Check the top results
print(ranking_df.head())


# Output folder; create it if it does not exist
folder_path = "data"
os.makedirs(folder_path, exist_ok=True)

# Full path of the output file
output_path = os.path.join(folder_path, f"click ranking {query}.xlsx")

ranking_df.to_excel(output_path, index=False)


                                    long_common_name loinc_num  similarity  \
0  ABO group [Type] in Blood from Blood product u...   14578-9    0.664988   
1  Blood group antibody screen [Presence] in Seru...     890-4    0.663357   
2       ABO group [Type] in Blood ABO group Bld Type     883-9    0.558034   
3    Blood product type Blood product type ^BPU Type     933-2    0.569553   
4  Blood product disposition [Type] Blood product...     925-8    0.558776   

   simulated_clicks  new_relevance  
0                26       0.754380  
1                26       0.753239  
2                27       0.690624  
3                24       0.665354  
4                24       0.657810  
