# Baseline Experiment - all-MiniLM-L6-v2 - Cosine Similarity

- Using all-MiniLM-L6-v2 and Cosine Similarity as the baseline - using inclusion/exclsuion criteria in protocol as query

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import clizod_ranker as cr

In [3]:
import os
import re
import shutil
import pandas as pd
import re
import numpy as np
import json
import time
import datetime

from random import randrange
from string import Template
from os.path import join, exists
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModel
import torch
import ast
from sklearn.metrics.pairwise import cosine_similarity

## Load Data
Loading the data cleaned from classifier-sanitize.ipynb notebook.

In [4]:
input_file_path = '../data/data_01_cleaned.csv'
df_sample = pd.read_csv(input_file_path)

print(f"There are {len(df_sample)} rows in this dataset.")
df_sample.head(3)

There are 2905 rows in this dataset.


Unnamed: 0,tag,disease,variable,target_hash,target,review,reference,id
0,0.0,cchf,rainfall,1266833a0c9bd183b54db85128f28024,Scientific Opinion on the Role of Tick Vectors...,1.0,Journal Article,0
1,0.0,cchf,rainfall,c025b214c7370b24a3776e85f6cd285b,A survey of rift valley fever and associated r...,0.0,Journal Article,1
2,0.0,cchf,rainfall,273c3107270be7343e0e43692cde82c3,Occurrence of Rickettsia spp. and Coxiella bur...,0.0,Journal Article,2


Only using rainfall and excluding dengue

In [5]:
# Unique combinations of disease and variable
df_combs = df_sample[['disease','variable']].drop_duplicates().reset_index(drop=True)
df_combs

Unnamed: 0,disease,variable
0,cchf,rainfall
1,ebola,rainfall
2,rvf,rainfall
3,lepto,rainfall


Subsetting the data for development vs full runs


## Setup prompts

In [6]:
template_reference  = {
    'cchf-rainfall': {
        'topic': 'Impact of Climate Change on CCHF: A Focus on Rainfall',
        'inclusion': "Primary research or meta-analysis\n Assesses the relationship between the rainfall and either:\n Crimean-Congo haemorrhagic fever (CCHF) incidence or prevalence\n Pathogen survival\nTransmission\nVirulence\nDemonstrated vector or maintenance host survival, development or distribution"
    },
    'ebola-rainfall': {
        'topic': 'Impact of Climate Change on Ebola: A Focus on Rainfall',
        'inclusion': "Primary research or meta-analysis\n Assesses the relationship between the rainfall and either:\n Ebola or Marburg incidence or prevalence\n Pathogen survival\nTransmission\nVirulence\nDemonstrated vector or maintenance host survival, development or distribution"
    },
    'lepto-rainfall': {
        'topic': 'Impact of Climate Change on Leptospirosis: A Focus on Rainfall',
        'inclusion': "Primary research or meta-analysis\n Assesses the relationship between the rainfall and either:\n Leptospirosis incidence or prevalence\n Pathogen survival\nTransmission\nVirulence\nDemonstrated vector or maintenance host survival, development or distribution"
    },
    'rvf-rainfall': {
        'topic': 'Impact of Climate Change on Rift Valley Fever Virus: A Focus on Rainfall',
        'inclusion': "Primary research or meta-analysis\n Assesses the relationship between the rainfall and either:\n Rift Valley Fever (RVF) virus incidence or prevalence\n Pathogen survival\nTransmission\nVirulence\nDemonstrated vector or maintenance host survival, development or distribution"
    }   
}

## Set up experiment directory and API calls

In [7]:
exp_root_dir = "../experiments/"
templates_dir = join(exp_root_dir, "templates")
results_root_dir = join(exp_root_dir, "results")
reports_root_dir = join(exp_root_dir, "reports")


## Run the model

In [8]:
# Dictionary of model names and their Hugging Face model IDs
MODEL_DICT = {
    'SBERT': 'sentence-transformers/all-MiniLM-L6-v2'
}

In [9]:
def load_model(model_name):
    model_id = MODEL_DICT[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)
    return tokenizer, model

def get_embedding(text, tokenizer, model):
    # Return a PyTorch tensor, truncate the input if necessary and add padding if shorter than max length
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def min_max_normalize(scores):
    min_score = min(scores)
    max_score = max(scores)
    if max_score == min_score: # to avoid division by zero
        return [1.0]*len(scores) if max_score != 0 else [0.0]*len(scores)

    return [(score - min_score) / (max_score - min_score) for score in scores]

In [11]:
exp_start_time = time.time()
tokenizer, model = load_model('SBERT')
df_results = pd.DataFrame()
for index, crow in df_combs.iterrows():
        
        disease = crow['disease']
        variable = crow['variable']
        keywords = f"{disease}-{variable}"
        
        #Grab just the rows for the given disease and variable
        df_sub = df_sample.query(f"disease == '{disease}' & variable == '{variable}'")
        data_embedding = get_embedding(df_sub['target'].tolist(), tokenizer, model)
  
        entries = template_reference[keywords]

        query_text = f"{entries['topic']} {entries['inclusion']}"

        query_embedding = get_embedding([query_text], tokenizer, model)
    
        #reshape to a (1, 384)
        question_embedding = query_embedding.reshape(1, -1)
            
        print(f"  Calculating cosine similarity for query_embedding {disease} - {variable}  -> data = {data_embedding.shape} query embedding = {query_embedding.shape}")

        cosine_similarities = cosine_similarity(data_embedding, question_embedding)

        normalized_scores = min_max_normalize(cosine_similarities.flatten())    
    
        df_scores = pd.DataFrame({ 'score':  normalized_scores } )
        print(f"Processed {disease}-{variable}: dims for sub = {df_sub.shape} score = {df_scores.shape}")

        #df_sub = df_sub.reset_index().rename(columns={"index":"id"})
        df_sub = df_sub.reset_index()
        df_scores.reset_index(drop=True, inplace=True)
        
        df_sub = pd.concat([df_sub, df_scores], axis=1)
    
        df_results = pd.concat([df_results, df_sub], axis=0)
    
elapsed_time = time.time() - exp_start_time
print(f"All completed in {str(datetime.timedelta(seconds=elapsed_time))}.")
print(f"Records count {df_results.shape}")

  Calculating cosine similarity for query_embedding cchf - rainfall  -> data = (454, 384) query embedding = (384,)
Processed cchf-rainfall: dims for sub = (454, 8) score = (454, 1)
  Calculating cosine similarity for query_embedding ebola - rainfall  -> data = (915, 384) query embedding = (384,)
Processed ebola-rainfall: dims for sub = (915, 8) score = (915, 1)
  Calculating cosine similarity for query_embedding rvf - rainfall  -> data = (537, 384) query embedding = (384,)
Processed rvf-rainfall: dims for sub = (537, 8) score = (537, 1)
  Calculating cosine similarity for query_embedding lepto - rainfall  -> data = (999, 384) query embedding = (384,)
Processed lepto-rainfall: dims for sub = (999, 8) score = (999, 1)
All completed in 0:06:07.716805.
Records count (2905, 10)


## Process results

### Read the results

### Calculate Ranking

In [12]:
df_comb_results = df_results.copy()
df_comb_results['experiment'] = 'h-05-baseline-protocol'
df_comb_results['model'] = 'all-MiniLM-L6-v2'
df_comb_results.head(4)

Unnamed: 0,index,tag,disease,variable,target_hash,target,review,reference,id,score,experiment,model
0,0,0.0,cchf,rainfall,1266833a0c9bd183b54db85128f28024,Scientific Opinion on the Role of Tick Vectors...,1.0,Journal Article,0,0.708138,h-05-baseline-protocol,all-MiniLM-L6-v2
1,1,0.0,cchf,rainfall,c025b214c7370b24a3776e85f6cd285b,A survey of rift valley fever and associated r...,0.0,Journal Article,1,0.646057,h-05-baseline-protocol,all-MiniLM-L6-v2
2,2,0.0,cchf,rainfall,273c3107270be7343e0e43692cde82c3,Occurrence of Rickettsia spp. and Coxiella bur...,0.0,Journal Article,2,0.376826,h-05-baseline-protocol,all-MiniLM-L6-v2
3,3,0.0,cchf,rainfall,e0873fc8c6aaff839a4c90eeb31bbf6f,Multisectoral Perspectives on Global Warming a...,0.0,Journal Article,3,0.571226,h-05-baseline-protocol,all-MiniLM-L6-v2


In [13]:
selected_columns = ['id', 'experiment', 'model', 'disease', 'variable', 'tag', 'score']
df_rerank = df_comb_results.loc[:,selected_columns]
df_rerank.head(5)

Unnamed: 0,id,experiment,model,disease,variable,tag,score
0,0,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,0.0,0.708138
1,1,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,0.0,0.646057
2,2,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,0.0,0.376826
3,3,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,0.0,0.571226
4,4,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,0.0,0.492243


In [14]:
df_rerank = cr.apply_len_tie_breaker(df_sample, df_rerank, 'score')
df_rerank.head(5)

Unnamed: 0,id,experiment,model,disease,variable,tag,score,target,target_len,ranking
0,20,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,1.0,1.0,Crimean-Congo hemorrhagic fever and its relati...,1657,1
1,19,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,1.0,0.911071,Predicting CCHF incidence and its related fact...,1691,2
2,441,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,1.0,0.906651,"Temporal tendency, seasonality and relationshi...",2385,3
3,221,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,1.0,0.905035,Climatic prerequisites for changing activity i...,2414,4
4,125,h-05-baseline-protocol,all-MiniLM-L6-v2,cchf,rainfall,0.0,0.892276,Crimean–Congo Haemorrhagic Fever (CCHF) in ani...,2232,5


## Review Document

Creating separate files for each experiment

In [15]:
df_review = df_rerank.copy()
print(df_review.shape)
groups = df_review.groupby(["model", "experiment"])
for key, data in groups:
    print(f"{key[0]} - {key[1]}")

    #reporting
    df_reporting = df_review[['id', 'experiment', 'model', 'disease', 'variable', 'tag', 'score', 'ranking']].copy()
    df_reporting.to_csv(reports_root_dir + f'/report_{key[0]}_{key[1]}.csv', header=True, index=False, encoding='utf-8')

(2905, 10)
all-MiniLM-L6-v2 - h-05-baseline-protocol
