# Baseline Experiment - BM25

- Uses BM25 as the baseline, with the inclusion/exclusion criteria from the protocol as the query.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import clizod_ranker as cr

In [3]:
import os
import re
import shutil
import pandas as pd
import re
import numpy as np
import json
import time
import datetime


from random import randrange
from string import Template
from os.path import join, exists
from sklearn.model_selection import train_test_split

from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK English stop-word corpus that preprocess() reads through
# stopwords.words('english'); must run before any text is preprocessed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load Data
Load the cleaned data produced by the `classifier-sanitize.ipynb` notebook.

In [4]:
# Read the cleaned dataset from disk, report its size and preview the first rows.
input_file_path = '../data/data_01_cleaned.csv'
df_sample = pd.read_csv(filepath_or_buffer=input_file_path)

n_rows = len(df_sample)
print(f"There are {n_rows} rows in this dataset.")
df_sample.head(n=3)

There are 2905 rows in this dataset.


Unnamed: 0,tag,disease,variable,target_hash,target,review,reference,id
0,0.0,cchf,rainfall,1266833a0c9bd183b54db85128f28024,Scientific Opinion on the Role of Tick Vectors...,1.0,Journal Article,0
1,0.0,cchf,rainfall,c025b214c7370b24a3776e85f6cd285b,A survey of rift valley fever and associated r...,0.0,Journal Article,1
2,0.0,cchf,rainfall,273c3107270be7343e0e43692cde82c3,Occurrence of Rickettsia spp. and Coxiella bur...,0.0,Journal Article,2


Only the rainfall variable is used, and dengue is excluded.

In [5]:
# Distinct (disease, variable) pairs found in the dataset, re-indexed from 0.
df_combs = (
    df_sample
    .loc[:, ['disease', 'variable']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_combs

Unnamed: 0,disease,variable
0,cchf,rainfall
1,ebola,rainfall
2,rvf,rainfall
3,lepto,rainfall


Subsetting the data for development vs full runs


## Setup query

In [6]:
def _protocol_entry(topic_name, outcome_subject):
    """Build the query texts for one disease.

    topic_name      -- disease name as it appears in the review topic line.
    outcome_subject -- disease wording used in the "incidence or prevalence"
                       inclusion criterion.

    Returns a dict with the keys 'topic' and 'inclusion'. The inclusion text
    is shared by every disease except for the outcome_subject line.
    """
    return {
        'topic': f'Impact of Climate Change on {topic_name}: A Focus on Rainfall',
        'inclusion': (
            "Primary research or meta-analysis\n"
            " Assesses the relationship between the rainfall and either:\n"
            f" {outcome_subject} incidence or prevalence\n"
            " Pathogen survival\n"
            "Transmission\n"
            "Virulence\n"
            "Demonstrated vector or maintenance host survival, development or distribution"
        ),
    }


# Keyed by "<disease>-<variable>", matching the keys built in the scoring loop.
template_reference = {
    'cchf-rainfall': _protocol_entry('CCHF', 'Crimean-Congo haemorrhagic fever (CCHF)'),
    'ebola-rainfall': _protocol_entry('Ebola', 'Ebola or Marburg'),
    'lepto-rainfall': _protocol_entry('Leptospirosis', 'Leptospirosis'),
    'rvf-rainfall': _protocol_entry('Rift Valley Fever Virus', 'Rift Valley Fever (RVF) virus'),
}

## Set up experiment directory

In [7]:
# Root folder for experiment artefacts; the three sub-folders are derived from it.
exp_root_dir = "../experiments/"
templates_dir, results_root_dir, reports_root_dir = (
    join(exp_root_dir, subfolder)
    for subfolder in ("templates", "results", "reports")
)


## Run the model

In [8]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the default NLTK resources, so the stop-word list is
# read and the stemmer is built once instead of on every preprocess() call.
_PREPROCESS_DEFAULTS = {}


def preprocess(text, stop_words=None, stemmer=None):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case, delete punctuation, split on whitespace,
    drop stop words, stem each remaining word. Punctuation is deleted rather
    than replaced by a space, so "meta-analysis" becomes the single token
    "metaanalysis" before stemming.

    Parameters
    ----------
    text : str
        Text to normalise.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list (loaded once and cached).
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to a shared ``PorterStemmer`` instance.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" when nothing is left).
    """
    if stop_words is None:
        stop_words = _PREPROCESS_DEFAULTS.get('stop_words')
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            _PREPROCESS_DEFAULTS['stop_words'] = stop_words
    if stemmer is None:
        stemmer = _PREPROCESS_DEFAULTS.get('stemmer')
        if stemmer is None:
            stemmer = PorterStemmer()
            _PREPROCESS_DEFAULTS['stemmer'] = stemmer

    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in stop_words
    )

def min_max_normalize(scores):
    """Rescale scores linearly so the smallest maps to 0 and the largest to 1.

    Parameters
    ----------
    scores : sequence of numbers (list or 1-D numpy array)

    Returns
    -------
    list
        One normalised value per input score, in the same order.
        * Empty input returns an empty list.
        * When all scores are equal there is no range to scale by: the result
          is all 1.0 if the common value is non-zero, otherwise all 0.0.
    """
    # len() rather than truthiness so numpy arrays are handled as well.
    if len(scores) == 0:
        return []

    lowest = min(scores)
    highest = max(scores)
    if highest == lowest:  # to avoid division by zero
        constant = 1.0 if highest != 0 else 0.0
        return [constant] * len(scores)

    score_range = highest - lowest
    return [(score - lowest) / score_range for score in scores]
    

In [9]:
exp_start_time = time.time()

# One scored frame per (disease, variable) pair. They are concatenated once
# after the loop: calling pd.concat inside the loop re-copies every previously
# accumulated row on each iteration.
scored_frames = []
for _, crow in df_combs.iterrows():
    disease = crow['disease']
    variable = crow['variable']
    keywords = f"{disease}-{variable}"

    # Grab just the rows for the given disease and variable. A boolean mask is
    # used instead of a string-built query so that values containing quotes
    # cannot break the expression.
    pair_mask = (df_sample['disease'] == disease) & (df_sample['variable'] == variable)
    df_sub = df_sample[pair_mask]
    documents = df_sub['target'].tolist()

    # preprocess() returns space-joined tokens. split() (not split(" ")) keeps
    # corpus tokenisation identical to the query's and avoids a spurious ''
    # token for a document that preprocesses to the empty string.
    tokenized_corpus = [preprocess(doc).split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)

    # The query is the review topic followed by the protocol's inclusion criteria.
    entries = template_reference[keywords]
    query_text = f"{entries['topic']} {entries['inclusion']}"
    tokenized_query = preprocess(query_text).split()

    # Scores are min-max normalised within each (disease, variable) group.
    bm25_scores = bm25.get_scores(tokenized_query)
    normalized_scores = min_max_normalize(bm25_scores)

    df_scores = pd.DataFrame({'score': normalized_scores})
    print(f"Processed {disease}-{variable}: dims for sub = {df_sub.shape} score = {df_scores.shape}")

    # reset_index() (without drop) keeps the original df_sample row label in an
    # 'index' column and gives both frames a 0..n-1 index, so the column-wise
    # concat lines scores up with their documents positionally.
    df_sub = pd.concat([df_sub.reset_index(), df_scores], axis=1)
    scored_frames.append(df_sub)

# Guard the empty case: pd.concat raises on an empty list.
df_results = pd.concat(scored_frames, axis=0) if scored_frames else pd.DataFrame()

elapsed_time = time.time() - exp_start_time
print(f"All completed in {str(datetime.timedelta(seconds=elapsed_time))}.")
print(f"Records count {df_results.shape}")

Processed cchf-rainfall: dims for sub = (454, 8) score = (454, 1)
Processed ebola-rainfall: dims for sub = (915, 8) score = (915, 1)
Processed rvf-rainfall: dims for sub = (537, 8) score = (537, 1)
Processed lepto-rainfall: dims for sub = (999, 8) score = (999, 1)
All completed in 0:00:15.098284.
Records count (2905, 10)


## Process results

### Read the results

### Calculate Ranking

In [10]:
# Label every scored row with the experiment and model identifiers; assign()
# returns a new frame, so df_results itself is left unchanged.
df_comb_results = df_results.assign(
    experiment='h-05-baseline-protocol',
    model='bm25',
)
df_comb_results.head(n=4)

Unnamed: 0,index,tag,disease,variable,target_hash,target,review,reference,id,score,experiment,model
0,0,0.0,cchf,rainfall,1266833a0c9bd183b54db85128f28024,Scientific Opinion on the Role of Tick Vectors...,1.0,Journal Article,0,0.521013,h-05-baseline-protocol,bm25
1,1,0.0,cchf,rainfall,c025b214c7370b24a3776e85f6cd285b,A survey of rift valley fever and associated r...,0.0,Journal Article,1,0.345682,h-05-baseline-protocol,bm25
2,2,0.0,cchf,rainfall,273c3107270be7343e0e43692cde82c3,Occurrence of Rickettsia spp. and Coxiella bur...,0.0,Journal Article,2,0.061553,h-05-baseline-protocol,bm25
3,3,0.0,cchf,rainfall,e0873fc8c6aaff839a4c90eeb31bbf6f,Multisectoral Perspectives on Global Warming a...,0.0,Journal Article,3,0.435973,h-05-baseline-protocol,bm25


In [11]:
# Narrow the combined results to the columns listed below (all rows kept)
# before the ranking step.
selected_columns = ['id', 'experiment', 'model', 'disease', 'variable', 'tag', 'score']
df_rerank = df_comb_results.loc[:,selected_columns]
df_rerank.head(5)

Unnamed: 0,id,experiment,model,disease,variable,tag,score
0,0,h-05-baseline-protocol,bm25,cchf,rainfall,0.0,0.521013
1,1,h-05-baseline-protocol,bm25,cchf,rainfall,0.0,0.345682
2,2,h-05-baseline-protocol,bm25,cchf,rainfall,0.0,0.061553
3,3,h-05-baseline-protocol,bm25,cchf,rainfall,0.0,0.435973
4,4,h-05-baseline-protocol,bm25,cchf,rainfall,0.0,0.064991


In [12]:
# Rank from a fresh column selection of df_comb_results instead of from
# df_rerank itself: the original `df_rerank = f(df_rerank)` form meant that
# re-running this cell fed the already-ranked frame back into the tie-breaker.
# NOTE(review): apply_len_tie_breaker lives in clizod_ranker; judging by the
# output it adds target / target_len / ranking columns -- confirm there.
df_rerank = cr.apply_len_tie_breaker(
    df_sample,
    df_comb_results.loc[:, selected_columns],
    'score',
)
df_rerank.head(5)

Unnamed: 0,id,experiment,model,disease,variable,tag,score,target,target_len,ranking
0,221,h-05-baseline-protocol,bm25,cchf,rainfall,1.0,1.0,Climatic prerequisites for changing activity i...,2414,1
1,19,h-05-baseline-protocol,bm25,cchf,rainfall,1.0,0.884384,Predicting CCHF incidence and its related fact...,1691,2
2,346,h-05-baseline-protocol,bm25,cchf,rainfall,1.0,0.882204,Status of Crimean-Congo haemorrhagic fever vir...,1322,3
3,300,h-05-baseline-protocol,bm25,cchf,rainfall,0.0,0.863895,Climate change and Ixodes tick-borne diseases ...,2105,4
4,80,h-05-baseline-protocol,bm25,cchf,rainfall,1.0,0.82564,Effect of meteorological factors on Hyalomma s...,1978,5


## Review Document

Creating separate files for each experiment

In [13]:
df_review = df_rerank.copy()
print(df_review.shape)

# Columns written to each report file.
report_columns = ['id', 'experiment', 'model', 'disease', 'variable', 'tag', 'score', 'ranking']

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(reports_root_dir, exist_ok=True)

groups = df_review.groupby(["model", "experiment"])
for (model_name, experiment_name), data in groups:
    print(f"{model_name} - {experiment_name}")

    #reporting
    # Write only this group's rows (`data`). Selecting from df_review here
    # would put every model/experiment into every file as soon as more than
    # one group exists.
    df_reporting = data[report_columns].copy()
    report_path = join(reports_root_dir, f'report_{model_name}_{experiment_name}.csv')
    df_reporting.to_csv(report_path, header=True, index=False, encoding='utf-8')

(2905, 10)
bm25 - h-05-baseline-protocol
