#### GROUND TRUTH EXPLORATION & GROUND TRUTH DATASET DEVELOPMENT

This notebook explores the ground-truth dataset of extracted parameters from the epireview GitHub repository: https://github.com/mrc-ide/epireview

The notebook performs data wrangling to get a ground truth of delay parameters for comparison to the LLM output.

In [2]:
import os
import pandas as pd

In [15]:
# Load the two ground-truth tables: extracted parameters and article metadata.
# The paths are built with os.path.join so the notebook also runs on
# macOS/Linux, where a hard-coded '\\' is not treated as a path separator.
parameters_df = pd.read_csv(os.path.join('data', 'ground_truth', 'marburg_parameter.csv'))
articles_df = pd.read_csv(os.path.join('data', 'ground_truth', 'marburg_article.csv'))

In [10]:
# Preview the identifier columns of the article table, ordered by covidence_id.
(
    articles_df[['covidence_id', 'article_id', 'doi']]
    .sort_values('covidence_id')
    .reset_index(drop=True)
    .head(5)
)

Unnamed: 0,covidence_id,article_id,doi
0,1483,36,10.1016/j.meegid.2016.06.014
1,1483,29,10.1016/j.meegid.2016.06.014
2,1595,39,
3,1595,47,
4,1613,59,https://doi.org/10.1128/JVI.00069-06


In [16]:
# Show the two covidence_id columns of the parameter table side by side,
# ordered by covidence_id.x.
(
    parameters_df[['covidence_id.x', 'covidence_id.y']]
    .sort_values('covidence_id.x')
    .reset_index(drop=True)
    .head(5)
)

Unnamed: 0,covidence_id.x,covidence_id.y
0,1483,1483
1,1595,1595
2,1615,1615
3,1649,1649
4,1649,1649


In [37]:
# Reload the article table so this cell starts from the raw file each time it
# is run. os.path.join keeps the path portable: a hard-coded '\\' separator
# only resolves on Windows.
articles_df = pd.read_csv(os.path.join('data', 'ground_truth', 'marburg_article.csv'))

# There is one article entry with no title or year of publication.
# It is shown with display() because a bare expression in the middle of a
# cell is evaluated but never rendered.
display(articles_df[articles_df['article_title'].isnull()])

# Drop rows where article title is NaN. The explicit .copy() makes the result
# an independent frame, guarding the column assignment below against
# SettingWithCopyWarning.
articles_df = articles_df.dropna(subset=['article_title']).copy()

# Convert the publication year to an integer (going through float first) so it
# formats without a trailing ".0".
articles_df['year_publication'] = articles_df['year_publication'].astype('float').astype('int')

# Apply function for creating article_label column
def create_article_label(row):
    """Return the article label "<first_author_surname>_<year_publication>" for one row.

    `row` is any object indexable by column name (in this notebook, a
    DataFrame row supplied by `DataFrame.apply(..., axis=1)`). Its
    'first_author_surname' and 'year_publication' entries are formatted and
    joined with an underscore.
    """
    surname = row['first_author_surname']
    year = row['year_publication']
    return "{}_{}".format(surname, year)

# Label every article row using create_article_label.
articles_df['article_label'] = articles_df.apply(create_article_label, axis=1)

# Keep only the articles whose label is in the list of delay-parameter papers.
delay_labels = ['Martini_1973', 'Ajelli_2012', 'Bausch_2006', 'Nyakarahuka_2016', 'Knust_2015']
has_delay_label = articles_df['article_label'].isin(delay_labels)
delay_articles = articles_df[has_delay_label]

display(delay_articles[['article_id', 'covidence_id', 'article_label']])

Unnamed: 0,article_id,covidence_id,article_label
13,15,1931,Nyakarahuka_2016
14,17,1931,Nyakarahuka_2016
15,20,2060,Martini_1973
22,27,2886,Ajelli_2012
37,42,2241,Knust_2015
51,57,2819,Bausch_2006


In [43]:
# Reload the parameter table so this cell starts from the raw file each time
# it is run. os.path.join keeps the path portable: a hard-coded '\\' separator
# only resolves on Windows.
parameters_df = pd.read_csv(os.path.join('data', 'ground_truth', 'marburg_parameter.csv'))

# Columns kept for the ground truth: the article key, the parameter type,
# its value/unit/bounds, and the uncertainty fields.
# NOTE: 'parameter_uncertainty_singe_type' is spelled the way the column is
# named in the loaded table; "correcting" it would raise a KeyError.
columns = ['covidence_id.x', 'parameter_type', 'parameter_value', 'parameter_unit',
           'parameter_lower_bound', 'parameter_upper_bound',
           'parameter_value_type', 'parameter_uncertainty_single_value',
           'parameter_uncertainty_singe_type', 'parameter_uncertainty_lower_value',
           'parameter_uncertainty_upper_value', 'parameter_uncertainty_type']
parameters_df = parameters_df[columns]

# Rename the key so it matches the 'covidence_id' column of the article table.
parameters_df = parameters_df.rename(columns={'covidence_id.x': 'covidence_id'})
display(parameters_df.head())

Unnamed: 0,covidence_id,parameter_type,parameter_value,parameter_unit,parameter_lower_bound,parameter_upper_bound,parameter_value_type,parameter_uncertainty_single_value,parameter_uncertainty_singe_type,parameter_uncertainty_lower_value,parameter_uncertainty_upper_value,parameter_uncertainty_type
0,3795,Human delay - time symptom to careseeking,4.0,Days,,,Other,,,,,
1,3795,Human delay - incubation period,,Days,7.0,8.0,,,,7.0,8.0,Range
2,3795,Human delay - time symptom to outcome,9.0,Days,,,,,,,,
3,2597,Seroprevalence - IFA,,,,,,,,,,
4,2762,Attack rate,21.0,Percentage (%),,,,,,11.0,34.0,CI95%


In [47]:
# Merging the delay articles with their extracted parameters on covidence_id.
# delay_articles can hold the same covidence_id on several rows (one per
# article_id), and each repeated key would repeat every matching parameter row
# in the merge result, so the key/label pairs are de-duplicated first.
article_keys = delay_articles[['covidence_id', 'article_label']].drop_duplicates()
ground_truth = pd.merge(article_keys, parameters_df, how='left', on='covidence_id')

# Filtering for only delay parameters (case-insensitive match on
# parameter_type). Articles with no matching parameter row have a NaN
# parameter_type after the left merge and are excluded by na=False.
ground_truth = ground_truth[ground_truth['parameter_type'].str.contains(r'delay', case=False, na=False)].reset_index(drop=True)
ground_truth

Unnamed: 0,covidence_id,article_label,parameter_type,parameter_value,parameter_unit,parameter_lower_bound,parameter_upper_bound,parameter_value_type,parameter_uncertainty_single_value,parameter_uncertainty_singe_type,parameter_uncertainty_lower_value,parameter_uncertainty_upper_value,parameter_uncertainty_type
0,2060,Martini_1973,Human delay - incubation period,,Days,4.0,7.0,,,,4.0,7.0,Range
1,2886,Ajelli_2012,Human delay - generation time,9.0,Days,,,Mean,,,8.2,10.0,CI95%
2,2886,Ajelli_2012,Human delay - generation time,5.4,Days,,,Standard Deviation,,,3.9,8.6,CI95%
3,2886,Ajelli_2012,Human delay - time symptom to outcome,7.0,Days,,,Median,,,5.0,9.0,Range
4,2886,Ajelli_2012,Human delay - generation time,9.3,Days,,,Mean,,,3.7,14.6,CI95%
5,2886,Ajelli_2012,Human delay - time symptom to outcome,9.0,Days,,,Median,,,0.0,56.0,Range
6,2241,Knust_2015,Human delay - time symptom to outcome,9.0,Days,6.5,9.0,Mean,,,6.5,9.0,Range
7,2241,Knust_2015,Human delay - time symptom to careseeking,4.0,Days,,,Mean,,,,,
8,2241,Knust_2015,Human delay - time symptom to outcome,22.0,Days,16.0,30.0,Mean,,,16.0,30.0,Range
9,2241,Knust_2015,Human delay - time in care,14.3,Days,4.0,22.0,Mean,,,4.0,22.0,Range
