# WHO-HealthPropaPhenKG Evaluation

In [1]:
%load_ext autoreload
%autoreload 2

## Introduction

### Standard

In [2]:
import numpy as np
import pandas as pd

## Paths

Download the following files and save them under `../data/groudtruth/` (the location the path cell below reads them from):  
url = "https://covid19.who.int/WHO-COVID-19-global-data.csv"  
url_iso = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"

In [53]:
# Local input files and the output CSV for the COVID-19 relations.
# path_who_data is read with pd.read_csv further down; path_coviddata is the
# to_csv target. path_isodata is not referenced by any other visible cell.
# NOTE(review): the directory is spelled "groudtruth" - presumably this matches
# the folder on disk; confirm before renaming.
path_who_data = "../data/groudtruth/WHO-COVID-19-global-data.csv"
path_isodata = "../data/groudtruth/UID_ISO_FIPS_LookUp_Table.csv"
path_coviddata = "../data/groudtruth/covidToWorld.csv"

In [54]:
# mpox_url is passed directly to pd.read_csv (fetched over the network);
# path_mpoxdata is the to_csv target for the resulting mpox relations.
mpox_url = "https://raw.githubusercontent.com/owid/monkeypox/main/owid-monkeypox-data.csv"
path_mpoxdata = "../data/groudtruth/mpoxToWorld.csv"

In [55]:
path_to_netwoork_gazetteer = '../data/gazetteers/world_gazetteer_en.csv'

### Reading Gazetteer

In [20]:
df_network = pd.read_csv(path_to_netwoork_gazetteer)

In [21]:
# Normalise gazetteer names so they can be joined on: strip double quotes and
# lower-case. The vectorised .str accessor leaves missing names as NaN instead
# of raising AttributeError the way a per-element str lambda does.
df_network['Name'] = (
    df_network['Name']
    .str.replace('"', '', regex=False)
    .str.lower()
)

In [22]:
# Remove the stray 'Unnamed: 0' column (presumably the index written when the
# gazetteer CSV was saved). errors='ignore' keeps this cell re-runnable after
# the column has already been removed.
df_network = df_network.drop(columns=['Unnamed: 0'], errors='ignore')

## Reading WHO COVID data

In [56]:
data = pd.read_csv(path_who_data)

In [57]:
# Keep only rows with at least one cumulative case. The explicit .copy() makes
# `data` an independent frame, so the in-place edits in the following cells do
# not trigger pandas' SettingWithCopyWarning.
data = data[data["Cumulative_cases"] > 0].copy()

In [58]:
# Rewrite the 'Viet Nam' spelling to 'Vietnam' before the name-based merge
# (presumably the gazetteer lists the country as 'Vietnam' - confirm).
data.loc[data["Country"] == 'Viet Nam',"Country"] = "Vietnam"

In [59]:
data

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
8,2020-03-01,AF,Afghanistan,EMRO,1.0,1,,0
9,2020-03-08,AF,Afghanistan,EMRO,,1,,0
10,2020-03-15,AF,Afghanistan,EMRO,6.0,7,,0
11,2020-03-22,AF,Afghanistan,EMRO,17.0,24,,0
12,2020-03-29,AF,Afghanistan,EMRO,67.0,91,2.0,2
...,...,...,...,...,...,...,...,...
52315,2024-02-04,ZW,Zimbabwe,AFRO,31.0,266319,,5737
52316,2024-02-11,ZW,Zimbabwe,AFRO,9.0,266328,,5737
52317,2024-02-18,ZW,Zimbabwe,AFRO,22.0,266350,,5737
52318,2024-02-25,ZW,Zimbabwe,AFRO,5.0,266355,2.0,5739


In [60]:
# Strip double quotes and lower-case the country names so they compare equal to
# the normalised gazetteer names. assign() builds a new frame rather than
# writing into a filtered slice, and the .str accessor tolerates missing values
# where a per-element str lambda would raise AttributeError.
data = data.assign(
    Country=data["Country"].str.replace('"', '', regex=False).str.lower()
)

In [61]:
# Discard the WHO columns that are not needed for the relation file.
unused_who_columns = [
    'Country_code',
    'WHO_region',
    'New_cases',
    'New_deaths',
    'Cumulative_deaths',
]
data = data.drop(columns=unused_who_columns)

In [62]:
data

Unnamed: 0,Date_reported,Country,Cumulative_cases
8,2020-03-01,afghanistan,1
9,2020-03-08,afghanistan,1
10,2020-03-15,afghanistan,7
11,2020-03-22,afghanistan,24
12,2020-03-29,afghanistan,91
...,...,...,...
52315,2024-02-04,zimbabwe,266319
52316,2024-02-11,zimbabwe,266328
52317,2024-02-18,zimbabwe,266350
52318,2024-02-25,zimbabwe,266355


In [63]:
# Inner-join the case counts with the gazetteer on the normalised country name,
# then discard both name columns.
maindata = (
    data
    .merge(df_network, left_on="Country", right_on="Name")
    .drop(columns=['Name', 'Country'])
)

In [64]:
# Add the two constant columns to every row (a scalar is broadcast to the full
# length of the frame), then rename the "ID" column to "ID:END_ID".
covid_relation_columns = {'CUI:START_ID': 'C5203670', ':TYPE': "isReported"}
maindata = (
    maindata
    .assign(**covid_relation_columns)
    .rename(columns={"ID": "ID:END_ID"})
)

In [66]:
# keep=False removes every row that has an exact duplicate, including its first
# occurrence, so duplicated rows disappear entirely instead of being collapsed.
# NOTE(review): if the goal is to collapse duplicates to a single row, the
# default keep='first' does that - confirm which behaviour is intended.
maindata = maindata.drop_duplicates(keep=False)

In [68]:
# Write the COVID-19 relations to disk. to_csv is called with its defaults, so
# the frame's index is written as a leading unnamed column.
maindata.to_csv(path_coviddata)

## Reading WHO Mpox data

In [71]:
# Load the mpox CSV straight from its URL (requires network access).
mpoxdata = pd.read_csv(mpox_url, sep=',', encoding='utf-8')

In [72]:
mpoxdata.head()

Unnamed: 0,location,iso_code,date,total_cases,total_deaths,new_cases,new_deaths,new_cases_smoothed,new_deaths_smoothed,new_cases_per_million,total_cases_per_million,new_cases_smoothed_per_million,new_deaths_per_million,total_deaths_per_million,new_deaths_smoothed_per_million
0,Africa,OWID_AFR,2022-05-01,27.0,2.0,0.0,0.0,0.29,0.0,0.0,0.019,0.0,0.0,0.0014,0.0
1,Africa,OWID_AFR,2022-05-02,27.0,2.0,0.0,0.0,0.29,0.0,0.0,0.019,0.0,0.0,0.0014,0.0
2,Africa,OWID_AFR,2022-05-03,27.0,2.0,0.0,0.0,0.29,0.0,0.0,0.019,0.0,0.0,0.0014,0.0
3,Africa,OWID_AFR,2022-05-04,27.0,2.0,0.0,0.0,0.29,0.0,0.0,0.019,0.0,0.0,0.0014,0.0
4,Africa,OWID_AFR,2022-05-05,27.0,2.0,0.0,0.0,0.29,0.0,0.0,0.019,0.0,0.0,0.0014,0.0


In [73]:
def monkeypoxDataEnhancement(data):
    """Expand mpox case data to one row per (date, location) pair.

    Dates are visited in ascending order. For every date, each location that
    occurs anywhere in ``data`` gets a row holding the ``total_cases`` value
    from its most recent row on or before that date; a location with no row
    yet gets 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``location``, ``date`` and ``total_cases``.
        Any other columns are ignored. ``date`` values are expected to sort
        chronologically (e.g. ISO ``YYYY-MM-DD`` strings).

    Returns
    -------
    pandas.DataFrame
        Columns ``location``, ``date`` and ``total_cases``; rows are ordered by
        date and, within a date, by first appearance of the location in
        ``data``.
    """
    # Select the three needed columns instead of dropping a hard-coded list of
    # the others, so a change in the unrelated columns cannot raise KeyError.
    maindata = data[['location', 'date', 'total_cases']]
    # Latest known count per location, keyed in order of first appearance.
    latest_cases = dict.fromkeys(maindata['location'].unique().tolist(), 0)
    locations = []
    date_list = []
    total_cases = []
    # One pass over the frame: groupby yields the dates in sorted order and
    # keeps the original row order inside each date.
    for date, rows in maindata.groupby('date', sort=True):
        # A later row for the same location on the same date wins. A missing
        # total_cases value is stored as-is, not skipped.
        latest_cases.update(zip(rows['location'], rows['total_cases']))
        # Emit a snapshot of every location for this date.
        locations.extend(latest_cases.keys())
        total_cases.extend(latest_cases.values())
        date_list.extend([date] * len(latest_cases))
    return pd.DataFrame(
        data={'location': locations, 'date': date_list, 'total_cases': total_cases}
    )

In [74]:
# Build the per-date, per-location table and keep only rows with reported cases.
# .copy() detaches the filtered result so the column edits below work on an
# independent frame and do not trigger SettingWithCopyWarning.
mainmpoxdata = monkeypoxDataEnhancement(mpoxdata)
mainmpoxdata = mainmpoxdata[mainmpoxdata['total_cases'] > 0].copy()
# Strip double quotes and lower-case the names; the .str accessor tolerates
# missing values where a per-element str lambda would raise AttributeError.
mainmpoxdata["location"] = (
    mainmpoxdata["location"]
    .str.replace('"', '', regex=False)
    .str.lower()
)
# Rewrite 'united states' to the longer name before the name-based merge
# (presumably the spelling used by the gazetteer - confirm).
mainmpoxdata.loc[mainmpoxdata["location"] == 'united states', "location"] = "united states of america"

In [75]:
mainmpoxdata.shape

(73782, 3)

In [76]:
# Inner-join the mpox counts with the gazetteer on the normalised location
# name, then discard both name columns.
mainmpoxdataplus = (
    mainmpoxdata
    .merge(df_network, left_on="location", right_on="Name")
    .drop(columns=['Name', 'location'])
)

In [77]:
# Add the two constant columns to every row (a scalar is broadcast to the full
# length of the frame), then rename the "ID" column to "ID:END_ID".
mpox_relation_columns = {'CUI:START_ID': 'C0276180', ':TYPE': "isReported"}
mainmpoxdataplus = (
    mainmpoxdataplus
    .assign(**mpox_relation_columns)
    .rename(columns={"ID": "ID:END_ID"})
)

In [78]:
# keep=False removes every row that has an exact duplicate, including its first
# occurrence, so duplicated rows disappear entirely instead of being collapsed.
# NOTE(review): if the goal is to collapse duplicates to a single row, the
# default keep='first' does that - confirm which behaviour is intended.
mainmpoxdataplus = mainmpoxdataplus.drop_duplicates(keep=False)

In [79]:
mainmpoxdataplus

Unnamed: 0,date,total_cases,ID:END_ID,CUI:START_ID,:TYPE
0,2022-05-01,27.0,wkg:5039605589,C0276180,isReported
1,2022-05-01,27.0,wkg:5829877885,C0276180,isReported
4,2022-05-02,27.0,wkg:5039605589,C0276180,isReported
5,2022-05-02,27.0,wkg:5829877885,C0276180,isReported
8,2022-05-03,27.0,wkg:5039605589,C0276180,isReported
...,...,...,...,...,...
229765,2024-03-04,13.0,wkg:432424952,C0276180,isReported
229766,2024-03-05,13.0,wkg:432424952,C0276180,isReported
229767,2024-03-06,13.0,wkg:432424952,C0276180,isReported
229768,2024-03-07,13.0,wkg:432424952,C0276180,isReported


In [80]:
# Write the mpox relations to disk. to_csv is called with its defaults, so the
# frame's index is written as a leading unnamed column.
mainmpoxdataplus.to_csv(path_mpoxdata)