In [18]:
import pandas as pd
import numpy as np

import pycountry_convert as pc

## Covid 19

In [5]:
raw_covid_data = pd.read_csv("/Pandemic-Database/Covid 19/COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_State_Timeseries__RAW_.csv")

geo_column = ['state']

date_column = ['date']

case_column = ['total_adult_patients_hospitalized_confirmed_covid','total_adult_patients_hospitalized_confirmed_and_suspected_covid']

clean_covid_data = raw_covid_data[geo_column + date_column + case_column]

clean_covid_data.insert(0,'Region','United States')

clean_covid_data.insert(0,'Virus Species','SARS-CoV-2')

clean_covid_data.insert(0,'Virus Genus','Betacoronavirus')

clean_covid_data.insert(0,'Virus Family','Coronaviridae')

clean_covid_data = clean_covid_data.rename(columns = {"state": "Sub-region",
                                                      "date": "Date",
                                                      "total_adult_patients_hospitalized_confirmed_covid": "Confirmed", 
                                                      "total_adult_patients_hospitalized_confirmed_and_suspected_covid": "Suspected"})

clean_covid_data['Suspected'] = clean_covid_data['Suspected'] - clean_covid_data['Confirmed']

clean_covid_data = pd.melt(clean_covid_data, 
                           id_vars=['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date'], 
                           value_vars = ['Confirmed','Suspected'])

clean_covid_data = clean_covid_data.rename(columns = {"variable": "Type",
                                                      "value": "Case Number"})

clean_covid_data = clean_covid_data.assign(Comments = np.nan)

clean_covid_data = clean_covid_data.sort_values(by = ["Region","Sub-region","Date"])

clean_covid_data = clean_covid_data.dropna(subset=['Case Number'])

clean_covid_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
18118,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,AK,2020/07/15,Confirmed,11.0,
82064,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,AK,2020/07/15,Suspected,17.0,
14471,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,AK,2020/07/16,Confirmed,14.0,
78417,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,AK,2020/07/16,Suspected,17.0,
15523,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,AK,2020/07/17,Confirmed,13.0,
...,...,...,...,...,...,...,...,...,...
77086,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,WY,2023/06/01,Suspected,0.0,
21417,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,WY,2023/06/02,Confirmed,3.0,
85363,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,WY,2023/06/02,Suspected,2.0,
16425,Coronaviridae,Betacoronavirus,SARS-CoV-2,United States,WY,2023/06/03,Confirmed,3.0,


In [6]:
clean_covid_data.to_csv('/Pandemic-Database/Covid 19/Covid-19_Clean_Data.csv',
                       index=False)

## SARS

In [64]:
SARS_data = pd.read_csv('/Users/alex/Documents/GitHub/Past-Pandemic-Metadata/SARS/sars_2003_complete_dataset_clean.csv')

SARS_data = SARS_data.rename(columns= {'Country':'Sub-region',
                              'Cumulative number of case(s)':'Cases',
                              'Number of deaths':'Deaths',
                              'Number recovered':'Recovered'})

SARS_data = pd.melt(SARS_data,
                    id_vars=['Sub-region','Date'],
                    var_name='Type',
                    value_vars=['Cases','Deaths','Recovered'],
                    value_name='Case Number')

SARS_data = SARS_data.assign(Region = np.nan)

SARS_data = SARS_data.assign(Virus_Family = "Coronaviridae")

SARS_data = SARS_data.assign(Virus_Genus = "Betacoronavirus")

SARS_data = SARS_data.assign(Virus_Species = "SARS-CoV-1")

SARS_data = SARS_data.assign(Comments = np.nan)

SARS_data = SARS_data.rename(columns= {'Virus_Family':'Virus Family',
                              'Virus_Genus':'Virus Genus',
                              'Virus_Species':'Virus Species'})

SARS_data = SARS_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]

Continent = []

Unknown_regions = ['Hong Kong SAR, China','Taiwan, China','Republic of Ireland','Republic of Korea','Macao SAR, China']

for index, row in SARS_data.iterrows():
    if row['Sub-region'] not in Unknown_regions:
        code = pc.country_name_to_country_alpha2(row['Sub-region'])
        Continent.append(pc.convert_continent_code_to_continent_name(pc.country_alpha2_to_continent_code(code)))
    else:
        Continent.append(np.nan)
    
SARS_data['Region'] = Continent

SARS_data.loc[SARS_data['Sub-region'] == 'Hong Kong SAR, China', 'Region'] = 'Asia'
SARS_data.loc[SARS_data['Sub-region'] == 'Taiwan, China', 'Region'] = 'Asia'
SARS_data.loc[SARS_data['Sub-region'] == 'Republic of Ireland', 'Region'] = 'Europe'
SARS_data.loc[SARS_data['Sub-region'] == 'Republic of Korea', 'Region'] = 'Asia'
SARS_data.loc[SARS_data['Sub-region'] == 'Macao SAR, China', 'Region'] = 'Asia'

SARS_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Coronaviridae,Betacoronavirus,SARS-CoV-1,Europe,Germany,2003-03-17,Cases,1,
1,Coronaviridae,Betacoronavirus,SARS-CoV-1,North America,Canada,2003-03-17,Cases,8,
2,Coronaviridae,Betacoronavirus,SARS-CoV-1,Asia,Singapore,2003-03-17,Cases,20,
3,Coronaviridae,Betacoronavirus,SARS-CoV-1,Asia,"Hong Kong SAR, China",2003-03-17,Cases,95,
4,Coronaviridae,Betacoronavirus,SARS-CoV-1,Europe,Switzerland,2003-03-17,Cases,2,
...,...,...,...,...,...,...,...,...,...
7609,Coronaviridae,Betacoronavirus,SARS-CoV-1,Europe,Switzerland,2003-07-11,Recovered,1,
7610,Coronaviridae,Betacoronavirus,SARS-CoV-1,Asia,Thailand,2003-07-11,Recovered,7,
7611,Coronaviridae,Betacoronavirus,SARS-CoV-1,Europe,United Kingdom,2003-07-11,Recovered,4,
7612,Coronaviridae,Betacoronavirus,SARS-CoV-1,North America,United States,2003-07-11,Recovered,67,


In [66]:
SARS_data.to_csv('/Users/alex/Documents/GitHub/Past-Pandemic-Metadata/SARS/Clean_SARS_Data.csv',
                index = False)

## Dengue Fever

## Ebola

In [8]:
pd.read_excel('/Pandemic-Database/Ebola/Raw_Data/CDC-counts.xlsx')

Unnamed: 0,WHO report date,"Total Cases, Guinea","Total Deaths, Guinea","Total Cases, Liberia","Total Deaths, Liberia","Total Cases, Sierra Leone","Total Deaths, Sierra Leone",Total Cases,Total Deaths
0,2016-04-13,3814,2544,10678,4810,14124,3956,28616,11310
1,2016-03-30,3811,2543,10675,4809,14124,3956,28610,11308
2,2016-03-23,3809,2540,10675,4809,14124,3956,28608,11305
3,2016-03-03,3804,2536,10675,4809,14124,3956,28603,11301
4,2016-02-17,3804,2536,10675,4809,14124,3956,28603,11301
...,...,...,...,...,...,...,...,...,...
260,2014-04-01,122,80,8,2,0,0,130,82
261,2014-03-31,112,70,8,6,0,0,120,76
262,2014-03-27,103,66,8,6,6,5,117,77
263,2014-03-26,86,60,0,0,0,0,86,60


In [11]:
early_cdc_data = pd.read_excel('/Pandemic-Database/Ebola/Raw_Data/CDC-counts.xlsx')

early_cdc_data = early_cdc_data.rename(columns={"Total Cases, Guinea": "Confirmed, Guinea",
                                                'Total Deaths, Guinea': "Deaths, Guinea",
                                                'Total Cases, Liberia': "Confirmed, Liberia",
                                                'Total Deaths, Liberia': "Deaths, Liberia",
                                                'Total Cases, Sierra Leone': 'Confirmed, Sierra Leone',
                                                'Total Deaths, Sierra Leone': 'Deaths, Sierra Leone'
                                               })

early_cdc_data = pd.melt(early_cdc_data, 
                         id_vars=['WHO report date','Total Cases','Total Deaths'], 
                         value_vars = ['Confirmed, Guinea','Deaths, Guinea','Confirmed, Liberia','Deaths, Liberia','Confirmed, Sierra Leone','Deaths, Sierra Leone'])

early_cdc_data[['Type','Sub-region']] = early_cdc_data['variable'].str.split(",",expand=True)

early_cdc_data = early_cdc_data.assign(Region = "Africa")

early_cdc_data = early_cdc_data.assign(Virus_Family = "Filoviridae")

early_cdc_data = early_cdc_data.assign(Virus_Genus = "Ebolavirus")

early_cdc_data = early_cdc_data.assign(Virus_Species = "Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bundibugyo Virus")

early_cdc_data = early_cdc_data.assign(Comments = np.nan)

early_cdc_data['WHO report date'] = pd.to_datetime(early_cdc_data['WHO report date'])

early_cdc_data = early_cdc_data.rename(columns={"WHO report date":"Date",
                                               "value": "Case Number",
                                               'Virus_Family':'Virus Family',
                                               'Virus_Genus':'Virus Genus',
                                               'Virus_Species': 'Virus Species'})

early_cdc_data = early_cdc_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]

early_cdc_data = early_cdc_data.sort_values(by = ["Region","Sub-region","Date"])

early_cdc_data = early_cdc_data.dropna(subset=['Case Number'])

early_cdc_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
264,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-03-25,Confirmed,86,
529,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-03-25,Deaths,59,
263,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-03-26,Confirmed,86,
528,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-03-26,Deaths,60,
262,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-03-27,Confirmed,103,
...,...,...,...,...,...,...,...,...,...
1327,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2016-03-23,Deaths,3956,
1061,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2016-03-30,Confirmed,14124,
1326,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2016-03-30,Deaths,3956,
1060,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2016-04-13,Confirmed,14124,


In [12]:
early_cdc_data.to_csv('/Pandemic-Database/Ebola/Ebola_CDC_Confirmed_and_Death_Clean_Data.csv',
                      index=False)

In [15]:
late_guinea_data = pd.read_csv('/Pandemic-Database/Ebola/Processed_Data/guinea_full_data.csv',
                              index_col = "Unnamed: 0")

late_liberia_data = pd.read_csv('/Pandemic-Database/Ebola/Processed_Data/liberia_full_data.csv',
                               index_col = "Unnamed: 0")

late_sierraleone_data = pd.read_csv('/Pandemic-Database/Ebola/Processed_Data/sierraleone_full_data.csv',
                                   index_col = "Unnamed: 0")

late_data = late_guinea_data.append(late_liberia_data).append(late_sierraleone_data)

late_suspected_data = late_data[late_data['Case definition'] == ' Suspected']

late_suspected_data = late_suspected_data.reset_index(drop = True)

## There is a space in front of the case definition
late_suspected_data['Case definition'] = late_suspected_data['Case definition'].str[1:]

late_suspected_data = late_suspected_data.rename(columns={'Country':'Sub-region',
                                                          'Data as of': 'Date',
                                                          'Case definition': 'Type',
                                                          'Number of cases Cumulative': 'Case Number'})

late_suspected_data = late_suspected_data[late_suspected_data['Case Number'] != 'Not reported.']

late_suspected_data = late_suspected_data.assign(Region = "Africa")

late_suspected_data = late_suspected_data.assign(Virus_Family = "Filoviridae")

late_suspected_data = late_suspected_data.assign(Virus_Genus = "Ebolavirus")

late_suspected_data = late_suspected_data.assign(Virus_Species = "Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bundibugyo Virus")

late_suspected_data = late_suspected_data.assign(Comments = np.nan)

late_suspected_data = late_suspected_data.rename(columns={'Virus_Family':'Virus Family',
                                                          'Virus_Genus': 'Virus Genus',
                                                          'Virus_Species': 'Virus Species'})

late_suspected_data = late_suspected_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]

late_suspected_data = late_suspected_data.dropna(subset=['Case Number'])

late_suspected_data = late_suspected_data.drop_duplicates()

late_suspected_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-11-25,Suspected,21,
1,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-11-28,Suspected,24,
2,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-11-30,Suspected,25,
4,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-12-02,Suspected,27,
5,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Guinea,2014-12-03,Suspected,21,
...,...,...,...,...,...,...,...,...,...
981,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2015-11-07,Suspected,5131,
1038,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2016-01-17,Suspected,0,
1039,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2016-01-24,Suspected,0,
1040,Filoviridae,Ebolavirus,Ebola Virus/ Sudan Virus/ Tai Forest Virus/ Bu...,Africa,Sierra Leone,2016-01-31,Suspected,0,


In [16]:
late_suspected_data.to_csv('/Pandemic-Database/Ebola/Ebola_Suspected_Clean_Data.csv',
                           index=False)

## Influenza

### ILINet

In [61]:
influenza_ilinet_data = pd.read_csv('/Pandemic-Database/Influenza/ILINet.csv',
                                    skiprows=1)
influenza_ilinet_data.head()

Unnamed: 0,REGION TYPE,REGION,YEAR,WEEK,% WEIGHTED ILI,%UNWEIGHTED ILI,AGE 0-4,AGE 25-49,AGE 25-64,AGE 5-24,AGE 50-64,AGE 65,ILITOTAL,NUM. OF PROVIDERS,TOTAL PATIENTS
0,National,X,1997,40,1.10148,1.21686,179,X,157,205,X,29,570,192,46842
1,National,X,1997,41,1.20007,1.28064,199,X,151,242,X,23,615,191,48023
2,National,X,1997,42,1.37876,1.23906,228,X,153,266,X,34,681,219,54961
3,National,X,1997,43,1.1992,1.14473,188,X,193,236,X,36,653,213,57044
4,National,X,1997,44,1.65618,1.26112,217,X,162,280,X,41,700,213,55506


In [62]:
influenza_weighted_ilinet_data = influenza_ilinet_data[['YEAR','WEEK','% WEIGHTED ILI']]

dates = influenza_weighted_ilinet_data['YEAR']*100 + influenza_weighted_ilinet_data['WEEK']
dates = pd.to_datetime(dates.astype(str) + '0', format = '%Y%W%w')

influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Date = dates)
influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Virus_Family = 'Orthomyxovididae')
influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Virus_Genus = 'Influenza Viruses')
influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Virus_Species = np.nan)
influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Region = 'United States')
influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Subregion = np.nan)
influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Comments = np.nan)
influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.assign(Type = '% Weighted ILI')

influenza_weighted_ilinet_data = influenza_weighted_ilinet_data.rename(columns={'Subregion':'Sub-region',
                                                                               '% WEIGHTED ILI': 'Case Number',
                                                                               'Virus_Family': 'Virus Family',
                                                                               'Virus_Genus': 'Virus Genus',
                                                                               'Virus_Species': 'Virus Species'})

influenza_weighted_ilinet_data = influenza_weighted_ilinet_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]

influenza_weighted_ilinet_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,Influenza Viruses,,United States,,1997-10-12,% Weighted ILI,1.10148,
1,Orthomyxovididae,Influenza Viruses,,United States,,1997-10-19,% Weighted ILI,1.20007,
2,Orthomyxovididae,Influenza Viruses,,United States,,1997-10-26,% Weighted ILI,1.37876,
3,Orthomyxovididae,Influenza Viruses,,United States,,1997-11-02,% Weighted ILI,1.19920,
4,Orthomyxovididae,Influenza Viruses,,United States,,1997-11-09,% Weighted ILI,1.65618,
...,...,...,...,...,...,...,...,...,...
1342,Orthomyxovididae,Influenza Viruses,,United States,,2023-06-25,% Weighted ILI,1.37277,
1343,Orthomyxovididae,Influenza Viruses,,United States,,2023-07-02,% Weighted ILI,1.28216,
1344,Orthomyxovididae,Influenza Viruses,,United States,,2023-07-09,% Weighted ILI,1.23907,
1345,Orthomyxovididae,Influenza Viruses,,United States,,2023-07-16,% Weighted ILI,1.16642,


In [63]:
influenza_weighted_ilinet_data.to_csv('/Pandemic-Database/Influenza/Processed Data/ILINET_influenza_weighted_ili_data_clean.csv')

In [64]:
influenza_ili_total_data = influenza_ilinet_data[['YEAR','WEEK','ILITOTAL']]

dates = influenza_ili_total_data['YEAR']*100 + influenza_ili_total_data['WEEK']
dates = pd.to_datetime(dates.astype(str) + '0', format = '%Y%W%w')

influenza_ili_total_data = influenza_ili_total_data.assign(Date = dates)
influenza_ili_total_data = influenza_ili_total_data.assign(Virus_Family = 'Orthomyxovididae')
influenza_ili_total_data = influenza_ili_total_data.assign(Virus_Genus = 'Influenza Viruses')
influenza_ili_total_data = influenza_ili_total_data.assign(Virus_Species = np.nan)
influenza_ili_total_data = influenza_ili_total_data.assign(Region = 'United States')
influenza_ili_total_data = influenza_ili_total_data.assign(Subregion = np.nan)
influenza_ili_total_data = influenza_ili_total_data.assign(Comments = np.nan)
influenza_ili_total_data = influenza_ili_total_data.assign(Type = 'Influenza Like Illness')

influenza_ili_total_data = influenza_ili_total_data.rename(columns={'Subregion':'Sub-region',
                                                                    'ILITOTAL': 'Case Number',
                                                                    'Virus_Family':'Virus Family',
                                                                    'Virus_Genus':'Virus Genus',
                                                                    'Virus_Species':'Virus Species'})

influenza_ili_total_data = influenza_ili_total_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]

influenza_ili_total_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,Influenza Viruses,,United States,,1997-10-12,Influenza Like Illness,570,
1,Orthomyxovididae,Influenza Viruses,,United States,,1997-10-19,Influenza Like Illness,615,
2,Orthomyxovididae,Influenza Viruses,,United States,,1997-10-26,Influenza Like Illness,681,
3,Orthomyxovididae,Influenza Viruses,,United States,,1997-11-02,Influenza Like Illness,653,
4,Orthomyxovididae,Influenza Viruses,,United States,,1997-11-09,Influenza Like Illness,700,
...,...,...,...,...,...,...,...,...,...
1342,Orthomyxovididae,Influenza Viruses,,United States,,2023-06-25,Influenza Like Illness,30869,
1343,Orthomyxovididae,Influenza Viruses,,United States,,2023-07-02,Influenza Like Illness,28473,
1344,Orthomyxovididae,Influenza Viruses,,United States,,2023-07-09,Influenza Like Illness,26596,
1345,Orthomyxovididae,Influenza Viruses,,United States,,2023-07-16,Influenza Like Illness,25819,


In [65]:
influenza_ili_total_data.to_csv('/Pandemic-Database/Influenza/Processed Data/ILINet_influenza_total_ili_data_clean.csv',
                               index = False)

### WHO Data

#### Data prior to 2015

In [66]:
who_data_prior_2015 = pd.read_csv("/Pandemic-Database/Influenza/WHO_NREVSS_Combined_prior_to_2015_16.csv",
                                 skiprows=1)

dates = who_data_prior_2015['YEAR']*100 + who_data_prior_2015['WEEK']
dates = pd.to_datetime(dates.astype(str) + '0', format = '%Y%W%w')

who_data_prior_2015 = who_data_prior_2015.assign(Date = dates)

who_data_prior_2015

Unnamed: 0,REGION TYPE,REGION,YEAR,WEEK,TOTAL SPECIMENS,PERCENT POSITIVE,A (2009 H1N1),A (H1),A (H3),A (Subtyping not Performed),A (Unable to Subtype),B,H3N2v,Date
0,National,X,1997,40,1291,0.000000,0,0,0,0,0,0,0,1997-10-12
1,National,X,1997,41,1513,0.727032,0,0,0,11,0,0,0,1997-10-19
2,National,X,1997,42,1552,1.095360,0,0,3,13,0,1,0,1997-10-26
3,National,X,1997,43,1669,0.419413,0,0,0,7,0,0,0,1997-11-02
4,National,X,1997,44,1897,0.527148,0,0,9,1,0,0,0,1997-11-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,National,X,2015,35,6115,2.273100,3,0,64,43,0,29,0,2015-09-06
936,National,X,2015,36,6786,2.180960,1,0,74,35,0,38,0,2015-09-13
937,National,X,2015,37,7694,2.573430,4,0,91,56,0,47,0,2015-09-20
938,National,X,2015,38,8855,1.885940,1,0,58,62,0,46,0,2015-09-27


In [67]:
p2015_type_A_percent = who_data_prior_2015[['Date','A (2009 H1N1)','A (H1)','A (H3)','A (Subtyping not Performed)','A (Unable to Subtype)','TOTAL SPECIMENS']]
a_percent = (p2015_type_A_percent['A (2009 H1N1)'] + p2015_type_A_percent['A (H1)'] + p2015_type_A_percent['A (H3)'] + p2015_type_A_percent['A (Subtyping not Performed)'] + p2015_type_A_percent['A (Unable to Subtype)']) / p2015_type_A_percent['TOTAL SPECIMENS']

p2015_type_A_percent = p2015_type_A_percent.assign(Case_Number = a_percent)
p2015_type_A_percent = p2015_type_A_percent.assign(Virus_Family = 'Orthomyxovididae')
p2015_type_A_percent = p2015_type_A_percent.assign(Virus_Genus = 'Influenza Viruses A')
p2015_type_A_percent = p2015_type_A_percent.assign(Virus_Species = np.nan)
p2015_type_A_percent = p2015_type_A_percent.assign(Region = 'United States')
p2015_type_A_percent = p2015_type_A_percent.assign(Subregion = np.nan)
p2015_type_A_percent = p2015_type_A_percent.assign(Comments = np.nan)
p2015_type_A_percent = p2015_type_A_percent.assign(Type = '% Positive Cases')

p2015_type_A_percent = p2015_type_A_percent.rename(columns={'Subregion':'Sub-region',
                                                            'Case_Number':'Case Number',
                                                            'Virus_Family': 'Virus Family',
                                                            'Virus_Genus': 'Virus Genus',
                                                            'Virus_Species': 'Virus Species'})

p2015_type_A_percent = p2015_type_A_percent[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]
p2015_type_A_percent

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,Influenza Viruses A,,United States,,1997-10-12,% Positive Cases,0.000000,
1,Orthomyxovididae,Influenza Viruses A,,United States,,1997-10-19,% Positive Cases,0.007270,
2,Orthomyxovididae,Influenza Viruses A,,United States,,1997-10-26,% Positive Cases,0.010309,
3,Orthomyxovididae,Influenza Viruses A,,United States,,1997-11-02,% Positive Cases,0.004194,
4,Orthomyxovididae,Influenza Viruses A,,United States,,1997-11-09,% Positive Cases,0.005271,
...,...,...,...,...,...,...,...,...,...
935,Orthomyxovididae,Influenza Viruses A,,United States,,2015-09-06,% Positive Cases,0.017989,
936,Orthomyxovididae,Influenza Viruses A,,United States,,2015-09-13,% Positive Cases,0.016210,
937,Orthomyxovididae,Influenza Viruses A,,United States,,2015-09-20,% Positive Cases,0.019626,
938,Orthomyxovididae,Influenza Viruses A,,United States,,2015-09-27,% Positive Cases,0.013665,


In [68]:
p2015_type_B_percent = who_data_prior_2015[['Date','B','TOTAL SPECIMENS']]
b_percent = (p2015_type_B_percent['B']) / p2015_type_B_percent['TOTAL SPECIMENS']

p2015_type_B_percent = p2015_type_B_percent.assign(Case_Number = b_percent)
p2015_type_B_percent = p2015_type_B_percent.assign(Virus_Family = 'Orthomyxovididae')
p2015_type_B_percent = p2015_type_B_percent.assign(Virus_Genus = 'Influenza Viruses B')
p2015_type_B_percent = p2015_type_B_percent.assign(Virus_Species = np.nan)
p2015_type_B_percent = p2015_type_B_percent.assign(Region = 'United States')
p2015_type_B_percent = p2015_type_B_percent.assign(Subregion = np.nan)
p2015_type_B_percent = p2015_type_B_percent.assign(Comments = np.nan)
p2015_type_B_percent = p2015_type_B_percent.assign(Type = '% Positive Cases')

p2015_type_B_percent = p2015_type_B_percent.rename(columns={'Subregion':'Sub-region',
                                                            'Case_Number':'Case Number',
                                                            'Virus_Family': 'Virus Family',
                                                            'Virus_Genus': 'Virus Genus',
                                                            'Virus_Species': 'Virus Species'})

p2015_type_B_percent = p2015_type_B_percent[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]
p2015_type_B_percent

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,Influenza Viruses B,,United States,,1997-10-12,% Positive Cases,0.000000,
1,Orthomyxovididae,Influenza Viruses B,,United States,,1997-10-19,% Positive Cases,0.000000,
2,Orthomyxovididae,Influenza Viruses B,,United States,,1997-10-26,% Positive Cases,0.000644,
3,Orthomyxovididae,Influenza Viruses B,,United States,,1997-11-02,% Positive Cases,0.000000,
4,Orthomyxovididae,Influenza Viruses B,,United States,,1997-11-09,% Positive Cases,0.000000,
...,...,...,...,...,...,...,...,...,...
935,Orthomyxovididae,Influenza Viruses B,,United States,,2015-09-06,% Positive Cases,0.004742,
936,Orthomyxovididae,Influenza Viruses B,,United States,,2015-09-13,% Positive Cases,0.005600,
937,Orthomyxovididae,Influenza Viruses B,,United States,,2015-09-20,% Positive Cases,0.006109,
938,Orthomyxovididae,Influenza Viruses B,,United States,,2015-09-27,% Positive Cases,0.005195,


In [69]:
p2015_type_H3N2v_percent = who_data_prior_2015[['Date','H3N2v','TOTAL SPECIMENS']]
H3N2v_percent = (p2015_type_H3N2v_percent['H3N2v']) / p2015_type_H3N2v_percent['TOTAL SPECIMENS']

p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Case_Number = H3N2v_percent)
p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Virus_Family = 'Orthomyxovididae')
p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Virus_Genus = 'H3N2v')
p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Virus_Species = np.nan)
p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Region = 'United States')
p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Subregion = np.nan)
p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Comments = np.nan)
p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.assign(Type = '% Positive Cases')

p2015_type_H3N2v_percent = p2015_type_H3N2v_percent.rename(columns={'Subregion':'Sub-region',
                                                                    'Case_Number':'Case Number',
                                                                    'Virus_Family': 'Virus Family',
                                                                    'Virus_Genus': 'Virus Genus',
                                                                    'Virus_Species': 'Virus Species'})

p2015_type_H3N2v_percent = p2015_type_H3N2v_percent[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]
p2015_type_H3N2v_percent

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,H3N2v,,United States,,1997-10-12,% Positive Cases,0.0,
1,Orthomyxovididae,H3N2v,,United States,,1997-10-19,% Positive Cases,0.0,
2,Orthomyxovididae,H3N2v,,United States,,1997-10-26,% Positive Cases,0.0,
3,Orthomyxovididae,H3N2v,,United States,,1997-11-02,% Positive Cases,0.0,
4,Orthomyxovididae,H3N2v,,United States,,1997-11-09,% Positive Cases,0.0,
...,...,...,...,...,...,...,...,...,...
935,Orthomyxovididae,H3N2v,,United States,,2015-09-06,% Positive Cases,0.0,
936,Orthomyxovididae,H3N2v,,United States,,2015-09-13,% Positive Cases,0.0,
937,Orthomyxovididae,H3N2v,,United States,,2015-09-20,% Positive Cases,0.0,
938,Orthomyxovididae,H3N2v,,United States,,2015-09-27,% Positive Cases,0.0,


In [70]:
p2015_total_percent = p2015_type_A_percent.append(p2015_type_B_percent).append(p2015_type_H3N2v_percent)
p2015_total_percent

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,Influenza Viruses A,,United States,,1997-10-12,% Positive Cases,0.000000,
1,Orthomyxovididae,Influenza Viruses A,,United States,,1997-10-19,% Positive Cases,0.007270,
2,Orthomyxovididae,Influenza Viruses A,,United States,,1997-10-26,% Positive Cases,0.010309,
3,Orthomyxovididae,Influenza Viruses A,,United States,,1997-11-02,% Positive Cases,0.004194,
4,Orthomyxovididae,Influenza Viruses A,,United States,,1997-11-09,% Positive Cases,0.005271,
...,...,...,...,...,...,...,...,...,...
935,Orthomyxovididae,H3N2v,,United States,,2015-09-06,% Positive Cases,0.000000,
936,Orthomyxovididae,H3N2v,,United States,,2015-09-13,% Positive Cases,0.000000,
937,Orthomyxovididae,H3N2v,,United States,,2015-09-20,% Positive Cases,0.000000,
938,Orthomyxovididae,H3N2v,,United States,,2015-09-27,% Positive Cases,0.000000,


In [71]:
p2015_total_percent.to_csv('/Pandemic-Database/Influenza/Processed Data/Prior_2015_WHO_data_clean.csv',
                           index=False)

#### Data after 2015 (Clinical Lab Data)

In [72]:
clin_data = pd.read_csv('/Pandemic-Database/Influenza/WHO_NREVSS_Clinical_Labs.csv',
                        skiprows=1)

In [73]:
clin_data = clin_data[['YEAR','WEEK','PERCENT A', 'PERCENT B']]

dates = clin_data['YEAR']*100 + clin_data['WEEK']
dates = pd.to_datetime(dates.astype(str) + '0', format = '%Y%W%w')

clin_data = clin_data.assign(Date = dates)

clin_data = clin_data[['Date','PERCENT A','PERCENT B']]

clin_data

Unnamed: 0,Date,PERCENT A,PERCENT B
0,2015-10-11,0.698312,0.357469
1,2015-10-18,0.884753,0.411868
2,2015-10-25,0.721672,0.386876
3,2015-11-01,0.723942,0.384132
4,2015-11-08,0.660448,0.462994
...,...,...,...
402,2023-06-25,0.590348,0.365454
403,2023-07-02,0.744670,0.375847
404,2023-07-09,0.487211,0.327345
405,2023-07-16,0.620487,0.421179


In [74]:
clin_data = pd.melt(clin_data,
                    id_vars=['Date'],
                    var_name='Virus',
                    value_vars=['PERCENT A','PERCENT B'],
                    value_name='Case Number')

clin_data['Virus'] = np.where(clin_data['Virus'] == 'PERCENT A', 'Influenza Viruses A',' Influenza Viruses B')

clin_data

Unnamed: 0,Date,Virus,Case Number
0,2015-10-11,Influenza Viruses A,0.698312
1,2015-10-18,Influenza Viruses A,0.884753
2,2015-10-25,Influenza Viruses A,0.721672
3,2015-11-01,Influenza Viruses A,0.723942
4,2015-11-08,Influenza Viruses A,0.660448
...,...,...,...
809,2023-06-25,Influenza Viruses B,0.365454
810,2023-07-02,Influenza Viruses B,0.375847
811,2023-07-09,Influenza Viruses B,0.327345
812,2023-07-16,Influenza Viruses B,0.421179


In [75]:
clin_data = clin_data.assign(Region = 'United States')
clin_data = clin_data.assign(Subregion = np.nan)
clin_data = clin_data.assign(Comments = np.nan)
clin_data = clin_data.assign(Virus_Family = 'Orthomyxovididae')
clin_data = clin_data.assign(Virus_Species = np.nan)
clin_data = clin_data.assign(Type = '% Positive')

clin_data = clin_data.rename(columns={'Subregion':'Sub-region',
                                     'Virus_Family':'Virus Family',
                                     'Virus_Species':'Virus Species',
                                     'Virus':'Virus Genus'})

clin_data = clin_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]
clin_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,Influenza Viruses A,,United States,,2015-10-11,% Positive,0.698312,
1,Orthomyxovididae,Influenza Viruses A,,United States,,2015-10-18,% Positive,0.884753,
2,Orthomyxovididae,Influenza Viruses A,,United States,,2015-10-25,% Positive,0.721672,
3,Orthomyxovididae,Influenza Viruses A,,United States,,2015-11-01,% Positive,0.723942,
4,Orthomyxovididae,Influenza Viruses A,,United States,,2015-11-08,% Positive,0.660448,
...,...,...,...,...,...,...,...,...,...
809,Orthomyxovididae,Influenza Viruses B,,United States,,2023-06-25,% Positive,0.365454,
810,Orthomyxovididae,Influenza Viruses B,,United States,,2023-07-02,% Positive,0.375847,
811,Orthomyxovididae,Influenza Viruses B,,United States,,2023-07-09,% Positive,0.327345,
812,Orthomyxovididae,Influenza Viruses B,,United States,,2023-07-16,% Positive,0.421179,


In [76]:
clin_data.to_csv('/Pandemic-Database/Influenza/Processed Data/After_2015_influenza_ili_percent_clinical_data_clean.csv',
                index=False)

#### Data after 2015 (Public Health Lab Data)

In [77]:
public_health_data = pd.read_csv('/Pandemic-Database/Influenza/WHO_NREVSS_Public_Health_Labs.csv',
                                 skiprows = 1)

a_percent = (public_health_data['A (2009 H1N1)'] + public_health_data['A (H3)'] + public_health_data['A (Subtyping not Performed)'])/ public_health_data['TOTAL SPECIMENS']
b_percent = (public_health_data['B'] + public_health_data['BVic'] + public_health_data['BYam'])/ public_health_data['TOTAL SPECIMENS']
H3N2v_percent = (public_health_data['H3N2v'])/ public_health_data['TOTAL SPECIMENS']

public_health_data = public_health_data.assign(a_percent = a_percent)
public_health_data = public_health_data.assign(b_percent = b_percent)
public_health_data = public_health_data.assign(H3N2v_percent = H3N2v_percent)

public_health_data = public_health_data.rename(columns={'a_percent':'PERCENT A',
                                                        'b_percent':'PERCENT B',
                                                        'H3N2v_percent':'PERCENT H3N2v'})

public_health_data = public_health_data[['YEAR','WEEK','PERCENT A','PERCENT B','PERCENT H3N2v']]

dates = public_health_data['YEAR']*100 + public_health_data['WEEK']
dates = pd.to_datetime(dates.astype(str) + '0', format = '%Y%W%w')

public_health_data = public_health_data.assign(Date = dates)

public_health_data = public_health_data.rename(columns={'PERCENT A':'Influenza Viruses A',
                                                       'PERCENT B': 'Influenza Viruses B',
                                                       'PERCENT H3N2v': 'H3N2v'})

public_health_data = pd.melt(public_health_data,
                             id_vars=['Date'],
                             var_name='Virus',
                             value_vars=['Influenza Viruses A','Influenza Viruses B', 'H3N2v'],
                             value_name='Case Number')

public_health_data = public_health_data.assign(Region = 'United States')
public_health_data = public_health_data.assign(Subregion = np.nan)
public_health_data = public_health_data.assign(Comments = np.nan)
public_health_data = public_health_data.assign(Virus_Family = 'Orthomyxovididae')
public_health_data = public_health_data.assign(Virus_Species = np.nan)
public_health_data  = public_health_data.assign(Type = '% Positive')

public_health_data = public_health_data.rename(columns={'Virus':'Virus Genus',
                                                       'Virus_Family':'Virus Family',
                                                       'Virus_Species':'Virus Species',
                                                       'Subregion':'Sub-region'})

public_health_data = public_health_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]

public_health_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Orthomyxovididae,Influenza Viruses A,,United States,,2015-10-11,% Positive,0.062335,
1,Orthomyxovididae,Influenza Viruses A,,United States,,2015-10-18,% Positive,0.041667,
2,Orthomyxovididae,Influenza Viruses A,,United States,,2015-10-25,% Positive,0.050918,
3,Orthomyxovididae,Influenza Viruses A,,United States,,2015-11-01,% Positive,0.035370,
4,Orthomyxovididae,Influenza Viruses A,,United States,,2015-11-08,% Positive,0.021160,
...,...,...,...,...,...,...,...,...,...
1216,Orthomyxovididae,H3N2v,,United States,,2023-06-25,% Positive,0.000000,
1217,Orthomyxovididae,H3N2v,,United States,,2023-07-02,% Positive,0.000000,
1218,Orthomyxovididae,H3N2v,,United States,,2023-07-09,% Positive,0.000000,
1219,Orthomyxovididae,H3N2v,,United States,,2023-07-16,% Positive,0.000000,


In [78]:
public_health_data.to_csv('/Pandemic-Database/Influenza/Processed Data/After_2015_influenza_ili_percent_public_health_data_clean.csv',
                         index = False)

## Monkeypox

In [54]:
monkeypox_data = pd.read_csv('/Pandemic-Database/Monkeypox/global_monkeypox-data.csv')

monkeypox_data = monkeypox_data[['location','date','total_cases','total_deaths']]
monkeypox_data = monkeypox_data.rename(columns={'location':'Region',
                                               'date':'Date',
                                               'total_cases': 'Cases',
                                               'total_deaths': 'Deaths'})

monkeypox_data = pd.melt(monkeypox_data,
                         id_vars=['Region','Date'],
                         var_name='Type',
                         value_vars=['Cases','Deaths'],
                         value_name='Case Number')

monkeypox_data = monkeypox_data.assign(Virus_Family = 'Poxviridae')
monkeypox_data = monkeypox_data.assign(Virus_Genus = 'Orthopoxvirus')
monkeypox_data = monkeypox_data.assign(Virus_Species = 'MPXV')
monkeypox_data = monkeypox_data.assign(Subregion = np.nan)
monkeypox_data = monkeypox_data.assign(Comments = np.nan)

monkeypox_data = monkeypox_data.rename(columns={'Subregion':'Sub-region',
                                                'Virus_Family': 'Virus Family',
                                                'Virus_Genus': 'Virus Genus', 
                                                'Virus_Species': 'Virus Species'})

monkeypox_data = monkeypox_data[['Virus Family','Virus Genus','Virus Species','Region','Sub-region','Date','Type','Case Number','Comments']]

monkeypox_data

Unnamed: 0,Virus Family,Virus Genus,Virus Species,Region,Sub-region,Date,Type,Case Number,Comments
0,Poxviridae,Orthopoxvirus,MPXV,Africa,,2022-05-01,Cases,27.0,
1,Poxviridae,Orthopoxvirus,MPXV,Africa,,2022-05-02,Cases,27.0,
2,Poxviridae,Orthopoxvirus,MPXV,Africa,,2022-05-03,Cases,27.0,
3,Poxviridae,Orthopoxvirus,MPXV,Africa,,2022-05-04,Cases,27.0,
4,Poxviridae,Orthopoxvirus,MPXV,Africa,,2022-05-05,Cases,27.0,
...,...,...,...,...,...,...,...,...,...
83457,Poxviridae,Orthopoxvirus,MPXV,World,,2023-07-20,Deaths,150.0,
83458,Poxviridae,Orthopoxvirus,MPXV,World,,2023-07-21,Deaths,150.0,
83459,Poxviridae,Orthopoxvirus,MPXV,World,,2023-07-22,Deaths,150.0,
83460,Poxviridae,Orthopoxvirus,MPXV,World,,2023-07-23,Deaths,150.0,


In [56]:
monkeypox_data.to_csv('/Pandemic-Database/Monkeypox/global_monkeypox_data_clean.csv',
                     index=False)

## Norovirus

## Zika Virus