In [1]:
# Dependencies (storing and analysis)
import numpy as np
import pandas as pd

# Dependencies (visualization)
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [2]:
# COVID-19 data
# Read the raw 2020 case file; the 'Date' column is parsed into datetimes on load.
covid = pd.read_csv(
    'Resources/covid_2020.csv',
    parse_dates=['Date'],
)
print(covid)

         Province/State  Country/Region      Lat      Long       Date  \
0                 Anhui  Mainland China  31.8257  117.2264 2020-01-22   
1               Beijing  Mainland China  40.1824  116.4142 2020-01-22   
2             Chongqing  Mainland China  30.0572  107.8740 2020-01-22   
3                Fujian  Mainland China  26.0789  117.9874 2020-01-22   
4                 Gansu  Mainland China  36.0611  103.8343 2020-01-22   
...                 ...             ...      ...       ...        ...   
9085  Ramsey County, MN              US  44.9964  -93.0616 2020-03-06   
9086  Washoe County, NV              US  40.5608 -119.6035 2020-03-06   
9087   Wayne County, PA              US  41.6739  -75.2479 2020-03-06   
9088    Yolo County, CA              US  38.7646 -121.9018 2020-03-06   
9089                NaN    Vatican City  41.9029   12.4534 2020-03-06   

      Confirmed  Deaths  Recovered  
0           1.0     0.0        0.0  
1          14.0     0.0        0.0  
2           

In [3]:
 # selecting only the columns we need
covid = covid[
    ['Date', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered']
]  # drops Province/State, Lat and Long
print(covid)

           Date  Country/Region  Confirmed  Deaths  Recovered
0    2020-01-22  Mainland China        1.0     0.0        0.0
1    2020-01-22  Mainland China       14.0     0.0        0.0
2    2020-01-22  Mainland China        6.0     0.0        0.0
3    2020-01-22  Mainland China        1.0     0.0        0.0
4    2020-01-22  Mainland China        0.0     0.0        0.0
...         ...             ...        ...     ...        ...
9085 2020-03-06              US        1.0     0.0        0.0
9086 2020-03-06              US        1.0     0.0        0.0
9087 2020-03-06              US        1.0     0.0        0.0
9088 2020-03-06              US        1.0     0.0        0.0
9089 2020-03-06    Vatican City        1.0     0.0        0.0

[9090 rows x 5 columns]


In [4]:
# renaming columns with slash
# Rename by label instead of overwriting `.columns` positionally: the mapping
# does not depend on column order, and re-running the cell is a no-op because
# `rename` ignores labels that are no longer present.
covid = covid.rename(columns={'Country/Region': 'Country', 'Confirmed': 'Cases'})
print(covid)

           Date         Country  Cases  Deaths  Recovered
0    2020-01-22  Mainland China    1.0     0.0        0.0
1    2020-01-22  Mainland China   14.0     0.0        0.0
2    2020-01-22  Mainland China    6.0     0.0        0.0
3    2020-01-22  Mainland China    1.0     0.0        0.0
4    2020-01-22  Mainland China    0.0     0.0        0.0
...         ...             ...    ...     ...        ...
9085 2020-03-06              US    1.0     0.0        0.0
9086 2020-03-06              US    1.0     0.0        0.0
9087 2020-03-06              US    1.0     0.0        0.0
9088 2020-03-06              US    1.0     0.0        0.0
9089 2020-03-06    Vatican City    1.0     0.0        0.0

[9090 rows x 5 columns]


In [5]:
# group by date and country
# The value columns are selected with a list (double brackets): indexing a
# GroupBy with a bare tuple of names was deprecated in pandas 1.0 and raises
# in pandas 2.x. The whole aggregation is one chain so `covid` is never left
# bound to an intermediate GroupBy object.
covid = (
    covid
    .groupby(['Date', 'Country'])[['Cases', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)
print(covid)

           Date               Country  Cases  Deaths  Recovered
0    2020-01-22           Afghanistan    0.0     0.0        0.0
1    2020-01-22               Algeria    0.0     0.0        0.0
2    2020-01-22               Andorra    0.0     0.0        0.0
3    2020-01-22             Argentina    0.0     0.0        0.0
4    2020-01-22               Armenia    0.0     0.0        0.0
...         ...                   ...    ...     ...        ...
4450 2020-03-06                    US  278.0    14.0        8.0
4451 2020-03-06               Ukraine    1.0     0.0        0.0
4452 2020-03-06  United Arab Emirates   29.0     0.0        5.0
4453 2020-03-06          Vatican City    1.0     0.0        0.0
4454 2020-03-06               Vietnam   16.0     0.0       16.0

[4455 rows x 5 columns]


In [6]:
# keep only the (date, country) rows that report at least one case
covid_cases = covid.loc[covid['Cases'].gt(0)]
print(covid_cases)

           Date               Country  Cases  Deaths  Recovered
44   2020-01-22                 Japan    2.0     0.0        0.0
52   2020-01-22                 Macau    1.0     0.0        0.0
53   2020-01-22        Mainland China  547.0    17.0       28.0
84   2020-01-22           South Korea    1.0     0.0        0.0
89   2020-01-22                Taiwan    1.0     0.0        0.0
...         ...                   ...    ...     ...        ...
4450 2020-03-06                    US  278.0    14.0        8.0
4451 2020-03-06               Ukraine    1.0     0.0        0.0
4452 2020-03-06  United Arab Emirates   29.0     0.0        5.0
4453 2020-03-06          Vatican City    1.0     0.0        0.0
4454 2020-03-06               Vietnam   16.0     0.0       16.0

[1640 rows x 5 columns]


In [7]:
# preview the first five rows of covid_cases
covid_cases.head(n=5)

Unnamed: 0,Date,Country,Cases,Deaths,Recovered
44,2020-01-22,Japan,2.0,0.0,0.0
52,2020-01-22,Macau,1.0,0.0,0.0
53,2020-01-22,Mainland China,547.0,17.0,28.0
84,2020-01-22,South Korea,1.0,0.0,0.0
89,2020-01-22,Taiwan,1.0,0.0,0.0


In [8]:
#open('u.item', encoding = "ISO-8859-1")

In [9]:
# H1N1 data
# Read the 2009 file as ISO-8859-1 text; 'Date' is parsed into datetimes on load.
h1n1 = pd.read_csv(
    'Resources/h1n1_2009.csv',
    encoding="ISO-8859-1",
    parse_dates=['Date'],
)
print(h1n1)

                       Country  Cases  Deaths                Date
0                      Algeria      5     0.0 2009-07-06 09:00:00
1          Antigua and Barbuda      2     0.0 2009-07-06 09:00:00
2                    Argentina   2485    60.0 2009-07-06 09:00:00
3                    Australia   5298    10.0 2009-07-06 09:00:00
4                      Austria     19     0.0 2009-07-06 09:00:00
...                        ...    ...     ...                 ...
1817                  Thailand      2     0.0 2009-05-23 08:00:00
1818                    Turkey      2     0.0 2009-05-23 08:00:00
1819            United Kingdom    117     0.0 2009-05-23 08:00:00
1820  United States of America   6552     9.0 2009-05-23 08:00:00
1821               Grand Total  12022    86.0 2009-05-23 08:00:00

[1822 rows x 4 columns]


In [10]:
 # selecting only the columns we need
h1n1 = h1n1[
    ['Date', 'Country', 'Cases', 'Deaths']
]  # same four columns, reordered so Date comes first
print(h1n1)

                    Date                   Country  Cases  Deaths
0    2009-07-06 09:00:00                   Algeria      5     0.0
1    2009-07-06 09:00:00       Antigua and Barbuda      2     0.0
2    2009-07-06 09:00:00                 Argentina   2485    60.0
3    2009-07-06 09:00:00                 Australia   5298    10.0
4    2009-07-06 09:00:00                   Austria     19     0.0
...                  ...                       ...    ...     ...
1817 2009-05-23 08:00:00                  Thailand      2     0.0
1818 2009-05-23 08:00:00                    Turkey      2     0.0
1819 2009-05-23 08:00:00            United Kingdom    117     0.0
1820 2009-05-23 08:00:00  United States of America   6552     9.0
1821 2009-05-23 08:00:00               Grand Total  12022    86.0

[1822 rows x 4 columns]


In [11]:
# group by date and country
# The value columns are selected with a list (double brackets): indexing a
# GroupBy with a bare tuple of names was deprecated in pandas 1.0 and raises
# in pandas 2.x. The whole aggregation is one chain so `h1n1` is never left
# bound to an intermediate GroupBy object.
h1n1 = (
    h1n1
    .groupby(['Date', 'Country'])[['Cases', 'Deaths']]
    .sum()
    .reset_index()
)
print(h1n1)

                    Date                  Country  Cases  Deaths
0    2009-05-23 08:00:00                Argentina      1     0.0
1    2009-05-23 08:00:00                Australia     12     0.0
2    2009-05-23 08:00:00                  Austria      1     0.0
3    2009-05-23 08:00:00                  Belgium      7     0.0
4    2009-05-23 08:00:00                   Brazil      8     0.0
...                  ...                      ...    ...     ...
1817 2009-07-06 09:00:00       Netherlands, Aruba      5     0.0
1818 2009-07-06 09:00:00       New Caledonia, FOC     12     0.0
1819 2009-07-06 09:00:00              Puerto Rico     18     0.0
1820 2009-07-06 09:00:00        Saint Martin, FOC      1     0.0
1821 2009-07-06 09:00:00           Virgin Islands      1     0.0

[1822 rows x 4 columns]


In [12]:
# keep only the rows that report at least one case
# NOTE(review): the raw data printed by the load cell includes a 'Grand Total'
# row in the Country column; this filter does not drop it — confirm whether it
# should be excluded before any per-country comparison.
h1n1_cases = h1n1.loc[h1n1['Cases'].gt(0)]
print(h1n1_cases)

                    Date                  Country  Cases  Deaths
0    2009-05-23 08:00:00                Argentina      1     0.0
1    2009-05-23 08:00:00                Australia     12     0.0
2    2009-05-23 08:00:00                  Austria      1     0.0
3    2009-05-23 08:00:00                  Belgium      7     0.0
4    2009-05-23 08:00:00                   Brazil      8     0.0
...                  ...                      ...    ...     ...
1817 2009-07-06 09:00:00       Netherlands, Aruba      5     0.0
1818 2009-07-06 09:00:00       New Caledonia, FOC     12     0.0
1819 2009-07-06 09:00:00              Puerto Rico     18     0.0
1820 2009-07-06 09:00:00        Saint Martin, FOC      1     0.0
1821 2009-07-06 09:00:00           Virgin Islands      1     0.0

[1822 rows x 4 columns]


In [13]:
# SARS data
# Read the 2003 file; the 'Date' column is parsed into datetimes on load.
sars = pd.read_csv(
    'Resources/sars_2003.csv',
    parse_dates=['Date'],
)
print(sars)

           Date               Country  Cumulative number of case(s)  \
0    2003-03-17               Germany                             1   
1    2003-03-17                Canada                             8   
2    2003-03-17             Singapore                            20   
3    2003-03-17  Hong Kong SAR, China                            95   
4    2003-03-17           Switzerland                             2   
...         ...                   ...                           ...   
2533 2003-07-11           Switzerland                             1   
2534 2003-07-11              Thailand                             9   
2535 2003-07-11        United Kingdom                             4   
2536 2003-07-11         United States                            75   
2537 2003-07-11              Viet Nam                            63   

      Number of deaths  Number recovered  
0                    0                 0  
1                    2                 0  
2                 

In [14]:
 # selecting only the columns we need
sars = sars[
    ['Date', 'Country', 'Cumulative number of case(s)', 'Number of deaths']
]  # drops 'Number recovered'
print(sars)

           Date               Country  Cumulative number of case(s)  \
0    2003-03-17               Germany                             1   
1    2003-03-17                Canada                             8   
2    2003-03-17             Singapore                            20   
3    2003-03-17  Hong Kong SAR, China                            95   
4    2003-03-17           Switzerland                             2   
...         ...                   ...                           ...   
2533 2003-07-11           Switzerland                             1   
2534 2003-07-11              Thailand                             9   
2535 2003-07-11        United Kingdom                             4   
2536 2003-07-11         United States                            75   
2537 2003-07-11              Viet Nam                            63   

      Number of deaths  
0                    0  
1                    2  
2                    0  
3                    1  
4                    0

In [15]:
# renaming columns
# Rename by label instead of overwriting `.columns` positionally: the mapping
# does not depend on column order, and re-running the cell is a no-op because
# `rename` ignores labels that are no longer present.
sars = sars.rename(
    columns={
        'Cumulative number of case(s)': 'Cases',
        'Number of deaths': 'Deaths',
    }
)
print(sars)

           Date               Country  Cases  Deaths
0    2003-03-17               Germany      1       0
1    2003-03-17                Canada      8       2
2    2003-03-17             Singapore     20       0
3    2003-03-17  Hong Kong SAR, China     95       1
4    2003-03-17           Switzerland      2       0
...         ...                   ...    ...     ...
2533 2003-07-11           Switzerland      1       0
2534 2003-07-11              Thailand      9       2
2535 2003-07-11        United Kingdom      4       0
2536 2003-07-11         United States     75       0
2537 2003-07-11              Viet Nam     63       5

[2538 rows x 4 columns]


In [16]:
# group by date and country
# The value columns are selected with a list (double brackets): indexing a
# GroupBy with a bare tuple of names was deprecated in pandas 1.0 and raises
# in pandas 2.x. The whole aggregation is one chain so `sars` is never left
# bound to an intermediate GroupBy object.
# NOTE(review): the printed row count drops from 2538 to 2537 across this step,
# so one (Date, Country) pair appears twice in the input; summing cumulative
# counts for that pair may double-count — confirm against the raw file.
sars = (
    sars
    .groupby(['Date', 'Country'])[['Cases', 'Deaths']]
    .sum()
    .reset_index()
)
print(sars)

           Date               Country  Cases  Deaths
0    2003-03-17                Canada      8       2
1    2003-03-17               Germany      1       0
2    2003-03-17  Hong Kong SAR, China     95       1
3    2003-03-17             Singapore     20       0
4    2003-03-17           Switzerland      2       0
...         ...                   ...    ...     ...
2532 2003-07-11         Taiwan, China    671      84
2533 2003-07-11              Thailand      9       2
2534 2003-07-11        United Kingdom      4       0
2535 2003-07-11         United States     75       0
2536 2003-07-11              Viet Nam     63       5

[2537 rows x 4 columns]


In [17]:
# keep only the (date, country) rows that report at least one case
sars_cases = sars.loc[sars['Cases'].gt(0)]
print(sars_cases)

           Date               Country  Cases  Deaths
0    2003-03-17                Canada      8       2
1    2003-03-17               Germany      1       0
2    2003-03-17  Hong Kong SAR, China     95       1
3    2003-03-17             Singapore     20       0
4    2003-03-17           Switzerland      2       0
...         ...                   ...    ...     ...
2532 2003-07-11         Taiwan, China    671      84
2533 2003-07-11              Thailand      9       2
2534 2003-07-11        United Kingdom      4       0
2535 2003-07-11         United States     75       0
2536 2003-07-11              Viet Nam     63       5

[2530 rows x 4 columns]
