In [1]:
#importing modules
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Load data
# Directory that holds the raw CSV files. It defaults to the original local
# folder so existing runs are unchanged; set the CAPSTONE_DATA_DIR environment
# variable to point the notebook at the data on another machine.
DATA_DIR = Path(os.environ.get(
    'CAPSTONE_DATA_DIR',
    r'C:\Users\agrae\Documents\AAA_Work\Coding\Springboard\HW\000_Capstone_2\Data',
))

# USA deaths by week w/ comorbidity
cdc_usa_raw = pd.read_csv(DATA_DIR / 'USA_DEATHS_BY_WEEK.csv')

# Worldwide deaths and case numbers
who_raw = pd.read_csv(DATA_DIR / 'WHO-COVID-19-global-data.csv')

In [3]:
# Show the last five rows of the raw WHO table as a quick parsing check.
who_raw.tail(n=5)

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
113044,2021-04-19,ZW,Zimbabwe,AFRO,82,37751,1,1553
113045,2021-04-20,ZW,Zimbabwe,AFRO,108,37859,0,1553
113046,2021-04-21,ZW,Zimbabwe,AFRO,16,37875,1,1554
113047,2021-04-22,ZW,Zimbabwe,AFRO,105,37980,1,1555
113048,2021-04-23,ZW,Zimbabwe,AFRO,38,38018,0,1555


In [4]:
# Column dtypes and non-null counts for the raw WHO table (reveals missing values).
who_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113049 entries, 0 to 113048
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Date_reported      113049 non-null  object
 1   Country_code       112572 non-null  object
 2   Country            113049 non-null  object
 3   WHO_region         113049 non-null  object
 4   New_cases          113049 non-null  int64 
 5   Cumulative_cases   113049 non-null  int64 
 6   New_deaths         113049 non-null  int64 
 7   Cumulative_deaths  113049 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 6.9+ MB


Here we see some null values in Country_code (only 112,572 of the 113,049 rows are non-null). We'll have to deal with that.

In [5]:
# Summary statistics of the numeric columns; used to sanity-check value ranges.
who_raw.describe()

Unnamed: 0,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
count,113049.0,113049.0,113049.0,113049.0
mean,1276.959159,180002.8,27.121983,4422.662907
std,8014.842116,1172761.0,152.176415,24146.188185
min,-32952.0,0.0,-514.0,0.0
25%,0.0,13.0,0.0,0.0
50%,5.0,1650.0,0.0,27.0
75%,248.0,31658.0,4.0,561.0
max,402270.0,31530210.0,6409.0,564091.0


We have negative values for both New_cases and New_deaths, which is not possible for genuine daily counts. From the WHO literature, though, we know that those columns are calculated as the reported cumulative number for one day minus the cumulative number for the previous day. A negative value therefore means a reported cumulative total went down from one day to the next, so we must sometimes be getting mis-reported numbers of cases and deaths. We'll check out the worst of those cases and see if we should do something about it.

In [6]:
# Rows of the raw data whose daily case count is below zero.
who_raw.loc[who_raw['New_cases'].lt(0)]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
77,2020-03-20,AF,Afghanistan,EMRO,-2,24,0,0
2483,2020-04-10,AO,Angola,AFRO,-2,17,0,2
2955,2020-04-05,AI,Anguilla,AMRO,-1,2,0,0
3420,2020-03-24,AG,Antigua and Barbuda,AMRO,-2,3,0,0
4846,2020-03-19,AW,Aruba,AMRO,-3,5,0,0
...,...,...,...,...,...,...,...,...
108557,2020-10-07,VI,United States Virgin Islands,AMRO,-5,1322,0,20
108558,2020-10-08,VI,United States Virgin Islands,AMRO,-1,1321,0,20
108566,2020-10-16,VI,United States Virgin Islands,AMRO,-1,1327,0,20
110686,2020-01-25,VN,Viet Nam,WPRO,-4,2,0,0


Only 88 rows out of over 113,000. Not bad; let's drop them and store the remaining rows in a modified data frame that we'll just call 'who'.

In [12]:
# Start the cleaned frame `who` from the raw rows with a non-negative daily case count.
who = who_raw.loc[who_raw['New_cases'].ge(0)]

In [13]:
# (rows, columns) of the raw rows whose daily death count is below zero.
who_raw.loc[who_raw['New_deaths'].lt(0)].shape

(36, 8)

Only 36 rows for misreported deaths. Let's drop them too.

In [14]:
# Keep only the rows of `who` with a non-negative daily death count.
who = who.loc[who['New_deaths'].ge(0)]

In [16]:
# Re-check the summary statistics after filtering; the minimums should no longer be negative.
who.describe()

Unnamed: 0,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
count,112927.0,112927.0,112927.0,112927.0
mean,1278.481922,180175.4,27.121264,4425.724973
std,8017.913969,1173378.0,151.832463,24157.657554
min,0.0,0.0,0.0,0.0
25%,0.0,13.0,0.0,0.0
50%,5.0,1655.0,0.0,27.0
75%,249.0,31775.0,4.0,561.0
max,402270.0,31530210.0,6409.0,564091.0


That looks far better.

In [19]:
# Distinct country codes present in the filtered data.
who['Country_code'].unique()

array(['AF', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AG', 'AR', 'AM', 'AW',
       'AU', 'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ',
       'BM', 'BT', 'BO', 'XA', 'BA', 'BW', 'BR', 'VG', 'BN', 'BG', 'BF',
       'BI', 'CV', 'KH', 'CM', 'CA', 'KY', 'CF', 'TD', 'CL', 'CN', 'CO',
       'KM', 'CG', 'CK', 'CR', 'CI', 'HR', 'CU', 'CW', 'CY', 'CZ', 'KP',
       'CD', 'DK', 'DJ', 'DM', 'DO', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE',
       'SZ', 'ET', 'FK', 'FO', 'FJ', 'FI', 'FR', 'GF', 'PF', 'GA', 'GM',
       'GE', 'DE', 'GH', 'GI', 'GR', 'GL', 'GD', 'GP', 'GU', 'GT', 'GG',
       'GN', 'GW', 'GY', 'HT', 'VA', 'HN', 'HU', 'IS', 'IN', 'ID', 'IR',
       'IQ', 'IE', 'IM', 'IL', 'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE',
       'KI', 'XK', 'KW', 'KG', 'LA', 'LV', 'LB', 'LS', 'LR', 'LY', 'LI',
       'LT', 'LU', 'MG', 'MW', 'MY', 'MV', 'ML', 'MT', 'MH', 'MQ', 'MR',
       'MU', 'YT', 'MX', 'FM', 'MC', 'MN', 'ME', 'MS', 'MA', 'MZ', 'MM',
       nan, 'NR', 'NP', 'NL', 'NC', 'NZ', 'NI', 'NE

We have a blank-space value (' ') and a NaN value in our country codes. Let's figure out what those are.

In [20]:
# Rows whose country code is a single blank space.
who.loc[who['Country_code'].eq(' ')]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
75366,2020-01-03,,Other,Other,0,0,0,0
75367,2020-01-04,,Other,Other,0,0,0,0
75368,2020-01-05,,Other,Other,0,0,0,0
75369,2020-01-06,,Other,Other,0,0,0,0
75370,2020-01-07,,Other,Other,0,0,0,0
...,...,...,...,...,...,...,...,...
75838,2021-04-19,,Other,Other,0,745,0,13
75839,2021-04-20,,Other,Other,0,745,0,13
75840,2021-04-21,,Other,Other,0,745,0,13
75841,2021-04-22,,Other,Other,0,745,0,13


It's unclear where these are from. There are also few enough of them that it shouldn't matter too much if we remove them.

In [21]:
# Discard the rows whose country code is a single blank space.
who = who.loc[who['Country_code'].ne(' ')]

In [22]:
# Rows whose country code is missing (NaN).
who.loc[who['Country_code'].isnull()]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
68211,2020-01-03,,Namibia,AFRO,0,0,0,0
68212,2020-01-04,,Namibia,AFRO,0,0,0,0
68213,2020-01-05,,Namibia,AFRO,0,0,0,0
68214,2020-01-06,,Namibia,AFRO,0,0,0,0
68215,2020-01-07,,Namibia,AFRO,0,0,0,0
...,...,...,...,...,...,...,...,...
68683,2021-04-19,,Namibia,AFRO,140,46655,2,604
68684,2021-04-20,,Namibia,AFRO,118,46773,0,604
68685,2021-04-21,,Namibia,AFRO,102,46875,4,608
68686,2021-04-22,,Namibia,AFRO,79,46954,1,609


It looks like the NaNs are all from Namibia. This makes sense once we realize that the country code for Namibia is literally 'NA', which pandas reads as a missing value by default when loading a CSV. Let's fill those NaNs with just the letters 'NA'.

In [26]:
# The missing country codes appear to be Namibia's code 'NA', which read_csv
# treats as a missing value by default (passing keep_default_na=False when
# loading would avoid this at the source).
# Fill only Country_code, so a NaN in any other column can never be silently
# replaced by the string 'NA', and build a new frame instead of mutating a
# filtered slice in place (which can raise SettingWithCopyWarning).
who = who.assign(Country_code=who['Country_code'].fillna('NA'))

# info() prints its report and returns None, so call it on its own line and
# leave describe() as the cell's final expression to get the rich table output.
who.info()
who.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112451 entries, 0 to 113048
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Date_reported      112451 non-null  object
 1   Country_code       112451 non-null  object
 2   Country            112451 non-null  object
 3   WHO_region         112451 non-null  object
 4   New_cases          112451 non-null  int64 
 5   Cumulative_cases   112451 non-null  int64 
 6   New_deaths         112451 non-null  int64 
 7   Cumulative_deaths  112451 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 7.7+ MB


(None,
            New_cases  Cumulative_cases     New_deaths  Cumulative_deaths
 count  112451.000000      1.124510e+05  112451.000000      112451.000000
 mean     1283.887017      1.809352e+05      27.235961        4444.411904
 std      8034.434583      1.175800e+06     152.143219       24207.022016
 min         0.000000      0.000000e+00       0.000000           0.000000
 25%         0.000000      1.300000e+01       0.000000           0.000000
 50%         6.000000      1.703000e+03       0.000000          28.000000
 75%       252.500000      3.236400e+04       4.000000         570.000000
 max    402270.000000      3.153021e+07    6409.000000      564091.000000)

Looking far better. Though, 402,270 new cases in a single day looks a little high. Let's double check that.

In [33]:
# Rows reporting more than 350,000 new cases in a single day.
who.loc[who['New_cases'].gt(350000)]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
108154,2020-12-20,US,United States of America,AMRO,402270,17314834,2747,311150


Looks like it comes from the US. Let's take a look at how the US is handling the virus in general and see if 402,270 new cases in a single day is extremely out of the ordinary.

In [31]:
# Rows with index labels 108145 through 108160 (inclusive), i.e. the rows around the spike.
who.loc[108145:108160]

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
108145,2020-12-11,US,United States of America,AMRO,230852,15203208,3390,287384
108146,2020-12-12,US,United States of America,AMRO,201681,15404889,2749,290133
108147,2020-12-13,US,United States of America,AMRO,243209,15648098,2996,293129
108148,2020-12-14,US,United States of America,AMRO,212577,15860675,2277,295406
108149,2020-12-15,US,United States of America,AMRO,180420,16041095,1434,296840
108150,2020-12-16,US,United States of America,AMRO,204281,16245376,1754,298594
108151,2020-12-17,US,United States of America,AMRO,201468,16446844,2942,301536
108152,2020-12-18,US,United States of America,AMRO,235805,16682649,3424,304960
108153,2020-12-19,US,United States of America,AMRO,229915,16912564,3443,308403
108154,2020-12-20,US,United States of America,AMRO,402270,17314834,2747,311150


It looks like that day had nearly twice as many new cases as the days around it. This seems relatively hard to believe but we'll leave it in for now.