## Data wrangling notebook: builds the final datasets used for analysis.

In [68]:
import requests
import csv
import pandas as pd
import numpy as np
import datetime

In [69]:
# Column layout shared by the confirmed / deaths / recovered tables:
# the report date first, then one column per region name (alphabetical).
COLUMNS = [
    'Date',
    'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware', 'Diamond Princess', 'District of Columbia',
    'Florida',
    'Georgia', 'Grand Princess', 'Guam',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota', 'Northern Mariana Islands',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Puerto Rico',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virgin Islands', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

In [70]:
# Start with three independent, empty tables that all share the COLUMNS layout;
# rows are added later, one per downloaded daily report.
covid_confirmed_dataset, covid_deaths_dataset, covid_recovered_dataset = (
    pd.DataFrame(columns=COLUMNS) for _unused in range(3)
)

In [71]:
def append_data_to_dataset(dataset, data_frame, date=None):
    """Return ``dataset`` extended by one row built from ``data_frame``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table accumulated so far. It is not modified in place.
    data_frame : iterable
        Each item is indexed as ``item[0]`` (column name; surrounding
        whitespace is stripped) and ``item[1]`` (the count, typically a
        numeric string). Empty or whitespace-only strings are stored as 0.
    date : optional
        Value stored in the ``'Date'`` column. When omitted, the module-level
        global named ``date`` is used, which is what this function implicitly
        relied on before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``dataset`` plus the new row, with a fresh 0..n-1 index.
        Names not already among ``dataset``'s columns are added at the end.

    Raises
    ------
    NameError
        If ``date`` is not given and no global ``date`` is defined.
    ValueError
        If a non-blank value cannot be converted with ``int(float(value))``.
    """
    if date is None:
        # Backward compatibility: older callers pass only two arguments and
        # expect the row to be stamped with the global ``date``.
        try:
            date = globals()['date']
        except KeyError:
            raise NameError("name 'date' is not defined") from None

    temp_data = {'Date': date}
    for data in data_frame:
        name, raw = data[0].strip(), data[1]
        if isinstance(raw, str) and not raw.strip():
            temp_data[name] = 0
        else:
            # Go through float first so strings such as '123.0' are accepted.
            temp_data[name] = int(float(raw))

    # DataFrame.append was removed in pandas 2.0; build a one-row frame instead.
    new_row = pd.DataFrame([temp_data])
    if len(dataset) == 0:
        # Concatenating with an empty frame is deprecated in recent pandas;
        # reindexing the single row keeps the column order and lets the
        # values keep their natural dtypes.
        extra = [col for col in new_row.columns if col not in dataset.columns]
        return new_row.reindex(columns=[*dataset.columns, *extra])
    return pd.concat([dataset, new_row], ignore_index=True)


In [72]:
def get_daily_data(date):
    """Download one daily US report and add it to the three global datasets.

    Parameters
    ----------
    date : str
        Date part of the report's file name; it is concatenated into the URL
        as ``<date>.csv``.

    Side effects
    ------------
    Rebinds the globals ``covid_confirmed_dataset``, ``covid_deaths_dataset``
    and ``covid_recovered_dataset`` to the frames returned by
    ``append_data_to_dataset``. The ``'Date'`` cell of each new row is filled
    in by ``append_data_to_dataset``; this function's ``date`` argument is not
    passed along to it.

    Raises
    ------
    KeyError
        If no report exists for ``date`` (HTTP 404), or if the downloaded
        table lacks one of the expected columns. The datasets are left
        untouched in that case.
    requests.HTTPError
        For any other unsuccessful HTTP status.
    """
    url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/"+date+".csv"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, allow_redirects=True, timeout=30)
    if response.status_code == 404:
        # Previously a missing report only surfaced as an accidental KeyError
        # when the error page was parsed as CSV; raise the same type on purpose.
        raise KeyError("no daily report available for " + date)
    response.raise_for_status()

    content = response.content.decode('utf-8')
    data = list(csv.reader(content.splitlines(), delimiter=","))
    # First CSV line is the header, the rest are the per-region rows.
    df = pd.DataFrame(data[1:], columns=data[0])

    # Extract all three (name, value) tables before touching the globals so a
    # missing column cannot leave the datasets partially updated.
    df_confirmed = np.array(df.loc[:, ["Province_State", "Confirmed"]])
    df_deaths = np.array(df.loc[:, ["Province_State", "Deaths"]])
    df_recovered = np.array(df.loc[:, ["Province_State", "Recovered"]])

    global covid_confirmed_dataset
    global covid_deaths_dataset
    global covid_recovered_dataset
    covid_confirmed_dataset = append_data_to_dataset(covid_confirmed_dataset, df_confirmed)
    covid_deaths_dataset = append_data_to_dataset(covid_deaths_dataset, df_deaths)
    covid_recovered_dataset = append_data_to_dataset(covid_recovered_dataset, df_recovered)

In [73]:
def parse_date(date):
    """Format a date as ``MM-DD-YYYY``.

    Parameters
    ----------
    date : str or datetime-like
        Anything ``pandas.Timestamp`` can interpret, e.g. the string form of
        a ``numpy.datetime64`` (``'2020-06-28T00:00:00.000000000'``), a plain
        ISO date (``'2020-06-28'``), or a ``datetime`` object.

    Returns
    -------
    str
        The calendar date as zero-padded ``MM-DD-YYYY``; any time-of-day
        component is discarded.
    """
    # pd.Timestamp handles every sub-second precision, so there is no need to
    # trim nanosecond digits by hand before parsing.
    return pd.Timestamp(date).strftime('%m-%d-%Y')

In [74]:
# Fetch every daily report from start_date up to today, in chronological order.
start_date = "06-28-2020"
dates = np.array(pd.date_range(start_date, pd.to_datetime('today')))
# NOTE: the loop variable must remain a module-level name called `date`:
# append_data_to_dataset is called without a date and reads this global to
# fill the 'Date' column.
for date in dates:
    temp_date = parse_date(str(date))
    try:
        get_daily_data(temp_date)
    except KeyError as e:
        # KeyError is how get_daily_data reports a download without the
        # expected columns (e.g. a date whose report is not published yet).
        # Stop here, as before, but say so instead of failing silently.
        print("No usable report for {} ({!r}); stopping download.".format(temp_date, e))
        break

In [75]:
# Preview the confirmed-cases table (one row per downloaded report).
covid_confirmed_dataset

Unnamed: 0,Date,Alabama,Alaska,American Samoa,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2020-06-28,35441,880,0,73920,19818,215296,32307,46303,11226,...,40172,150152,21100,1202,81,61736,31752,2832,27743,1417
1,2020-06-29,37175,901,0,74545,20257,223646,32511,46362,11376,...,42297,156706,21664,1208,81,62189,32253,2870,28058,1450
2,2020-06-30,38045,937,0,79228,20777,231232,32715,46514,11474,...,43509,163060,22217,1208,81,62787,32824,2905,28659,1487
3,2020-07-01,38962,975,0,84105,21197,238681,33029,46572,11510,...,45315,172368,22716,1210,90,63203,33435,2979,29199,1514
4,2020-07-02,40111,1014,0,87445,22075,246550,33352,46646,11731,...,46890,179137,23270,1227,92,63735,34151,3053,29738,1550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2020-11-10,206567,20832,0,263133,124235,989432,138427,82953,27112,...,289749,1010364,137385,2462,1413,194912,120011,29316,293812,19242
136,2020-11-11,208637,21331,0,265163,126197,995575,142402,84741,27342,...,293381,1022336,139720,2535,1413,196506,120011,30201,301349,19374
137,2020-11-12,210637,21812,0,266562,128006,1004116,147599,85899,27546,...,296725,1031363,143639,2651,1413,198027,123356,30897,309572,20479
138,2020-11-13,213617,22405,0,269577,130318,1013566,154038,88645,28016,...,300458,1043116,145789,2743,1426,199262,125498,31639,318023,21341


In [76]:
# Preview the deaths table (one row per downloaded report).
covid_deaths_dataset

Unnamed: 0,Date,Alabama,Alaska,American Samoa,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2020-06-28,919,14,0,1594,264,5932,1676,4316,507,...,584,2402,167,56,6,1732,1310,93,777,20
1,2020-06-29,929,14,0,1598,265,5983,1681,4320,507,...,592,2416,168,56,6,1740,1320,93,777,20
2,2020-06-30,950,14,0,1645,270,6082,1690,4322,509,...,604,2455,172,56,6,1763,1332,93,784,20
3,2020-07-01,972,14,0,1725,277,6169,1697,4324,509,...,609,2503,173,56,6,1786,1339,93,786,20
4,2020-07-02,985,14,0,1764,279,6265,1701,4326,510,...,620,2542,176,56,6,1816,1342,93,793,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2020-11-10,3120,92,0,6192,2112,18066,2427,4707,722,...,3672,19337,672,59,23,3726,2482,546,2395,127
136,2020-11-11,3201,96,0,6228,2126,18108,2443,4716,724,...,3761,19469,678,59,23,3741,2482,553,2457,127
137,2020-11-12,3213,96,0,6240,2144,18135,2468,4726,732,...,3788,19767,687,59,23,3758,2507,555,2621,127
138,2020-11-13,3231,97,0,6257,2148,18205,2504,4737,734,...,3852,19785,701,59,23,3785,2519,565,2683,127


In [77]:
# Preview the recovered table (one row per downloaded report).
covid_recovered_dataset

Unnamed: 0,Date,Alabama,Alaska,American Samoa,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2020-06-28,18866,521,0,8926,13270,0,4442,8053,6665,...,26159,79974,11931,946,71,8005,0,2062,21953,1057
1,2020-06-29,18866,525,0,9179,14066,0,4459,8053,6665,...,26962,81335,12205,949,71,8023,0,2196,22217,1070
2,2020-06-30,18866,526,0,9411,14531,0,4479,8053,6667,...,27599,84818,12398,953,73,8080,0,2272,22587,1097
3,2020-07-01,18866,528,0,9715,15163,0,4502,8053,6676,...,28283,87556,12707,961,73,8131,0,2284,23089,1119
4,2020-07-02,22082,535,0,10137,15698,0,4524,8210,6678,...,28938,90720,13076,960,75,8496,0,2380,23527,1139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2020-11-10,84471,7161,0,43682,109235,0,8738,9800,14276,...,256143,826116,95975,1936,1357,21716,0,21499,214469,11098
136,2020-11-11,84471,7160,0,43990,110365,0,8846,9800,14380,...,259438,831800,97269,1947,1357,21863,0,21877,219304,11234
137,2020-11-12,88038,7161,0,44363,111357,0,8980,9800,14487,...,262527,838950,98897,1958,1357,22002,0,22115,223937,11585
138,2020-11-13,88038,7161,0,44675,112383,0,9148,9800,14571,...,265459,850648,100892,1977,1357,22095,0,22543,229469,12082


In [78]:
# Persist the three final tables with IPython's %store magic so that another
# notebook/kernel can restore them via `%store -r <name>`.
%store covid_confirmed_dataset
%store covid_deaths_dataset
%store covid_recovered_dataset

Stored 'covid_confirmed_dataset' (DataFrame)
Stored 'covid_deaths_dataset' (DataFrame)
Stored 'covid_recovered_dataset' (DataFrame)
