In [61]:
import pandas as pd
import numpy as np
import requests

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# ETL

## extract_covid_data

In [3]:
url = "https://api.covidtracking.com/v1/us/daily.csv"
# Bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving the error body as covid_data.csv.
response.raise_for_status()

# The with-statement closes the file on exit, so no explicit close() is needed.
with open('covid_data.csv', 'wb') as f:
    f.write(response.content)

In [4]:
ls

Covid-ETL.ipynb               email-debug.ipynb
covid-analysis-and-etl.ipynb  extract.ipynb
covid_api.ipynb               testing_email_script.ipynb
covid_data.csv                testing_load.ipynb
covid_data_logs.ipynb         validate.ipynb


In [25]:
# Load the CSV snapshot written by the download cell into a DataFrame.
df = pd.read_csv('covid_data.csv')

In [26]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,lastModified,recovered,total,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash
0,20210307,56,28756489.0,74582825.0,11808.0,40199.0,776361.0,8134.0,45475.0,2802.0,...,2021-03-07T24:00:00Z,,0,0,842,726,131835,41835,1170059,a80d0063822e251249fd9a44730c49cb23defd83
1,20210306,56,28714654.0,74450990.0,11783.0,41401.0,775635.0,8409.0,45453.0,2811.0,...,2021-03-06T24:00:00Z,,0,0,1680,503,143835,60015,1430992,dae5e558c24adb86686bbd58c08cce5f610b8bb0
2,20210305,56,28654639.0,74307155.0,12213.0,42541.0,775132.0,8634.0,45373.0,2889.0,...,2021-03-05T24:00:00Z,,0,0,2221,2781,271917,68787,1744417,724844c01659d0103801c57c0f72bf8cc8ab025c
3,20210304,56,28585852.0,74035238.0,12405.0,44172.0,772351.0,8970.0,45293.0,2973.0,...,2021-03-04T24:00:00Z,,0,0,1743,1530,177957,65487,1590984,5c549ad30f9abf48dc5de36d20fa707014be1ff3
4,20210303,56,28520365.0,73857281.0,11778.0,45462.0,770821.0,9359.0,45214.0,3094.0,...,2021-03-03T24:00:00Z,,0,0,2449,2172,267001,66836,1406795,fef6c425d2b773a9221fe353f13852f3e4a4bfb0


In [27]:
# Show every field of one record (the row at position 1) as a vertical Series.
df.iloc[1]

date                                                        20210306
states                                                            56
positive                                                  28714654.0
negative                                                  74450990.0
pending                                                      11783.0
hospitalizedCurrently                                        41401.0
hospitalizedCumulative                                      775635.0
inIcuCurrently                                                8409.0
inIcuCumulative                                              45453.0
onVentilatorCurrently                                         2811.0
onVentilatorCumulative                                        4280.0
dateChecked                                     2021-03-06T24:00:00Z
death                                                       514309.0
hospitalized                                                775635.0
totalTestResults                  

In [28]:
# Count how many rows there are for each value of the `states` column
# (inspection only; nothing is filtered in this cell).
df['states'].value_counts()

56    357
6      12
4      12
51      9
1       9
5       5
2       5
7       2
3       2
40      1
32      1
26      1
16      1
12      1
11      1
8       1
Name: states, dtype: int64

In [30]:
# Keep only records that include data from all 56 states.
# .copy() makes the result an independent frame, so the in-place edits and
# column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['states'] == 56].copy()

In [33]:
# Columns that are dropped before the analysis.
cols = ["recovered", "lastModified", "states", "dateChecked", "total", "posNeg", "hospitalized"]

# Remove unnecessary columns.
# Reassign instead of mutating in place; errors="ignore" lets this cell be
# re-run after the columns are already gone instead of raising KeyError.
df = df.drop(columns=cols, errors="ignore")

In [35]:
# Select the columns to keep, in the order they should appear in the output.
cols = [
    "hash", "date", "positive", "negative", "positiveIncrease", "negativeIncrease",
    "pending", "hospitalizedCurrently", "hospitalizedIncrease", "hospitalizedCumulative",
    "inIcuCurrently", "inIcuCumulative", "onVentilatorCurrently", "onVentilatorCumulative",
    "totalTestResults", "totalTestResultsIncrease", "death", "deathIncrease",
]

# .copy() so that later column assignments act on an independent frame
# rather than on a subset that pandas flags with SettingWithCopyWarning.
df = df[cols].copy()
df.head()

Unnamed: 0,hash,date,positive,negative,positiveIncrease,negativeIncrease,pending,hospitalizedCurrently,hospitalizedIncrease,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,totalTestResults,totalTestResultsIncrease,death,deathIncrease
0,a80d0063822e251249fd9a44730c49cb23defd83,20210307,28756489.0,74582825.0,41835,131835,11808.0,40199.0,726,776361.0,8134.0,45475.0,2802.0,4281.0,363825123,1170059,515151.0,842
1,dae5e558c24adb86686bbd58c08cce5f610b8bb0,20210306,28714654.0,74450990.0,60015,143835,11783.0,41401.0,503,775635.0,8409.0,45453.0,2811.0,4280.0,362655064,1430992,514309.0,1680
2,724844c01659d0103801c57c0f72bf8cc8ab025c,20210305,28654639.0,74307155.0,68787,271917,12213.0,42541.0,2781,775132.0,8634.0,45373.0,2889.0,4275.0,361224072,1744417,512629.0,2221
3,5c549ad30f9abf48dc5de36d20fa707014be1ff3,20210304,28585852.0,74035238.0,65487,177957,12405.0,44172.0,1530,772351.0,8970.0,45293.0,2973.0,4267.0,359479655,1590984,510408.0,1743
4,fef6c425d2b773a9221fe353f13852f3e4a4bfb0,20210303,28520365.0,73857281.0,66836,267001,11778.0,45462.0,2172,770821.0,9359.0,45214.0,3094.0,4260.0,357888671,1406795,508665.0,2449


In [37]:
# Rename columns.
# Reassign the result instead of using inplace=True, so the cell does not
# mutate a frame that was derived by indexing another one.
df = df.rename(columns={
    "negative": "pcr_test_negative",
    "positive": "pcr_test_positive",
})

In [42]:
# Parse the YYYYMMDD values in `date` (e.g. 20210307) into datetimes.
# An explicit format replaces the manual year/month/day slicing and makes a
# value that is not in YYYYMMDD form raise instead of being sliced blindly.
df["date"] = pd.to_datetime(df["date"].astype(str), format="%Y%m%d")
df.head()

Unnamed: 0,hash,date,pcr_test_positive,pcr_test_negative,positiveIncrease,negativeIncrease,pending,hospitalizedCurrently,hospitalizedIncrease,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,totalTestResults,totalTestResultsIncrease,death,deathIncrease
0,a80d0063822e251249fd9a44730c49cb23defd83,2021-03-07,28756489.0,74582825.0,41835,131835,11808.0,40199.0,726,776361.0,8134.0,45475.0,2802.0,4281.0,363825123,1170059,515151.0,842
1,dae5e558c24adb86686bbd58c08cce5f610b8bb0,2021-03-06,28714654.0,74450990.0,60015,143835,11783.0,41401.0,503,775635.0,8409.0,45453.0,2811.0,4280.0,362655064,1430992,514309.0,1680
2,724844c01659d0103801c57c0f72bf8cc8ab025c,2021-03-05,28654639.0,74307155.0,68787,271917,12213.0,42541.0,2781,775132.0,8634.0,45373.0,2889.0,4275.0,361224072,1744417,512629.0,2221
3,5c549ad30f9abf48dc5de36d20fa707014be1ff3,2021-03-04,28585852.0,74035238.0,65487,177957,12405.0,44172.0,1530,772351.0,8970.0,45293.0,2973.0,4267.0,359479655,1590984,510408.0,1743
4,fef6c425d2b773a9221fe353f13852f3e4a4bfb0,2021-03-03,28520365.0,73857281.0,66836,267001,11778.0,45462.0,2172,770821.0,9359.0,45214.0,3094.0,4260.0,357888671,1406795,508665.0,2449


In [43]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [44]:
# Write the cleaned frame to disk; the row index is included as the first column.
df.to_csv("clean_covid_data.csv", index=True)

In [45]:
# Results table: one row per candidate feature, plus an empty R2_Score
# column that the scoring loop fills in.
r_squared_scores = pd.DataFrame(
    {"Feature": ["hospitalizedCurrently", "inIcuCurrently", "onVentilatorCurrently"]}
)
r_squared_scores["R2_Score"] = None
r_squared_scores

Unnamed: 0,Feature,R2_Score
0,hospitalizedCurrently,
1,inIcuCurrently,
2,onVentilatorCurrently,


In [63]:
# Regression target: the deathIncrease column as an (n_rows, 1) array.
y = np.array(df['deathIncrease']).reshape(-1, 1)
# Row position in r_squared_scores at which the scoring loop starts writing.
i = 0

In [65]:
# Train a Linear Regression model and calculate R2 score for each feature
for col in ["hospitalizedCurrently", "inIcuCurrently", "onVentilatorCurrently"]:

    # scikit-learn expects a 2-D feature matrix, hence the (n_rows, 1) reshape.
    x = np.array(df.loc[:, col]).reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

    model = LinearRegression()

    model.fit(x_train, y_train)
    # R2 is computed on the held-out 30% split, not on the training rows.
    r_squared = model.score(x_test, y_test)

    # Write the score with a single .loc call keyed on the feature name.
    # The previous chained form (r_squared_scores.R2_Score[i] = ...) triggers
    # SettingWithCopyWarning and relied on an external counter, so re-running
    # the cell indexed past the end of the table.
    r_squared_scores.loc[r_squared_scores["Feature"] == col, "R2_Score"] = r_squared

# Export R2 scores
r_squared_scores.to_csv("R2_Squared_Scores.csv")

In [67]:
# Display the results table after the scoring loop has filled in R2_Score.
r_squared_scores

Unnamed: 0,Feature,R2_Score
0,hospitalizedCurrently,0.513929
1,inIcuCurrently,0.543345
2,onVentilatorCurrently,0.483789
