# Convert Cleaned Covid Tracking Project DF to Kaggle Covid DF format

In [1]:
#Importing Modules
import numpy as np
import pandas as pd

In [2]:
# Load the Kaggle county-level COVID-19 CSV. Its layout
# (date, county, state, fips, cases, deaths) is the target format that the
# Covid Tracking Project data is reshaped into further down.
kaggle_csv_path = "..\\Data_Raw\\us-counties.csv"
covid_kaggle = pd.read_csv(kaggle_csv_path)

# Preview the first rows (displayed because it is the cell's last expression).
covid_kaggle.head()
# Alternative quick check of the column names:
#covid_kaggle.columns

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [4]:
# Load the cleaned Covid Tracking Project dataframe (rows keyed by date and
# two-letter state abbreviation) that is converted to the Kaggle layout below.
#
# Every backslash in this Windows-style path is escaped. The previous literal
# began with an unescaped backslash before "Data_pkl", which is not a valid
# escape sequence: Python passes it through unchanged but emits a
# DeprecationWarning (SyntaxWarning on 3.12+). The resulting path string is
# identical to before.
#
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project's own cleaning step, never pickles from untrusted sources.
covidtp_df = pd.read_pickle('..\\Data_pkl\\covid19\\covidtrackingproject_df.pkl')

# Preview the first rows (displayed because it is the cell's last expression).
covidtp_df.head()
# Alternative quick check of the column names:
#covidtp_df.columns

Unnamed: 0,date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,hospitalized,total,totalTestResults,posNeg,fips,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,2020-04-25,AK,339,15393,,32.0,,,,,...,,15732,15732,15732,2,0,0,3451,0,3451
1,2020-04-25,AL,6137,65207,,,839.0,,288.0,,...,839.0,71344,71344,71344,1,15,71,18344,305,18649
2,2020-04-25,AR,2829,35224,,104.0,291.0,,,25.0,...,291.0,38053,38053,38053,5,2,0,2387,88,2475
3,2020-04-25,AS,0,3,17.0,,,,,,...,,20,3,3,60,0,0,0,0,0
4,2020-04-25,AZ,6280,56228,,697.0,1022.0,313.0,,191.0,...,1022.0,62508,62508,62508,4,0,38,1559,235,1794


In [5]:
# Columns of the Covid Tracking Project frame that have no counterpart in the
# Kaggle dataset and are therefore removed.
drop_cols_list = [
    'negative',
    'pending',
    'hospitalizedCurrently',
    'hospitalizedCumulative',
    'inIcuCurrently',
    'inIcuCumulative',
    'onVentilatorCurrently',
    'onVentilatorCumulative',
    'recovered',
    'o_hash',
    'dateChecked',
    'hospitalized',
    'total',
    'totalTestResults',
    'posNeg',
    'deathIncrease',
    'hospitalizedIncrease',
    'negativeIncrease',
    'positiveIncrease',
    'totalTestResultsIncrease',
]
# drop() returns a new frame; a label missing from the frame raises KeyError,
# so this cell cannot be re-run without first reloading covidtp_df.
dropped_df = covidtp_df.drop(columns=drop_cols_list)

# Kaggle-style names for the columns that remain.
# The assignment below is POSITIONAL: it requires exactly five remaining
# columns, in the same order as this list (pandas raises on a length mismatch
# but cannot detect a wrong order).
# NOTE(review): presumably the remaining columns are date, state, positive,
# death, fips -- verify against covidtp_df.columns.
rename_cols_list = [
    'date',
    'stateabb',
    'cases',
    'deaths',
    'fips',
]
dropped_df.columns = rename_cols_list

# From here on covidtp_df refers to the trimmed, renamed frame.
covidtp_df = dropped_df

In [6]:
# Replace the two-letter abbreviation column ('stateabb') with the full
# state/territory name ('state'), as used by the Kaggle dataset.

# Single source of truth: abbreviation -> full name. Covers the 50 states,
# the District of Columbia and five territories (56 entries in total).
abb_to_state = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
    'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii',
    'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine',
    'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan',
    'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri',
    'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire',
    'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
    'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota',
    'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
    'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming',
    'DC': 'District of Columbia', 'GU': 'Guam', 'PR': 'Puerto Rico',
    'VI': 'Virgin Islands', 'AS': 'American Samoa',
    'MP': 'Northern Mariana Islands',
}

# Parallel lists derived from the mapping (dicts preserve insertion order,
# so state[i] is the full name for abb[i]).
abb = list(abb_to_state.keys())
state = list(abb_to_state.values())

# Lookup table with one row per state/territory.
state_abb = pd.DataFrame({'state': state, 'abbreviation': abb})
state_abb.head()

# Left join keeps every row of covidtp_df; an abbreviation that is not in the
# lookup table ends up with a missing (NaN) 'state'. Both abbreviation
# columns are discarded once the full name has been attached.
covidtp_df1 = (
    covidtp_df
    .merge(state_abb, how='left', left_on='stateabb', right_on='abbreviation')
    .drop(columns=['stateabb', 'abbreviation'])
)

In [8]:
# Put the columns in the Kaggle order. reindex(columns=...) also drops any
# column not listed and would create an all-NaN column for a listed name
# that is missing.
covidtp_df = covidtp_df1.reindex(columns=['date', 'state', 'fips', 'cases', 'deaths'])

# Sort ascending by date (earliest first) to match the Kaggle data, then
# renumber the rows 0..n-1. reset_index(drop=True) discards the old index
# directly instead of materialising it as an 'index' column and dropping it
# again, so it no longer depends on that column's name.
# Rows that share a date are given no explicit secondary ordering.
covidtp_df = covidtp_df.sort_values(by=['date']).reset_index(drop=True)

In [9]:
# Treat a missing death count as zero deaths. Only the 'deaths' column is
# filled here; missing values in any other column are left as they are.
covidtp_df['deaths'] = covidtp_df['deaths'].fillna(0)

In [17]:
# Review the converted data before saving.
# NOTE: in a notebook only the last expression of a cell is displayed, so the
# describe() summary below is computed but not shown; only head() appears in
# the cell output. Run describe() as the last line of a cell to see it.
covidtp_df.describe(include='all')
covidtp_df.head()
# Optional spot checks of the distinct values per column:
#covidtp_df.date.unique()
#covidtp_df.cases.unique()
#covidtp_df.deaths.unique()

Unnamed: 0,date,state,fips,cases,deaths
0,2020-01-22,Washington,53,1,0
1,2020-01-23,Washington,53,1,0
2,2020-01-24,Washington,53,1,0
3,2020-01-25,Washington,53,1,0
4,2020-01-26,Washington,53,1,0


In [22]:
# Persist the Kaggle-formatted frame as a pickle for later use.
# NOTE(review): the input pickle was read from the 'covid19' subfolder of
# Data_pkl, while this output is written directly into Data_pkl -- confirm
# that the different location is intended.
kaggle_format_pkl_path = '..\\Data_pkl\\covidtrackingproject_kaggleformat_df.pkl'
covidtp_df.to_pickle(kaggle_format_pkl_path)