In [168]:
# import necessary libraries
import pandas as pd
import numpy as np

In [169]:
# First attempt: load the county-level CSV with read_csv's default comma separator
raw_data = pd.read_csv('../../data/coronavirus-covid-19-pandemic-usa-counties.csv')

## Data preparation
### Tidy data 

In [170]:
# Preview the first rows to check how the CSV was parsed
raw_data.head()

Unnamed: 0,Admin 2 FIPS Code;Province/State;Admin 2 Level (City/County/Borough/Region);Date;Total Death;Total Confirmed;location
12119;Florida;Sumter;2020-01-31;0;0;28.70181754,-82.079427
13153;Georgia;Houston;2020-01-31;0;0;32.45802497,-83.668791
13227;Georgia;Pickens;2020-01-31;0;0;34.46589159,-84.464066
13303;Georgia;Washington;2020-01-31;0;0;32.96711864,-82.79357
16003;Idaho;Adams;2020-01-31;0;0;44.89333571,-116.454525


In [171]:
# The file is semicolon-delimited, so pass sep=';' to split the fields into columns
raw_data = pd.read_csv('../../data/coronavirus-covid-19-pandemic-usa-counties.csv', sep=';')

In [172]:
# After re-reading with sep=';', each field lands in its own column
raw_data.head()

Unnamed: 0,Admin 2 FIPS Code,Province/State,Admin 2 Level (City/County/Borough/Region),Date,Total Death,Total Confirmed,location
0,12119.0,Florida,Sumter,2020-01-31,0,0,"28.70181754,-82.0794267"
1,13153.0,Georgia,Houston,2020-01-31,0,0,"32.45802497,-83.66879087"
2,13227.0,Georgia,Pickens,2020-01-31,0,0,"34.46589159,-84.46406611"
3,13303.0,Georgia,Washington,2020-01-31,0,0,"32.96711864,-82.79357039"
4,16003.0,Idaho,Adams,2020-01-31,0,0,"44.89333571,-116.4545247"


## Header names

In [173]:
# List the original column labels before renaming them
raw_data.columns

Index(['Admin 2 FIPS Code', 'Province/State',
       'Admin 2 Level (City/County/Borough/Region)', 'Date', 'Total Death',
       'Total Confirmed', 'location'],
      dtype='object')

In [174]:
# Shorten the column labels:
#   - the US has states, not provinces, so 'Province/State' becomes 'State'
#   - 'Admin 2 FIPS Code' becomes 'FIPS' and the 'Admin 2 Level' prefix is dropped
#   - 'Total Confirmed' becomes 'infected total' (presumably cumulative confirmed
#     cases — TODO confirm against the data source); 'Total Death' becomes 'death total'
# The result is reassigned instead of using inplace=True, so the cell builds a new
# frame rather than mutating the one displayed by earlier cells.
raw_data = raw_data.rename(columns={
    'Admin 2 FIPS Code': 'FIPS',
    'Province/State': 'State',
    'Admin 2 Level (City/County/Borough/Region)': 'City/County/Borough/Region',
    'Total Confirmed': 'infected total',
    'Total Death': 'death total',
})


In [175]:
# Confirm the renamed column labels
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,location
0,12119.0,Florida,Sumter,2020-01-31,0,0,"28.70181754,-82.0794267"
1,13153.0,Georgia,Houston,2020-01-31,0,0,"32.45802497,-83.66879087"
2,13227.0,Georgia,Pickens,2020-01-31,0,0,"34.46589159,-84.46406611"
3,13303.0,Georgia,Washington,2020-01-31,0,0,"32.96711864,-82.79357039"
4,16003.0,Idaho,Adams,2020-01-31,0,0,"44.89333571,-116.4545247"


### Remove duplicates 

In [176]:
# Count rows that are exact duplicates of an earlier row; 0 means there are none
raw_data.duplicated().sum()

0

### Homogeneous variables

In [177]:
import json

In [178]:
# Load the lookup that the next cell uses to map 'State' values to abbreviations.
# The `with` block closes the file handle as soon as parsing finishes instead of
# leaving it open until garbage collection; JSON text is UTF-8, so the encoding is
# stated explicitly rather than taken from the platform default.
with open('name_state_us.json', encoding='utf-8') as state_file:
    name_of_state_us = json.load(state_file)

In [179]:
# Look up each 'State' value in the mapping; values with no matching key become NaN
raw_data['Abbr'] = raw_data['State'].map(name_of_state_us)


In [180]:
# Preview the frame with the new 'Abbr' column
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,location,Abbr
0,12119.0,Florida,Sumter,2020-01-31,0,0,"28.70181754,-82.0794267",FL
1,13153.0,Georgia,Houston,2020-01-31,0,0,"32.45802497,-83.66879087",GA
2,13227.0,Georgia,Pickens,2020-01-31,0,0,"34.46589159,-84.46406611",GA
3,13303.0,Georgia,Washington,2020-01-31,0,0,"32.96711864,-82.79357039",GA
4,16003.0,Idaho,Adams,2020-01-31,0,0,"44.89333571,-116.4545247",ID


In [181]:
# Count rows that received no abbreviation.
# TODO: a non-zero count means some 'State' values have no usable entry in
# name_state_us.json — inspect which values are unmatched.
raw_data['Abbr'].isnull().sum()

81954

In [182]:
# 'location' holds "lat,lon" as one string; split it into separate 'lat' and 'lon'
# columns. Both new columns still contain strings at this point.
raw_data[['lat', 'lon']] = raw_data['location'].str.split(',', expand=True)
# Drop the combined column by reassignment rather than inplace=True.
raw_data = raw_data.drop(columns='location')

In [183]:
# Preview the frame: 'location' is replaced by separate 'lat' and 'lon' columns
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon
0,12119.0,Florida,Sumter,2020-01-31,0,0,FL,28.70181754,-82.0794267
1,13153.0,Georgia,Houston,2020-01-31,0,0,GA,32.45802497,-83.66879087
2,13227.0,Georgia,Pickens,2020-01-31,0,0,GA,34.46589159,-84.46406611
3,13303.0,Georgia,Washington,2020-01-31,0,0,GA,32.96711864,-82.79357039
4,16003.0,Idaho,Adams,2020-01-31,0,0,ID,44.89333571,-116.4545247


### Unique identifier

In [184]:
# Row id = the date with its dashes removed, followed by the row's index label
raw_data['id'] = (
    raw_data['Date'].str.replace('-', '')
    + raw_data.index.astype(str)
)

In [185]:
# NOTE: an id built from the row index is a code smell, but it is acceptable for now
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon,id
0,12119.0,Florida,Sumter,2020-01-31,0,0,FL,28.70181754,-82.0794267,202001310
1,13153.0,Georgia,Houston,2020-01-31,0,0,GA,32.45802497,-83.66879087,202001311
2,13227.0,Georgia,Pickens,2020-01-31,0,0,GA,34.46589159,-84.46406611,202001312
3,13303.0,Georgia,Washington,2020-01-31,0,0,GA,32.96711864,-82.79357039,202001313
4,16003.0,Idaho,Adams,2020-01-31,0,0,ID,44.89333571,-116.4545247,202001314


### Data type

In [186]:
# Inspect the dtype of each column
raw_data.dtypes

FIPS                          float64
State                          object
City/County/Borough/Region     object
Date                           object
death total                     int64
infected total                  int64
Abbr                           object
lat                            object
lon                            object
id                             object
dtype: object

In [187]:
# FIPS is an identifier, not a number: cast it to string and strip the trailing
# '.0' left by the float representation. The pattern is anchored and passed as an
# explicit regex because the default of `regex` changed in pandas 2.0; under the
# old default a bare '.0' means "any character followed by 0" and would also
# delete digits inside the code (e.g. '16003.0' -> '103').
# NOTE(review): leading zeros are not restored (Alabama's 01119 stays '1119') and
# any missing FIPS would become the literal string 'nan' — confirm this is acceptable.
raw_data['FIPS'] = raw_data['FIPS'].astype(str).str.replace(r'\.0$', '', regex=True)

# Date is stored as text; parse it into datetime64
raw_data['Date'] = pd.to_datetime(raw_data['Date'])

# lat and lon are still strings after the split; convert them to floats
raw_data['lat'] = raw_data['lat'].astype(float)
raw_data['lon'] = raw_data['lon'].astype(float)

In [188]:
# Preview the frame after the type conversions
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon,id
0,12119,Florida,Sumter,2020-01-31,0,0,FL,28.701818,-82.079427,202001310
1,13153,Georgia,Houston,2020-01-31,0,0,GA,32.458025,-83.668791,202001311
2,13227,Georgia,Pickens,2020-01-31,0,0,GA,34.465892,-84.464066,202001312
3,13303,Georgia,Washington,2020-01-31,0,0,GA,32.967119,-82.79357,202001313
4,16003,Idaho,Adams,2020-01-31,0,0,ID,44.893336,-116.454525,202001314


In [189]:
# Re-check the dtypes after the conversions above
raw_data.dtypes

FIPS                                  object
State                                 object
City/County/Borough/Region            object
Date                          datetime64[ns]
death total                            int64
infected total                         int64
Abbr                                  object
lat                                  float64
lon                                  float64
id                                    object
dtype: object

In [190]:
# Rebuild the id now that 'Date' is datetime64: YYYYMMDD followed by the row's
# index label. strftime states the format explicitly instead of relying on the
# default string form of the timestamps being exactly 'YYYY-MM-DD'.
raw_data['id'] = (
    raw_data['Date'].dt.strftime('%Y%m%d') +
    raw_data.index.astype(str)
)

In [191]:
# Preview the frame with the rebuilt 'id' column
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon,id
0,12119,Florida,Sumter,2020-01-31,0,0,FL,28.701818,-82.079427,202001310
1,13153,Georgia,Houston,2020-01-31,0,0,GA,32.458025,-83.668791,202001311
2,13227,Georgia,Pickens,2020-01-31,0,0,GA,34.465892,-84.464066,202001312
3,13303,Georgia,Washington,2020-01-31,0,0,GA,32.967119,-82.79357,202001313
4,16003,Idaho,Adams,2020-01-31,0,0,ID,44.893336,-116.454525,202001314


In [192]:
# Count missing values in each column
raw_data.isnull().sum()

FIPS                              0
State                             0
City/County/Borough/Region     5652
Date                              0
death total                       0
infected total                    0
Abbr                          81954
lat                               0
lon                               0
id                                0
dtype: int64

In [193]:
# Select every row whose county name is 'Sumter'; the name occurs in several
# states, so the county name alone does not identify a single place
raw_data.loc[raw_data['City/County/Borough/Region'].eq('Sumter')]

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon,id
0,12119,Florida,Sumter,2020-01-31,0,0,FL,28.701818,-82.079427,202001310
3459,45085,South Carolina,Sumter,2021-07-16,212,10748,SC,33.918265,-80.379423,202107163459
4299,1119,Alabama,Sumter,2021-07-27,32,1087,AL,32.591174,-88.199162,202107274299
4975,45085,South Carolina,Sumter,2021-08-04,211,11525,SC,33.918265,-80.379423,202108044975
5270,13261,Georgia,Sumter,2021-07-29,106,2715,GA,32.036506,-84.198215,202107295270
...,...,...,...,...,...,...,...,...,...,...
3097039,12119,Florida,Sumter,2022-03-16,551,21661,FL,28.701818,-82.079427,202203163097039
3097049,12119,Florida,Sumter,2022-03-10,506,21577,FL,28.701818,-82.079427,202203103097049
3097682,12119,Florida,Sumter,2022-02-25,506,21255,FL,28.701818,-82.079427,202202253097682
3097989,45085,South Carolina,Sumter,2022-02-27,365,27488,SC,33.918265,-80.379423,202202273097989
