In [1]:
# import necessary libraries
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
raw_data = pd.read_csv('../../data/coronavirus-covid-19-pandemic-usa-counties.csv')

## Data preparation
### Tidy data 

In [3]:
# sample data
raw_data.head()

Unnamed: 0,Admin 2 FIPS Code;Province/State;Admin 2 Level (City/County/Borough/Region);Date;Total Death;Total Confirmed;location
12119;Florida;Sumter;2020-01-31;0;0;28.70181754,-82.079427
13153;Georgia;Houston;2020-01-31;0;0;32.45802497,-83.668791
13227;Georgia;Pickens;2020-01-31;0;0;34.46589159,-84.464066
13303;Georgia;Washington;2020-01-31;0;0;32.96711864,-82.79357
16003;Idaho;Adams;2020-01-31;0;0;44.89333571,-116.454525


In [4]:
# how to make right?
raw_data = pd.read_csv('../../data/coronavirus-covid-19-pandemic-usa-counties.csv', sep=';')

In [5]:
raw_data.head() # This is better.

Unnamed: 0,Admin 2 FIPS Code,Province/State,Admin 2 Level (City/County/Borough/Region),Date,Total Death,Total Confirmed,location
0,12119.0,Florida,Sumter,2020-01-31,0,0,"28.70181754,-82.0794267"
1,13153.0,Georgia,Houston,2020-01-31,0,0,"32.45802497,-83.66879087"
2,13227.0,Georgia,Pickens,2020-01-31,0,0,"34.46589159,-84.46406611"
3,13303.0,Georgia,Washington,2020-01-31,0,0,"32.96711864,-82.79357039"
4,16003.0,Idaho,Adams,2020-01-31,0,0,"44.89333571,-116.4545247"


## Header name ?

In [6]:
raw_data.columns

Index(['Admin 2 FIPS Code', 'Province/State',
       'Admin 2 Level (City/County/Borough/Region)', 'Date', 'Total Death',
       'Total Confirmed', 'location'],
      dtype='object')

In [7]:
# US doesn't have a province, but states
# FIPS code only, Admin 2 Level is not necessary
# What is total confirmed?
raw_data.rename(columns={
    'Admin 2 FIPS Code': 'FIPS',
    'Province/State': 'State',
    'Admin 2 Level (City/County/Borough/Region)': 'City/County/Borough/Region',
    'Total Confirmed': 'infected total',
    'Total Death': 'death total',
}, inplace=True)


In [8]:
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,location
0,12119.0,Florida,Sumter,2020-01-31,0,0,"28.70181754,-82.0794267"
1,13153.0,Georgia,Houston,2020-01-31,0,0,"32.45802497,-83.66879087"
2,13227.0,Georgia,Pickens,2020-01-31,0,0,"34.46589159,-84.46406611"
3,13303.0,Georgia,Washington,2020-01-31,0,0,"32.96711864,-82.79357039"
4,16003.0,Idaho,Adams,2020-01-31,0,0,"44.89333571,-116.4545247"


### Remove duplicates 

In [9]:
# check duplicates
raw_data.duplicated().sum() # no duplicates, good!

0

### Homogeneous variables

In [10]:
import json

In [11]:
name_of_state_us = json.load(open('name_state_us.json'))

In [12]:
raw_data['Abbr'] = raw_data['State'].map(name_of_state_us)


In [13]:
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,location,Abbr
0,12119.0,Florida,Sumter,2020-01-31,0,0,"28.70181754,-82.0794267",FL
1,13153.0,Georgia,Houston,2020-01-31,0,0,"32.45802497,-83.66879087",GA
2,13227.0,Georgia,Pickens,2020-01-31,0,0,"34.46589159,-84.46406611",GA
3,13303.0,Georgia,Washington,2020-01-31,0,0,"32.96711864,-82.79357039",GA
4,16003.0,Idaho,Adams,2020-01-31,0,0,"44.89333571,-116.4545247",ID


In [14]:
raw_data['Abbr'].isnull().sum() # something went wrong?

81954

In [15]:
# drop missing 'Abbr' (not a US State)
raw_data = raw_data.dropna(subset=['Abbr'])

In [16]:
raw_data['Abbr'].isnull().sum()

0

In [17]:
# locate must show as (lat, lon)
raw_data[['lat', 'lon']] = raw_data['location'].str.split(',', expand=True)
raw_data.drop('location', axis=1, inplace=True)

In [18]:
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon
0,12119.0,Florida,Sumter,2020-01-31,0,0,FL,28.70181754,-82.0794267
1,13153.0,Georgia,Houston,2020-01-31,0,0,GA,32.45802497,-83.66879087
2,13227.0,Georgia,Pickens,2020-01-31,0,0,GA,34.46589159,-84.46406611
3,13303.0,Georgia,Washington,2020-01-31,0,0,GA,32.96711864,-82.79357039
4,16003.0,Idaho,Adams,2020-01-31,0,0,ID,44.89333571,-116.4545247


### Unique identifier

In [19]:
# unique identifier is Date + index of row
raw_data['id'] = raw_data['Abbr'] + raw_data.index.astype(str)  

In [20]:
raw_data.head() # id is smell, but it's ok for now.

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon,id
0,12119.0,Florida,Sumter,2020-01-31,0,0,FL,28.70181754,-82.0794267,FL0
1,13153.0,Georgia,Houston,2020-01-31,0,0,GA,32.45802497,-83.66879087,GA1
2,13227.0,Georgia,Pickens,2020-01-31,0,0,GA,34.46589159,-84.46406611,GA2
3,13303.0,Georgia,Washington,2020-01-31,0,0,GA,32.96711864,-82.79357039,GA3
4,16003.0,Idaho,Adams,2020-01-31,0,0,ID,44.89333571,-116.4545247,ID4


### Data type

In [21]:
raw_data.dtypes

FIPS                          float64
State                          object
City/County/Borough/Region     object
Date                           object
death total                     int64
infected total                  int64
Abbr                           object
lat                            object
lon                            object
id                             object
dtype: object

In [22]:
# First of all FIPS is not float, it's string
raw_data['FIPS'] = raw_data['FIPS'].astype(str).str.replace('.0', '')

# Date is not a string, it's a date
raw_data['Date'] = pd.to_datetime(raw_data['Date'])

# Lat and Lon are not string, they are floated
raw_data['lat'] = raw_data['lat'].astype(float)
raw_data['lon'] = raw_data['lon'].astype(float)

  raw_data['FIPS'] = raw_data['FIPS'].astype(str).str.replace('.0', '')


In [23]:
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon,id
0,12119,Florida,Sumter,2020-01-31,0,0,FL,28.701818,-82.079427,FL0
1,13153,Georgia,Houston,2020-01-31,0,0,GA,32.458025,-83.668791,GA1
2,13227,Georgia,Pickens,2020-01-31,0,0,GA,34.465892,-84.464066,GA2
3,133,Georgia,Washington,2020-01-31,0,0,GA,32.967119,-82.79357,GA3
4,103,Idaho,Adams,2020-01-31,0,0,ID,44.893336,-116.454525,ID4


In [24]:
raw_data.dtypes

FIPS                                  object
State                                 object
City/County/Borough/Region            object
Date                          datetime64[ns]
death total                            int64
infected total                         int64
Abbr                                  object
lat                                  float64
lon                                  float64
id                                    object
dtype: object

In [25]:
length = raw_data.shape[0]
length = len(str(length))


In [26]:
# 
raw_data['id'] = raw_data['Abbr'] + raw_data.index.astype(str).str.zfill(length)

In [27]:
raw_data.head()

Unnamed: 0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon,id
0,12119,Florida,Sumter,2020-01-31,0,0,FL,28.701818,-82.079427,FL0000000
1,13153,Georgia,Houston,2020-01-31,0,0,GA,32.458025,-83.668791,GA0000001
2,13227,Georgia,Pickens,2020-01-31,0,0,GA,34.465892,-84.464066,GA0000002
3,133,Georgia,Washington,2020-01-31,0,0,GA,32.967119,-82.79357,GA0000003
4,103,Idaho,Adams,2020-01-31,0,0,ID,44.893336,-116.454525,ID0000004


In [28]:
# show missing values
raw_data.isnull().sum()

FIPS                          0
State                         0
City/County/Borough/Region    0
Date                          0
death total                   0
infected total                0
Abbr                          0
lat                           0
lon                           0
id                            0
dtype: int64

In [29]:
raw_data.set_index('id', inplace=True)
raw_data

Unnamed: 0_level_0,FIPS,State,City/County/Borough/Region,Date,death total,infected total,Abbr,lat,lon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
FL0000000,12119,Florida,Sumter,2020-01-31,0,0,FL,28.701818,-82.079427
GA0000001,13153,Georgia,Houston,2020-01-31,0,0,GA,32.458025,-83.668791
GA0000002,13227,Georgia,Pickens,2020-01-31,0,0,GA,34.465892,-84.464066
GA0000003,133,Georgia,Washington,2020-01-31,0,0,GA,32.967119,-82.793570
ID0000004,103,Idaho,Adams,2020-01-31,0,0,ID,44.893336,-116.454525
...,...,...,...,...,...,...,...,...,...
AR3099175,83,Arkansas,Logan,2022-03-10,74,5076,AR,35.214132,-93.719510
CA3099176,09,California,Calaveras,2022-03-10,117,7595,CA,38.205371,-120.552913
CA3099177,35,California,Lassen,2022-03-10,60,9813,CA,40.673113,-120.593510
CA3099178,37,California,Los Angeles,2022-03-10,31178,2808409,CA,34.308284,-118.228241


In [33]:
new_cols = ["FIPS","lat","lon","State","Abbr","City/County/Borough/Region","infected total","death total"]
raw_data=raw_data.reindex(columns=new_cols)
raw_data

Unnamed: 0_level_0,FIPS,lat,lon,State,Abbr,City/County/Borough/Region,infected total,death total
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
FL0000000,12119,28.701818,-82.079427,Florida,FL,Sumter,0,0
GA0000001,13153,32.458025,-83.668791,Georgia,GA,Houston,0,0
GA0000002,13227,34.465892,-84.464066,Georgia,GA,Pickens,0,0
GA0000003,133,32.967119,-82.793570,Georgia,GA,Washington,0,0
ID0000004,103,44.893336,-116.454525,Idaho,ID,Adams,0,0
...,...,...,...,...,...,...,...,...
AR3099175,83,35.214132,-93.719510,Arkansas,AR,Logan,5076,74
CA3099176,09,38.205371,-120.552913,California,CA,Calaveras,7595,117
CA3099177,35,40.673113,-120.593510,California,CA,Lassen,9813,60
CA3099178,37,34.308284,-118.228241,California,CA,Los Angeles,2808409,31178


# Conclusion before handling missing values

In [31]:
# Save data
raw_data.to_csv('../../data/covid19_usa.csv')