In [44]:
import pandas as pd

raw_data = pd.read_csv("../../data/covid19_us_counties.csv")

## Data preparation
### Tidy data 

In [45]:
# sample data
raw_data.head()

Unnamed: 0,Admin 2 FIPS Code;Province/State;Admin 2 Level (City/County/Borough/Region);Date;Total Death;Total Confirmed;location;Year
36103.0;New York;Suffolk;2020-06-22;1965;41010;40.88320119,-72.8012172;2020
36105.0;New York;Sullivan;2020-06-22;45;1438;41.71579493,-74.76394559;2020
37133.0;North Carolina;Onslow;2020-06-22;3;221;34.72607366,-77.42908179;2020
38001.0;North Dakota;Adams;2020-06-22;0;0;46.09686891,-102.5285397;2020
39063.0;Ohio;Hancock;2020-06-22;5;63;41.00250487,-83.66838948;2020


In [46]:
# how to make right?
raw_data = pd.read_csv("../../data/covid19_us_counties.csv", sep=";")

In [47]:
raw_data.head()  # This is better.

Unnamed: 0,Admin 2 FIPS Code,Province/State,Admin 2 Level (City/County/Borough/Region),Date,Total Death,Total Confirmed,location,Year
0,36103.0,New York,Suffolk,2020-06-22,1965,41010,"40.88320119,-72.8012172",2020
1,36105.0,New York,Sullivan,2020-06-22,45,1438,"41.71579493,-74.76394559",2020
2,37133.0,North Carolina,Onslow,2020-06-22,3,221,"34.72607366,-77.42908179",2020
3,38001.0,North Dakota,Adams,2020-06-22,0,0,"46.09686891,-102.5285397",2020
4,39063.0,Ohio,Hancock,2020-06-22,5,63,"41.00250487,-83.66838948",2020


## Header name ?

In [48]:
raw_data.columns

Index(['Admin 2 FIPS Code', 'Province/State',
       'Admin 2 Level (City/County/Borough/Region)', 'Date', 'Total Death',
       'Total Confirmed', 'location', 'Year'],
      dtype='object')

In [49]:
# US doesn't have a province, but states
# FIPS code only, Admin 2 Level is not necessary
# What is total confirmed?
raw_data.rename(
    columns={
        "Admin 2 FIPS Code": "FIPS",
        "Province/State": "state",
        "Admin 2 Level (City/County/Borough/Region)": "CCBR",
        "Total Confirmed": "cases",
        "Total Death": "death",
    },
    inplace=True,
)

### Remove duplicates 

In [50]:
# check duplicates
raw_data.duplicated().sum()  # no duplicates, good!

0

### Homogeneous variables

In [51]:
import json

name_of_state_us = json.load(open("name_state_us.json"))
raw_data["abbr"] = raw_data["state"].map(name_of_state_us)

raw_data.head()

Unnamed: 0,FIPS,state,CCBR,Date,death,cases,location,Year,abbr
0,36103.0,New York,Suffolk,2020-06-22,1965,41010,"40.88320119,-72.8012172",2020,NY
1,36105.0,New York,Sullivan,2020-06-22,45,1438,"41.71579493,-74.76394559",2020,NY
2,37133.0,North Carolina,Onslow,2020-06-22,3,221,"34.72607366,-77.42908179",2020,NC
3,38001.0,North Dakota,Adams,2020-06-22,0,0,"46.09686891,-102.5285397",2020,ND
4,39063.0,Ohio,Hancock,2020-06-22,5,63,"41.00250487,-83.66838948",2020,OH


In [52]:
raw_data["abbr"].isnull().sum()  # something went wrong?

63510

### Missing values

why is it missing? because the name of state is not correct or not it's not a state.

In [53]:
# drop missing 'Abbr' (not a US State)
raw_data = raw_data.dropna(subset=["abbr"])

In [54]:
raw_data["abbr"].isnull().sum()

0

In [55]:
# locate must show as (lat, lon)
raw_data[["lat", "lon"]] = raw_data["location"].str.split(",", expand=True)
raw_data.drop("location", axis=1, inplace=True)

In [56]:
raw_data.head()

Unnamed: 0,FIPS,state,CCBR,Date,death,cases,Year,abbr,lat,lon
0,36103.0,New York,Suffolk,2020-06-22,1965,41010,2020,NY,40.88320119,-72.8012172
1,36105.0,New York,Sullivan,2020-06-22,45,1438,2020,NY,41.71579493,-74.76394559
2,37133.0,North Carolina,Onslow,2020-06-22,3,221,2020,NC,34.72607366,-77.42908179
3,38001.0,North Dakota,Adams,2020-06-22,0,0,2020,ND,46.09686891,-102.5285397
4,39063.0,Ohio,Hancock,2020-06-22,5,63,2020,OH,41.00250487,-83.66838948


In [57]:
# ccbr is short for City/County/Borough/Region now change it to location (better name)
raw_data.rename(columns={"CCBR": "location", "Date": "date"}, inplace=True)

# drop missing 'location' (not a US State)
raw_data = raw_data.dropna(subset=["location"])

# reorganize columns
raw_data = raw_data.reindex(
    columns=[
        "date",
        "FIPS",
        "lat",
        "lon",
        "state",
        "abbr",
        "location",
        "cases",
        "death",
    ]
)
raw_data.head()

Unnamed: 0,date,FIPS,lat,lon,state,abbr,location,cases,death
0,2020-06-22,36103.0,40.88320119,-72.8012172,New York,NY,Suffolk,41010,1965
1,2020-06-22,36105.0,41.71579493,-74.76394559,New York,NY,Sullivan,1438,45
2,2020-06-22,37133.0,34.72607366,-77.42908179,North Carolina,NC,Onslow,221,3
3,2020-06-22,38001.0,46.09686891,-102.5285397,North Dakota,ND,Adams,0,0
4,2020-06-22,39063.0,41.00250487,-83.66838948,Ohio,OH,Hancock,63,5


### Unique identifier

In [58]:
# unique identifier is Date + index of row
raw_data["id"] = raw_data["abbr"] + raw_data.index.astype(str)

In [59]:
raw_data.head()  # id is smell, but it's ok for now.

Unnamed: 0,date,FIPS,lat,lon,state,abbr,location,cases,death,id
0,2020-06-22,36103.0,40.88320119,-72.8012172,New York,NY,Suffolk,41010,1965,NY0
1,2020-06-22,36105.0,41.71579493,-74.76394559,New York,NY,Sullivan,1438,45,NY1
2,2020-06-22,37133.0,34.72607366,-77.42908179,North Carolina,NC,Onslow,221,3,NC2
3,2020-06-22,38001.0,46.09686891,-102.5285397,North Dakota,ND,Adams,0,0,ND3
4,2020-06-22,39063.0,41.00250487,-83.66838948,Ohio,OH,Hancock,63,5,OH4


### Data type

In [60]:
raw_data.dtypes

date         object
FIPS        float64
lat          object
lon          object
state        object
abbr         object
location     object
cases         int64
death         int64
id           object
dtype: object

### We already implemented function to convert data type in `format_data.py`

In [61]:
# First of all FIPS is not float, it's string
raw_data["FIPS"] = raw_data["FIPS"].astype(str).str.replace(".0", "")

# Date is not a string, it's a date
raw_data["date"] = pd.to_datetime(raw_data["date"])

# Lat and Lon are not string, they are floated
raw_data["lat"] = raw_data["lat"].astype(float)
raw_data["lon"] = raw_data["lon"].astype(float)

In [62]:
raw_data.head()

Unnamed: 0,date,FIPS,lat,lon,state,abbr,location,cases,death,id
0,2020-06-22,36103,40.883201,-72.801217,New York,NY,Suffolk,41010,1965,NY0
1,2020-06-22,36105,41.715795,-74.763946,New York,NY,Sullivan,1438,45,NY1
2,2020-06-22,37133,34.726074,-77.429082,North Carolina,NC,Onslow,221,3,NC2
3,2020-06-22,38001,46.096869,-102.52854,North Dakota,ND,Adams,0,0,ND3
4,2020-06-22,39063,41.002505,-83.668389,Ohio,OH,Hancock,63,5,OH4


In [63]:
raw_data.dtypes

date        datetime64[ns]
FIPS                object
lat                float64
lon                float64
state               object
abbr                object
location            object
cases                int64
death                int64
id                  object
dtype: object

In [64]:
length = raw_data.shape[0]
length = len(str(length))

In [65]:
#
raw_data["id"] = raw_data["abbr"] + raw_data.index.astype(str).str.zfill(length)

In [66]:
raw_data.head()

Unnamed: 0,date,FIPS,lat,lon,state,abbr,location,cases,death,id
0,2020-06-22,36103,40.883201,-72.801217,New York,NY,Suffolk,41010,1965,NY0000000
1,2020-06-22,36105,41.715795,-74.763946,New York,NY,Sullivan,1438,45,NY0000001
2,2020-06-22,37133,34.726074,-77.429082,North Carolina,NC,Onslow,221,3,NC0000002
3,2020-06-22,38001,46.096869,-102.52854,North Dakota,ND,Adams,0,0,ND0000003
4,2020-06-22,39063,41.002505,-83.668389,Ohio,OH,Hancock,63,5,OH0000004


In [67]:
# show missing values
raw_data.isnull().sum()

date        0
FIPS        0
lat         0
lon         0
state       0
abbr        0
location    0
cases       0
death       0
id          0
dtype: int64

In [68]:
raw_data.set_index("id", inplace=True)
raw_data

Unnamed: 0_level_0,date,FIPS,lat,lon,state,abbr,location,cases,death
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NY0000000,2020-06-22,36103,40.883201,-72.801217,New York,NY,Suffolk,41010,1965
NY0000001,2020-06-22,36105,41.715795,-74.763946,New York,NY,Sullivan,1438,45
NC0000002,2020-06-22,37133,34.726074,-77.429082,North Carolina,NC,Onslow,221,3
ND0000003,2020-06-22,38001,46.096869,-102.528540,North Dakota,ND,Adams,0,0
OH0000004,2020-06-22,39063,41.002505,-83.668389,Ohio,OH,Hancock,63,5
...,...,...,...,...,...,...,...,...,...
UT2401695,2022-02-22,49037,37.627630,-109.803371,Utah,UT,San Juan,3916,47
UT2401696,2022-02-22,49039,39.372319,-111.575868,Utah,UT,Sanpete,0,0
UT2401697,2022-02-22,,37.854472,-111.441876,Utah,UT,Southwest Utah,62930,585
VT2401698,2022-02-22,50023,44.273432,-72.616050,Vermont,VT,Washington,8427,44


# Conclusion before handling missing values

In [69]:
# Save data
raw_data.to_csv("../../data/covid19_usa.csv")