## NYT Covid Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the county-level table; per the preview below, each row is one
# (date, county) record carrying case and death counts.
# The path is relative, so the CSV must sit in the working directory.
df = pd.read_csv('us-counties.csv')

In [3]:
df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0


In [4]:
df.shape

(1082715, 6)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082715 entries, 0 to 1082714
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   date    1082715 non-null  object 
 1   county  1082715 non-null  object 
 2   state   1082715 non-null  object 
 3   fips    1072769 non-null  float64
 4   cases   1082715 non-null  int64  
 5   deaths  1059196 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 49.6+ MB


## Creating New Dataset for Capstone

In [6]:
# First pass: keep every New York *state* row (all of its counties).
# The frame is narrowed to New York City by the county filter further down.
nyc = df.loc[df['state'] == 'New York']

In [7]:
nyc.head()

Unnamed: 0,date,county,state,fips,cases,deaths
416,2020-03-01,New York City,New York,,1,0.0
448,2020-03-02,New York City,New York,,1,0.0
482,2020-03-03,New York City,New York,,2,0.0
518,2020-03-04,New York City,New York,,2,0.0
519,2020-03-04,Westchester,New York,36119.0,9,0.0


In [8]:
nyc.shape

(20346, 6)

In [9]:
# Narrow the state-level rows to the entries whose county is "New York City".
nyc = nyc.loc[nyc['county'] == 'New York City']

In [10]:
nyc.head()

Unnamed: 0,date,county,state,fips,cases,deaths
416,2020-03-01,New York City,New York,,1,0.0
448,2020-03-02,New York City,New York,,1,0.0
482,2020-03-03,New York City,New York,,2,0.0
518,2020-03-04,New York City,New York,,2,0.0
565,2020-03-05,New York City,New York,,4,0.0


In [11]:
# After the two filters state and county hold a single value each, and fips
# is blank in the previewed rows, so only date, cases and deaths are kept.
nyc = nyc.drop(columns=['state', 'fips', 'county'])

In [12]:
nyc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 367 entries, 416 to 1081343
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    367 non-null    object 
 1   cases   367 non-null    int64  
 2   deaths  367 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 11.5+ KB


In [13]:
nyc.tail()

Unnamed: 0,date,cases,deaths
1068360,2021-02-26,716678,29173.0
1071606,2021-02-27,720951,29253.0
1074852,2021-02-28,725155,29332.0
1078097,2021-03-01,728859,29408.0
1081343,2021-03-02,732557,29473.0


### Setting Datetime

In [14]:
# The dates load as plain strings (object dtype); parse them into
# datetime64 values so they can be used as real timestamps.
nyc = nyc.assign(date=pd.to_datetime(nyc['date']))

In [15]:
nyc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 367 entries, 416 to 1081343
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    367 non-null    datetime64[ns]
 1   cases   367 non-null    int64         
 2   deaths  367 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 11.5 KB


### Converting cumulative cases and deaths to daily counts

In [16]:
# cases is a running total, so the row-over-row difference gives the daily
# count. diff() follows row order, so this relies on the rows being in date
# order (as the previews show). The first row has no predecessor and is NaN.
nyc = nyc.assign(new_cases=nyc['cases'].diff())

In [17]:
# deaths is a running total as well; take the row-over-row difference to get
# the daily count. Relies on date-ordered rows; the first row comes out NaN.
nyc = nyc.assign(new_deaths=nyc['deaths'].diff())

In [18]:
nyc.head(50)

Unnamed: 0,date,cases,deaths,new_cases,new_deaths
416,2020-03-01,1,0.0,,
448,2020-03-02,1,0.0,0.0,0.0
482,2020-03-03,2,0.0,1.0,0.0
518,2020-03-04,2,0.0,0.0,0.0
565,2020-03-05,4,0.0,2.0,0.0
627,2020-03-06,5,0.0,1.0,0.0
715,2020-03-07,12,0.0,7.0,0.0
820,2020-03-08,14,0.0,2.0,0.0
947,2020-03-09,20,0.0,6.0,0.0
1098,2020-03-10,37,0.0,17.0,0.0


In [19]:
# Keep only the daily figures; the cumulative columns are no longer needed.
# Rebinding to the returned frame (rather than inplace=True) avoids mutating
# the object that earlier cells displayed.
nyc = nyc.drop(columns=['cases', 'deaths'])

In [20]:
nyc.head()

Unnamed: 0,date,new_cases,new_deaths
416,2020-03-01,,
448,2020-03-02,0.0,0.0
482,2020-03-03,1.0,0.0
518,2020-03-04,0.0,0.0
565,2020-03-05,2.0,0.0


In [21]:
# NOTE(review): set_index returns a new frame and the result is not assigned,
# so this cell only displays a date-indexed view. nyc itself keeps its integer
# index and its date column, which the CSV export below depends on because it
# writes with index=False.
nyc.set_index('date')

Unnamed: 0_level_0,new_cases,new_deaths
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-01,,
2020-03-02,0.0,0.0
2020-03-03,1.0,0.0
2020-03-04,0.0,0.0
2020-03-05,2.0,0.0
...,...,...
2021-02-26,4289.0,85.0
2021-02-27,4273.0,80.0
2021-02-28,4204.0,79.0
2021-03-01,3704.0,76.0


### Moving the date back by 2 weeks

In [22]:
# Save the daily NYC figures. A relative path is used (matching how the input
# CSV is read) instead of a user-specific absolute Windows path, so the cell
# runs on any machine; the file lands in the current working directory.
# index=False omits the leftover integer row labels; date is a regular column
# and is written out.
nyc.to_csv('nyccases.csv', index=False, header=True)

In [23]:
nyc.head()

Unnamed: 0,date,new_cases,new_deaths
416,2020-03-01,,
448,2020-03-02,0.0,0.0
482,2020-03-03,1.0,0.0
518,2020-03-04,0.0,0.0
565,2020-03-05,2.0,0.0
