<h2>COVID-19 Data Analysis</h2>

In [1]:
import sqlite3
import numpy as np
import pandas as pd

<p>Data file is in CSV format.</p>
<p>File size is 9.21 GB.</p>
<p>Because the file is too large to handle in one piece, I first count its lines below and then split it into several smaller CSV files.</p>

In [2]:
# Open the raw CSV in text mode so the next cell can iterate over it to count lines.
# NOTE(review): opened without a `with` block, so the handle stays open until it
# is closed explicitly or garbage-collected.
f = open("COVID-19_Case_Surveillance_Public_Use_Data_with_Geography.csv", "r")

In [3]:
# Walk the file once to count its lines without loading it into memory.
# enumerate() starts at 0, so `count` ends up as the zero-based index of the
# last line (total number of lines minus one).
# The handle is exhausted after this loop and the split step re-opens the file
# itself, so close it here rather than leaving it open for the whole session.
try:
    for count, line in enumerate(f):
        pass
finally:
    f.close()

In [4]:
# `count` is the zero-based index of the last line read, i.e. the number of
# lines in the file minus one.
print(count)

71387132


In [5]:
# Maximum number of data rows (header line excluded) written to each split file.
chunk_size = 20000000

In [6]:
def write_chunk(part, lines):
    """Write one split file named ``data_part_<part>.csv``.

    The file starts with the notebook-level ``header`` string (assigned by the
    cell that reads the source CSV), followed by every string in ``lines``
    written as-is, with no separator added between them.

    Args:
        part: Value used as the part number in the output file name.
        lines: Iterable of strings to write after the header.
    """
    out_name = "data_part_%s.csv" % (part,)
    with open(out_name, "w") as chunk_file:
        chunk_file.write(header)
        chunk_file.writelines(lines)

In [7]:
# Split the raw CSV into parts of at most `chunk_size` data rows each.
# The first line is kept as `header`, which write_chunk() repeats at the top of
# every part.
with open("COVID-19_Case_Surveillance_Public_Use_Data_with_Geography.csv", "r") as f:
    header = f.readline()
    lines = []
    count = 0  # stays 0 if the file has no data rows
    for count, line in enumerate(f, start=1):
        lines.append(line)
        if count % chunk_size != 0:
            continue
        # Buffer is full: flush it as part number count // chunk_size.
        write_chunk(count // chunk_size, lines)
        lines = []
    # Flush whatever is left as the last, shorter part.
    if lines:
        write_chunk((count // chunk_size) + 1, lines)

In [8]:
# When the cells run top to bottom, `f` here is the handle bound by the `with`
# statement in the previous cell, which that statement already closed on exit;
# closing it again is harmless.
f.close()

<h3>Using SQL to store large dataset</h3>
<ul>
    <li>Read each CSV part into a DataFrame</li>
    <li>Then append each part to a table in a SQLite database</li>
</ul>

In [9]:
# Open a connection to the SQLite database file and create a cursor on it.
# If the .db file does not exist, it will be created during connection.
connection = sqlite3.connect('covid_large_dataset.db')
cursor = connection.cursor()

In [10]:
# Create table in database, if it does not exist.
# `id` is the integer primary key; it is filled from the DataFrame index when
# the CSV parts are appended (to_sql(..., index_label='id')).
# Of the remaining 19 columns, the two *_interval columns are declared INTEGER
# and all others TEXT.
command1 = """CREATE TABLE IF NOT EXISTS covid_data(id INTEGER PRIMARY KEY, case_month TEXT, res_state TEXT, 
            state_fips_code TEXT, res_county TEXT, county_fips_code TEXT, age_group TEXT, sex TEXT, race TEXT, 
            ethnicity TEXT, case_positive_specimen_interval INTEGER, case_onset_interval INTEGER, process TEXT, 
            exposure_yn TEXT, current_status TEXT, symptom_status TEXT, hosp_yn TEXT, icu_yn TEXT, death_yn TEXT, 
            underlying_conditions_yn TEXT)"""
cursor.execute(command1)
connection.commit()

In [11]:
# Read in first CSV part and export to SQL table in database.
# The DataFrame's default 0-based index is written to the `id` column.
# NOTE(review): read_csv infers the dtypes here; the df.info() output shown for
# part 4 lists state_fips_code as int64 and county_fips_code as float64 although
# the table declares both TEXT. If leading zeros in FIPS codes matter, consider
# passing dtype=str for those columns — TODO confirm.
df = pd.read_csv("data_part_1.csv", low_memory=False)
df.to_sql('covid_data', connection, if_exists='append', index_label='id')
connection.commit()

In [12]:
# Read in second CSV part and export to SQL table in database
df = pd.read_csv("data_part_2.csv", low_memory=False)

# The first CSV part had an index (id) of range 0 to 19,999,999.
# Therefore this part's ids must start at 20,000,000; otherwise the export
# fails with a unique index error on the `id` primary key.
# The end of the range is derived from the number of rows actually read, so the
# assignment cannot fail with a length mismatch if the part does not hold
# exactly 20,000,000 rows.
df.index = range(20000000, 20000000 + len(df))

# Export data to SQL table in database
df.to_sql('covid_data', connection, if_exists='append', index_label='id')
connection.commit()

In [13]:
# Read in third CSV part, reindex so its ids start at 40,000,000,
# and export data to SQL table in database.
df = pd.read_csv("data_part_3.csv", low_memory=False)

# Derive the end of the id range from the rows actually read instead of
# hard-coding it, so a part that is not exactly 20,000,000 rows long does not
# raise a length-mismatch error.
df.index = range(40000000, 40000000 + len(df))
df.to_sql('covid_data', connection, if_exists='append', index_label='id')
connection.commit()

In [14]:
# Read in fourth (last) CSV part.
# df.info() reports the number of rows and the dtypes pandas inferred.
df = pd.read_csv("data_part_4.csv", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11387132 entries, 0 to 11387131
Data columns (total 19 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   case_month                       object 
 1   res_state                        object 
 2   state_fips_code                  int64  
 3   res_county                       object 
 4   county_fips_code                 float64
 5   age_group                        object 
 6   sex                              object 
 7   race                             object 
 8   ethnicity                        object 
 9   case_positive_specimen_interval  float64
 10  case_onset_interval              float64
 11  process                          object 
 12  exposure_yn                      object 
 13  current_status                   object 
 14  symptom_status                   object 
 15  hosp_yn                          object 
 16  icu_yn                           object 
 17  death_

In [15]:
# Then, reindex so the ids start at 60,000,000,
# and export data to SQL table in database.
# The end of the range comes from len(df) rather than a row count copied by
# hand from the df.info() output, so it stays correct if the input changes.
df.index = range(60000000, 60000000 + len(df))
df.to_sql('covid_data', connection, if_exists='append', index_label='id')
connection.commit()

In [16]:
# Close connection to database.
# Every append above is followed by connection.commit(), so no pending
# writes are discarded here.
connection.close()

<h3>Open SQL database and read in data to DataFrame for data analysis</h3>

In [3]:
# Re-open a connection to the SQLite database file and create a cursor on it.
connection = sqlite3.connect('covid_large_dataset.db')
cursor = connection.cursor()

In [18]:
# SQL command to read data from table in database.
# Only rows with death_yn = 'Yes' are selected, and only a subset of the
# table's columns is fetched.
command1 = """SELECT id,
                     case_month,
                     res_state,
                     age_group,
                     sex,
                     race,
                     ethnicity,
                     case_positive_specimen_interval,
                     case_onset_interval,
                     death_yn
                FROM covid_data
               WHERE death_yn = 'Yes';"""

# Execute command and read into DataFrame; `id` becomes the DataFrame index.
df = pd.read_sql(sql=command1, con=connection, index_col="id")

In [19]:
# Summarise the deaths DataFrame: row count, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416870 entries, 678 to 71385121
Data columns (total 9 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   case_month                       416870 non-null  object 
 1   res_state                        416870 non-null  object 
 2   age_group                        415933 non-null  object 
 3   sex                              415205 non-null  object 
 4   race                             396902 non-null  object 
 5   ethnicity                        396191 non-null  object 
 6   case_positive_specimen_interval  153374 non-null  float64
 7   case_onset_interval              160721 non-null  float64
 8   death_yn                         416870 non-null  object 
dtypes: float64(2), object(7)
memory usage: 31.8+ MB


In [20]:
# Number of death records per state: group the deaths DataFrame by
# `res_state` and count the rows in each group.
deaths_by_state = df.groupby(by="res_state").size()
print(deaths_by_state)

res_state
AK       81
AL     5100
AR     4105
AZ    21502
CA    62033
CO     5022
CT     4673
DC      676
FL    43165
GA      836
IA      907
ID     1894
IL    21804
IN     5995
KS     3804
KY     4723
LA     2472
MA    14369
MD     2821
ME      409
MI    12943
MN     6452
MO     7610
MS     1145
MT     1259
NC     5012
ND      989
NH      779
NJ    19647
NM     2123
NV     8938
NY    40393
OH    21943
OK     3825
OR     1324
PA    22002
PR     3493
RI      594
SC     4999
TN     9049
TX    19467
UT     1292
VA     5220
VT       32
WA     4460
WI     5143
WY      346
dtype: int64


In [21]:
# Export the deaths DataFrame to a csv file for use later.
# The `id` index is written as the first column.
df.to_csv("covid-data-with-deaths.csv")

<h2>Data Analysis using SQLite3</h2>
<p>Count the number of cases per state for each month (<code>YYYY-MM</code>).</p>

In [7]:
# Query: number of rows per (case_month, res_state) pair, skipping rows where
# either field is NULL, ordered by month and then by state.
# NOTE(review): GROUP BY already yields one row per (case_month, res_state),
# so the DISTINCT keyword looks redundant here.
command1 =  """
            SELECT DISTINCT
                case_month,
                res_state,
                count(res_state) AS state_total
            FROM
                covid_data
            WHERE
                case_month IS NOT NULL AND res_state IS NOT NULL
            GROUP BY
                case_month, res_state
            ORDER BY
                case_month, res_state;
            """

In [3]:
# Open a connection to the SQLite database file and create a cursor on it.
# NOTE(review): this rebinds `connection` without closing the connection opened
# in the previous analysis section — presumably a re-run after a kernel
# restart; confirm, and close the old connection if both can be live.
connection = sqlite3.connect('covid_large_dataset.db')
cursor = connection.cursor()

In [8]:
# Run the per-month, per-state count query and load the result into `df`
# (this replaces the deaths DataFrame previously held in `df`).
df = pd.read_sql(sql=command1, con=connection)

In [9]:
# Preview the first five rows of the per-month, per-state case counts.
df.head()

Unnamed: 0,case_month,res_state,state_total
0,2020-01,AL,103
1,2020-01,AR,17
2,2020-01,AZ,63
3,2020-01,CA,389
4,2020-01,CO,85


In [10]:
# Check the row count and dtypes of the aggregated counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   case_month   1459 non-null   object
 1   res_state    1459 non-null   object
 2   state_total  1459 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 34.3+ KB


In [11]:
# Save the per-month, per-state counts (long format) to CSV.
df.to_csv('covid-cases-by-date-and-state.csv')

In [13]:
# Count how many result rows each month has; the index of this Series holds
# the distinct `case_month` values.
dates = df['case_month'].value_counts()

In [14]:
# Count how many result rows each state has; the index of this Series holds
# the distinct `res_state` values.
states = df['res_state'].value_counts()

In [18]:
# Sanity check: confirm what kind of object value_counts() returned.
type(dates)

pandas.core.series.Series

In [23]:
# Order the months by their label (the Series index).
dates = dates.sort_index()

In [24]:
# Order the states by their label (the Series index).
states = states.sort_index()

In [29]:
# Distinct months as a plain Python list, used as the pivot table's columns.
dates_list = dates.index.tolist()

In [30]:
# Distinct states as a plain Python list, used as the pivot table's rows.
states_list = states.index.tolist()
    

In [76]:

# Pivot the long (case_month, res_state, state_total) rows into a wide table:
# one row per state, one column per month. Combinations that do not occur in
# `df` keep the initial NaN.
df2 = pd.DataFrame(index=states_list, columns=dates_list)

# Set each value with a single label-based .at[row, column] assignment.
# Chained indexing (df2[date][state] = total) assigns through an intermediate
# Series; pandas documents that as unreliable, and it has no effect at all
# when Copy-on-Write is enabled.
for row in df.itertuples(index=False):
    df2.at[row.res_state, row.case_month] = row.state_total

In [74]:
# Spot-check a single cell of the pivot table with a label-based lookup.
# Assigning a test value here (e.g. 50 via chained indexing) would overwrite
# the real AL / 2020-02 count whenever the notebook is run top to bottom, so
# this cell only reads the value.
df2.at["AL", "2020-02"]

In [77]:
# Show the first ten states of the pivot table (rows: states, columns: months).
df2.head(10)

Unnamed: 0,2020-01,2020-02,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,...,2021-07,2021-08,2021-09,2021-10,2021-11,2021-12,2022-01,2022-02,2022-03,2022-04
AK,,,239,137,146,614,2293,2104,3068,9364,...,4894,15419,24202,21036,11623,8073,58396,19442,7902,3166
AL,103.0,59.0,2677,6208,11427,21971,49743,37556,27988,37943,...,37925,121030,89555,31418,15298,63126,319806,53397,13493,2845
AR,17.0,18.0,1156,2742,5354,16458,21307,18628,21805,28965,...,41636,68418,37395,15696,17457,46391,218663,25396,4102,1249
AZ,63.0,57.0,2679,7243,16190,71744,86892,22845,16826,33418,...,39168,91751,77778,75752,110413,130755,497079,74776,17380,6436
CA,389.0,489.0,19293,46525,71232,174261,282735,151891,101378,124944,...,203821,415072,252182,179665,165165,775039,2788757,353757,86106,51474
CO,85.0,98.0,6793,12128,9287,7091,14650,9801,14372,43317,...,19182,42400,55573,74866,86277,107273,328452,53974,29089,18544
CT,17.0,36.0,3766,9257,3629,29767,4007,3505,4564,16341,...,11613,16520,19748,68,9905,154,454,27702,33940,13
DC,,36.0,1232,3305,2175,566,4361,712,633,798,...,1125,4915,5730,2942,2571,6263,55741,4871,1868,2458
DE,,,300,4242,4384,1929,3134,2397,2921,4272,...,1723,8550,13325,10494,9228,29794,63792,7991,2854,2077
FL,132.0,290.0,16081,20817,21705,81505,135395,149549,111662,129422,...,162415,323131,293037,184574,47145,107853,525740,380416,146759,22975


In [79]:
# NOTE(review): this writes `df` (the long per-month, per-state counts already
# saved to 'covid-cases-by-date-and-state.csv'), not the pivoted `df2`.
# If 'final.csv' is meant to hold the pivot table, this should presumably be
# df2.to_csv('final.csv') — TODO confirm.
df.to_csv('final.csv')

In [43]:
# Preview the first five rows of the pivot table.
df2.head()

Unnamed: 0,2020-01,2020-02,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,...,2021-07,2021-08,2021-09,2021-10,2021-11,2021-12,2022-01,2022-02,2022-03,2022-04
0,103,59,239,137,146,614,2293,2104,3068,9364,...,4894,15419,24202,21036,11623,8073,58396,19442,7902,3166
1,17,18,2677,6208,11427,21971,49743,37556,27988,37943,...,37925,121030,89555,31418,15298,63126,319806,53397,13493,2845
2,63,57,1156,2742,5354,16458,21307,18628,21805,28965,...,41636,68418,37395,15696,17457,46391,218663,25396,4102,1249
3,389,489,2679,7243,16190,71744,86892,22845,16826,33418,...,39168,91751,77778,75752,110413,130755,497079,74776,17380,6436
4,85,98,19293,46525,71232,174261,282735,151891,101378,124944,...,203821,415072,252182,179665,165165,775039,2788757,353757,86106,51474


In [44]:
# Save the pivot table (states x months) to a draft CSV file.
df2.to_csv("draft-table-1.csv")

In [48]:
# Number of distinct `res_state` values found in the query result.
len(states_list)

54

In [49]:
# One-column DataFrame listing the state codes under the header "STATES".
df3 = pd.DataFrame({"STATES": states_list})

In [50]:
# Preview the first five state codes.
df3.head()

Unnamed: 0,STATES
0,AK
1,AL
2,AR
3,AZ
4,CA
