Using the VA state data to fit a distribution to the number of COVID-19 new cases

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import scipy.stats as stats
from matplotlib import pyplot as plt



Graphically plot the distribution and describe the distribution statistics. If using discrete values, calculate the Probability Mass Function for the individual values or range (if using histogram) and plot that

In [2]:
data = pd.read_csv("../../../data/stage_1/long_large_covid.csv")
data

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,Date,Cases,Death,Population
0,0,Statewide Unallocated,AL,1,2020-01-22,0,0,0
1,0,Statewide Unallocated,AL,1,2020-01-23,0,0,0
2,0,Statewide Unallocated,AL,1,2020-01-24,0,0,0
3,0,Statewide Unallocated,AL,1,2020-01-25,0,0,0
4,0,Statewide Unallocated,AL,1,2020-01-26,0,0,0
...,...,...,...,...,...,...,...,...
1829584,56045,Weston County,WY,56,2021-08-12,700,6,6927
1829585,56045,Weston County,WY,56,2021-08-13,700,6,6927
1829586,56045,Weston County,WY,56,2021-08-14,705,6,6927
1829587,56045,Weston County,WY,56,2021-08-15,705,6,6927


In [3]:
#Removing the statewide unallocated data as it cannot be useful for county wise statistics
data= data[data['County Name'] != 'Statewide Unallocated']
data

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,Date,Cases,Death,Population
573,1001,Autauga County,AL,1,2020-01-22,0,0,55869
574,1001,Autauga County,AL,1,2020-01-23,0,0,55869
575,1001,Autauga County,AL,1,2020-01-24,0,0,55869
576,1001,Autauga County,AL,1,2020-01-25,0,0,55869
577,1001,Autauga County,AL,1,2020-01-26,0,0,55869
...,...,...,...,...,...,...,...,...
1829584,56045,Weston County,WY,56,2021-08-12,700,6,6927
1829585,56045,Weston County,WY,56,2021-08-13,700,6,6927
1829586,56045,Weston County,WY,56,2021-08-14,705,6,6927
1829587,56045,Weston County,WY,56,2021-08-15,705,6,6927


In [4]:
data['Date'] = data.Date.apply(lambda d: datetime.strptime(d, '%Y-%m-%d'))

In [5]:
# choosing the VA state
VA_state = data[data['State'] == "VA"]
VA_state.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,Date,Cases,Death,Population
1642791,51001,Accomack County,VA,51,2020-01-22,0,0,32316
1642792,51001,Accomack County,VA,51,2020-01-23,0,0,32316
1642793,51001,Accomack County,VA,51,2020-01-24,0,0,32316
1642794,51001,Accomack County,VA,51,2020-01-25,0,0,32316
1642795,51001,Accomack County,VA,51,2020-01-26,0,0,32316


In [6]:
VA_state['New Cases Per Day'] = VA_state['Cases'].diff()
VA_state['New Deaths Per Day'] = VA_state['Death'].diff()
VA_state

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,Date,Cases,Death,Population,New Cases Per Day,New Deaths Per Day
1642791,51001,Accomack County,VA,51,2020-01-22,0,0,32316,,
1642792,51001,Accomack County,VA,51,2020-01-23,0,0,32316,0.0,0.0
1642793,51001,Accomack County,VA,51,2020-01-24,0,0,32316,0.0,0.0
1642794,51001,Accomack County,VA,51,2020-01-25,0,0,32316,0.0,0.0
1642795,51001,Accomack County,VA,51,2020-01-26,0,0,32316,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1718995,51840,City of Winchester,VA,51,2021-08-12,3041,49,28078,11.0,0.0
1718996,51840,City of Winchester,VA,51,2021-08-13,3054,49,28078,13.0,0.0
1718997,51840,City of Winchester,VA,51,2021-08-14,3054,49,28078,0.0,0.0
1718998,51840,City of Winchester,VA,51,2021-08-15,3054,49,28078,0.0,0.0


In [7]:
VA_state =VA_state[VA_state.Date != '2020-01-22']

In [8]:
VA_state

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,Date,Cases,Death,Population,New Cases Per Day,New Deaths Per Day
1642792,51001,Accomack County,VA,51,2020-01-23,0,0,32316,0.0,0.0
1642793,51001,Accomack County,VA,51,2020-01-24,0,0,32316,0.0,0.0
1642794,51001,Accomack County,VA,51,2020-01-25,0,0,32316,0.0,0.0
1642795,51001,Accomack County,VA,51,2020-01-26,0,0,32316,0.0,0.0
1642796,51001,Accomack County,VA,51,2020-01-27,0,0,32316,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1718995,51840,City of Winchester,VA,51,2021-08-12,3041,49,28078,11.0,0.0
1718996,51840,City of Winchester,VA,51,2021-08-13,3054,49,28078,13.0,0.0
1718997,51840,City of Winchester,VA,51,2021-08-14,3054,49,28078,0.0,0.0
1718998,51840,City of Winchester,VA,51,2021-08-15,3054,49,28078,0.0,0.0


In [9]:
# group by Date
VA_data = VA_state.groupby(["Date"]).sum().reset_index()
VA_data

Unnamed: 0,Date,countyFIPS,StateFIPS,Cases,Death,Population,New Cases Per Day,New Deaths Per Day
0,2020-01-23,6818111,6783,0,0,8535519,0.0,0.0
1,2020-01-24,6818111,6783,0,0,8535519,0.0,0.0
2,2020-01-25,6818111,6783,0,0,8535519,0.0,0.0
3,2020-01-26,6818111,6783,0,0,8535519,0.0,0.0
4,2020-01-27,6818111,6783,0,0,8535519,0.0,0.0
...,...,...,...,...,...,...,...,...
567,2021-08-12,6818111,6783,715556,11583,8535519,2361.0,2.0
568,2021-08-13,6818111,6783,717826,11599,8535519,2270.0,16.0
569,2021-08-14,6818111,6783,717826,11599,8535519,0.0,0.0
570,2021-08-15,6818111,6783,717826,11599,8535519,0.0,0.0


In [10]:
#Normalization with Population
VA_data['New Cases Per Day (Normalized)']= round((VA_data['New Cases Per Day']/VA_data['Population'])*100000,2)
VA_data['New Deaths Per Day (Normalized)']= round((VA_data['New Deaths Per Day']/VA_data['Population'])*100000,2)


In [11]:
VA_data_week = VA_data.groupby([ pd.Grouper(key='Date', freq='w')])['New Cases Per Day','New Deaths Per Day'].agg({'New Cases Per Day':'mean',
                                                                                                'New Deaths Per Day' :'mean'}).round().reset_index()
VA_data_week

Unnamed: 0,Date,New Cases Per Day,New Deaths Per Day
0,2020-01-26,0.0,0.0
1,2020-02-02,0.0,0.0
2,2020-02-09,0.0,0.0
3,2020-02-16,0.0,0.0
4,2020-02-23,0.0,0.0
...,...,...,...
78,2021-07-25,543.0,3.0
79,2021-08-01,869.0,5.0
80,2021-08-08,1469.0,4.0
81,2021-08-15,1880.0,6.0


In [12]:
# calculating mean
VA_mean = VA_data_week['New Cases Per Day'].mean()
VA_mean

1306.578313253012

In [13]:
VA_data_week['VA PMF'] = stats.poisson.pmf(k=VA_data_week['New Cases Per Day'].round(decimals=0), mu=VA_mean)
VA_data_week

Unnamed: 0,Date,New Cases Per Day,New Deaths Per Day,VA PMF
0,2020-01-26,0.0,0.0,0.000000e+00
1,2020-02-02,0.0,0.0,0.000000e+00
2,2020-02-09,0.0,0.0,0.000000e+00
3,2020-02-16,0.0,0.0,0.000000e+00
4,2020-02-23,0.0,0.0,0.000000e+00
...,...,...,...,...
78,2021-07-25,543.0,3.0,4.795512e-127
79,2021-08-01,869.0,5.0,1.016314e-38
80,2021-08-08,1469.0,4.0,6.370421e-07
81,2021-08-15,1880.0,6.0,8.233170e-51


In [14]:
fig1 = px.histogram(VA_data_week, x='New Cases Per Day',nbins=50,title='Histogram of Normalized number of new cases ')
fig1.write_image("../../../Images/member/Varsha/stage_3/fig1.png")
img_bytes = fig1.to_image(format="png")
Image(img_bytes)

NameError: name 'Image' is not defined

In [None]:
# Calculate the pmf values using the poission distribution and plot
fig = go.Figure()
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Histogram(x=VA_data_week['New Cases Per Day'],nbinsx=40,name="VA Normalized weekly Cases Histogram"))
fig.add_trace(go.Scatter(x=VA_data_week['New Cases Per Day'],y=VA_data_week['VA PMF'],mode='markers', name="Poisson Model"), secondary_y=True)

In [None]:
fig = px.scatter(VA_data_week, x=range(0,6000,300), y=stats.poisson.pmf(k=range(0,6000,300),mu=901),title='Poisson Distribution for Number of cases across VA')
fig.update_traces(mode='lines+markers')

Describe the type of distribution (modality) and its statistics (moments of a distribution - center, variance, skewness, kurtosis) in the report and the notebook

In [None]:
VA_data_week['New Cases Per Day'].mean()


In [None]:
VA_data_week['New Cases Per Day'].var()

In [None]:
VA_data_week['New Cases Per Day'].skew()

In [None]:
VA_data_week['New Cases Per Day'].kurt()

Compare the data against other states

In [None]:
states= data[data["State"].isin(["NC", "SC", "NY", "FL", "AL","VA"])]
states

In [None]:
states['New Cases Per Day'] = states['Cases'].diff()
states['New Deaths Per Day'] = states['Death'].diff()
states

In [None]:
# changing negatives:
num = states['New Cases Per Day']._get_numeric_data()
num[num < 0] = 0
num1 = states['New Deaths Per Day']._get_numeric_data()
num1[num1 < 0] = 0

In [None]:
states= states[states['Date'] != '2020-01-22']

In [None]:
states = states.groupby(["State","Date"]).sum().reset_index()
states

In [None]:
#Normalizing
states['New Cases Per Day Normalized']= (states['New Cases Per Day']/states['Population'])*100000
states['New Deaths Per Day Normalized']= (states['New Deaths Per Day']/states['Population'])*100000
states_1 = states
states

In [None]:
#Grouping by week and finding the mean
states_weekly = states.groupby(["State", pd.Grouper(key='Date', freq='w')])['New Cases Per Day Normalized','New Deaths Per Day Normalized'].agg({'New Cases Per Day Normalized':'mean',
 
                                                                                                                                             'New Deaths Per Day Normalized' :'mean'}).reset_index()
states_weekly

In [None]:
#Calculate the mean of the Num of cases per day normalized
compared_states_weekly_data_cases_stats=states_weekly.groupby('State')['New Cases Per Day Normalized'].agg('mean').reset_index().round(decimals=0)
compared_states_weekly_data_cases_stats

In [None]:
#Calculate the mean of the Num of Deaths per day normalized
compared_states_weekly_data_Deaths_stats=states_weekly.groupby('State')['New Deaths Per Day Normalized'].agg('mean').reset_index().round(decimals=0)
compared_states_weekly_data_Deaths_stats

In [None]:
# plotting the histogram for all 6 states
fig = px.histogram(states_weekly, x='New Cases Per Day Normalized',nbins=50,title='Histogram of Normalized number of new cases ')

In [None]:
fig.show()

In [None]:
# plotting the histogram for all 6 states
fig = px.histogram(states_weekly, x='New Deaths Per Day Normalized',nbins=50,title='Histogram of Normalized number of new Deaths ')

In [None]:
fig.show()

In [None]:
k_values = range(0,150,10)
states = compared_states_weekly_data_cases_stats['State']
means = compared_states_weekly_data_cases_stats['New Cases Per Day Normalized']
columns = ['state','mean','k_value','probability']

data_dict={}
for column in columns:
    data_dict[column]=[]
        
for idx, mean in enumerate(means):
    for k_value in k_values:
        data_dict['state'].append(states[idx])
        data_dict['mean'].append(mean)
        data_dict['k_value'].append(k_value)
        data_dict['probability'].append(stats.poisson.pmf(k=k_value,mu=mean))
poisson_data_cases=pd.DataFrame.from_dict(data_dict)

In [None]:
poisson_data_cases

In [None]:
#plot the poisson distribution cases
fig = px.scatter(poisson_data_cases, x="k_value", y="probability", color='state',title='Poisson Distribution for Number of cases across 6 states in US')
fig.update_traces(mode='lines+markers')

Model a poission distribution of new COVID-19 cases and deaths of a state and compare to other 5 states

In [None]:
k_values = range(0,50,10)
states = compared_states_weekly_data_Deaths_stats['State']
means = compared_states_weekly_data_Deaths_stats['New Deaths Per Day Normalized']
columns = ['state','mean','k_value','probability']

data_dict={}
for column in columns:
    data_dict[column]=[]
        
for idx, mean in enumerate(means):
    for k_value in k_values:
        data_dict['state'].append(states[idx])
        data_dict['mean'].append(mean)
        data_dict['k_value'].append(k_value)
        data_dict['probability'].append(stats.poisson.pmf(k=k_value,mu=mean))
poisson_data_deaths=pd.DataFrame.from_dict(data_dict)
poisson_data_deaths

In [None]:
#plot the poisson distribution for deaths
fig = px.scatter(poisson_data_deaths, x="k_value", y="probability", color='state',title='Poisson Distribution for Number of deaths across 6 states in US')
fig.update_traces(mode='lines+markers')

In [None]:
states_weekly['New Cases Per Day Normalized'].mean()


In [None]:
states_weekly['New Cases Per Day Normalized'].var()


In [None]:
states_weekly['New Cases Per Day Normalized'].skew()


In [None]:
states_weekly['New Cases Per Day Normalized'].kurt()



Perform corelation between Enrichment data valiables and COVID-19 cases to observe patterns

In [None]:
president_county_candidate = pd.read_csv("../../../data/stage_1/president_county_candidate.csv")
president_county_candidate

In [None]:
president_county_candidate['county'] = president_county_candidate['county'].apply(lambda cn: cn.strip())

In [None]:
#Rename the column names in the President County Candidate data set
president_county_candidate = president_county_candidate.rename(columns={'state':'State','county':'County Name'})
president_county_candidate['State'] = president_county_candidate['State'].replace(['Delaware','District of Columbia','Alabama','Alaska',
                               'Arizona','Arkansas','California','Colorado','Connecticut','Florida','Georgia','Hawaii',
                                'Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana','Maine','Maryland',
                                'Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska',
                                'Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina',
                                'North Dakota','Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina',
                                'South Dakota','Tennessee','Texas','Utah','Vermont','Virginia','Washington','West Virginia',
                                'Wisconsin','Wyoming'],['DE','DC','AL','AK','AZ','AR','CA','CO','CT','FL','GA','HI','ID',
                                'IL','IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ',
                                'NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WV',
                                'WI','WY'])
president_county_candidate

In [None]:
Population=pd.read_csv("../../../data/stage_1/covid_county_population_usafacts.csv")
Population

In [None]:
# group by state and choosing the six states data
Population = Population.groupby(['State']).sum().reset_index()
Population= Population[Population["State"].isin(["NC", "SC", "NY", "FL", "AL","VA"])].reset_index()
Population.head()

In [None]:
# taking the 6 states data from above task
states_1

In [None]:
# grouping by state 
states_1 = states_1.groupby(['State']).sum().reset_index()
states_1

In [None]:
# replacing the population with president_county_candiadate 
states_1['Population']=Population['population']

In [None]:
# removing unnessary columns
states_1 = states_1.drop(columns="countyFIPS")
states_1 = states_1.drop(columns="StateFIPS")
states_1

In [None]:
# grouping by state
president_county_candidate= president_county_candidate.groupby(['State']).sum().reset_index()
president_county_candidate.head()

In [None]:
# merging two datasets
merged_data = pd.merge(states_1, president_county_candidate, how='inner', on =['State'])
merged_data

In [None]:
# Normalizing with population
merged_data['total_votes']= (merged_data['total_votes']/merged_data['Population'])*100000
merged_data['won']= (merged_data['won']/merged_data['Population'])*100000
merged_data

Correlation between Enrichment data valiables and COVID-19 cases

In [None]:
merged_data['New Cases Per Day Normalized'].corr(merged_data['total_votes'])

In [None]:
merged_data['New Deaths Per Day Normalized'].corr(merged_data['total_votes'])

In [None]:
merged_data['New Cases Per Day Normalized'].corr(merged_data['won'])

In [None]:
merged_data['New Deaths Per Day Normalized'].corr(merged_data['won'])

From the above correlation, in all the six states, we observe that the correlation between total votes , i.e, the number of total votes in the sense that there was negative corralation with both cases and deaths. This shows that Voting doesn't increase the covid cases and deaths.