# Import Libraries

In [1]:
import pandas as pd
from datetime import datetime
import plotly.express as px
import matplotlib.pyplot as plt
import scipy.stats  as stats
import numpy as np

# Part 1

### Data Manipulation and Aggregation

In [2]:
# Load the county-level COVID data set.
main = pd.read_csv('../data/covid_large.csv')

# Parse the 'YYYY-MM-DD' date strings in one vectorized call instead of
# invoking datetime.strptime once per row.
main['Date'] = pd.to_datetime(main['Date'], format='%Y-%m-%d')

# Keep Georgia rows only, and discard the rows with countyFIPS == 0 that this
# notebook treats as empty.
is_georgia_county = (main['countyFIPS'] != 0) & (main['State'] == 'GA')
main = main[is_georgia_county]

# Add the county case counts together to get one statewide total per date.
# Only the column that is kept gets aggregated, so no other column (text
# columns included) is summed just to be thrown away afterwards.
main = main.groupby('Date', as_index=False)['Num of Cases'].sum()

# Display the resulting Date / Num of Cases table.
main

Unnamed: 0,Date,Num of Cases
0,2020-01-22,0
1,2020-01-23,0
2,2020-01-24,0
3,2020-01-25,0
4,2020-01-26,0
...,...,...
232,2020-09-10,270138
233,2020-09-11,271810
234,2020-09-12,273843
235,2020-09-13,275249


In [None]:
# Line chart of the Georgia statewide case count against the date.
fig = px.line(
    data_frame=main,
    x="Date",
    y="Num of Cases",
    title='State Daily Trends ',
)

# Render the figure in the notebook.
fig.show()

In [None]:
# Density-normalized histogram of the Georgia statewide case totals.
georgia_case_totals = main['Num of Cases']
plt.hist(georgia_case_totals, density=True)

### Results

In [None]:
# Fit an exponential distribution to the case totals; the fit returns the
# location and scale parameters.
fitted_data = stats.expon.fit(main['Num of Cases'])
fit_loc, fit_scale = fitted_data

# Evaluate the fitted density at 100 evenly spaced points from 0 to 300000.
case_grid = np.linspace(start=0, stop=300000, num=100)
fitted_density = stats.expon.pdf(case_grid, loc=fit_loc, scale=fit_scale)

# Draw the density-normalized histogram of the observed case totals ...
plt.hist(main['Num of Cases'], density=True)

# ... and overlay the fitted exponential density on the same axes.
plt.plot(case_grid, fitted_density)

# Part 2

### Data Manipulation/Aggregation

In [None]:
# Reload the raw COVID file so Part 2 starts from the unfiltered data rather
# than the Georgia-only table built in Part 1.
covid_csv_path = '../data/covid_large.csv'
main = pd.read_csv(covid_csv_path)

In [None]:
# Parse the 'YYYY-MM-DD' date strings in one vectorized call instead of
# invoking datetime.strptime once per row.
main['Date'] = pd.to_datetime(main['Date'], format='%Y-%m-%d')

# States compared in this part of the analysis.
stateList = ['GA', 'IL', 'PA', 'OH', 'MI', 'NJ']

# Keep rows for the selected states, discarding the rows with countyFIPS == 0
# that this notebook treats as empty.
main = main[(main['countyFIPS'] != 0) & main['State'].isin(stateList)]

# Keep only the columns used below.
main = main[['State', 'Date', 'Num of Cases', 'Num of Deaths', 'population']]

# Add the county rows together to get one row per (date, state), then order
# the rows by state and, within a state, by date (ascending is the default).
main = (
    main.groupby(['Date', 'State'])
    .sum()
    .reset_index()
    .sort_values(['State', 'Date'])
)

# Display the aggregated table.
main

In [None]:
# New cases/deaths per day are the row-to-row differences of the case and
# death totals. The difference is taken within each state, so a state's first
# row is never compared against the previous state's last row.
# Relies on the rows already being ordered by date within each state.
per_state = main.groupby('State')
main = main.assign(**{
    'New Cases Per Day': per_state['Num of Cases'].diff(),
    'New Deaths Per Day': per_state['Num of Deaths'].diff(),
})

# Keep only the columns used below.
main = main[['Date', 'State', 'New Cases Per Day', 'New Deaths Per Day', 'population']]

# The first row of every state has no previous day and is therefore NaN; drop
# those rows by value instead of by a hard-coded start date.
main = main.dropna(subset=['New Cases Per Day', 'New Deaths Per Day'])

# Display the per-day table.
main

In [None]:
# Express the daily counts per 100000 residents. assign() builds a new frame,
# so nothing is written onto the filtered slice produced by the previous cell
# (which is what triggers pandas' SettingWithCopyWarning).
main = main.assign(**{
    'New Cases Per Day (Normalized)': (main['New Cases Per Day'] / main['population']) * 100000,
    'New Deaths Per Day (Normalized)': (main['New Deaths Per Day'] / main['population']) * 100000,
})

# Keep only the identifying columns and the normalized rates.
main = main[['Date', 'State', 'New Cases Per Day (Normalized)', 'New Deaths Per Day (Normalized)']]

# Display the normalized table.
main

In [None]:
# Average the normalized daily rates per state. The columns are selected with
# a list: selecting several groupby columns with a bare tuple
# (groupby(...)['a', 'b']) is rejected by pandas 2.0 and later.
main = main.groupby('State', as_index=False)[
    ['New Cases Per Day (Normalized)', 'New Deaths Per Day (Normalized)']
].mean()

# Display one row per state.
main

### Results

In [None]:
# Poisson model of the new cases per day (per 100000 residents) for each state.
# The rates are hard-coded; presumably they were copied from the per-state
# means table computed above -- re-check them if the input data changes.
case_rates = {
    "Georgia": 11.025809,
    "Illinois": 8.783703,
    "Michigan": 5.019988,
    "New Jersey": 9.381419,
    "Ohio": 5.019851,
    "Pennsylvania": 4.801392,
}

# A seeded generator makes the simulated samples, and therefore the plots,
# identical on every Restart & Run All.
rng = np.random.RandomState(0)

# One histogram of 1000 simulated draws per state.
for state_name, rate in case_rates.items():
    draws = stats.poisson.rvs(mu=rate, size=1000, random_state=rng)
    fig, ax = plt.subplots()
    ax.hist(draws)
    ax.set_title(state_name)
    ax.set_xlabel("Simulated new cases per day (per 100000)")
    ax.set_ylabel("Number of draws")
    plt.show()

In [None]:
# Poisson model of the new deaths per day (per 100000 residents) for each state.
# The rates are hard-coded; presumably they were copied from the per-state
# means table computed above -- re-check them if the input data changes.
death_rates = {
    "Georgia": 0.247674,
    "Illinois": 0.278009,
    "Michigan": 0.290297,
    "New Jersey": 0.764861,
    "Ohio": 0.160152,
    "Pennsylvania": 0.260453,
}

# A seeded generator makes the simulated samples, and therefore the plots,
# identical on every Restart & Run All.
rng = np.random.RandomState(0)

# One histogram of 1000 simulated draws per state.
for state_name, rate in death_rates.items():
    draws = stats.poisson.rvs(mu=rate, size=1000, random_state=rng)
    fig, ax = plt.subplots()
    ax.hist(draws)
    ax.set_title(state_name)
    ax.set_xlabel("Simulated new deaths per day (per 100000)")
    ax.set_ylabel("Number of draws")
    plt.show()

# Part 3

### Data Manipulation/Aggregation

In [None]:
# Reload the raw COVID file so Part 3 starts from the unfiltered data rather
# than the per-state summary left in `main` by Part 2.
covid_csv_path = '../data/covid_large.csv'
main = pd.read_csv(covid_csv_path)

In [None]:
# Parse the 'YYYY-MM-DD' date strings in one vectorized call instead of
# invoking datetime.strptime once per row.
main['Date'] = pd.to_datetime(main['Date'], format='%Y-%m-%d')

# Keep North Carolina rows only, discarding the rows with countyFIPS == 0
# that this notebook treats as empty.
main = main[(main['countyFIPS'] != 0) & (main['State'] == 'NC')]

# Keep the columns used below and order the rows by county and, within a
# county, by date (ascending is the default).
main = main[
    ['County Name', 'Date', 'Num of Cases', 'Num of Deaths', 'population']
].sort_values(['County Name', 'Date'])

# Display the county-level table.
main

In [None]:
# New cases/deaths per day are the row-to-row differences of the case and
# death totals. The difference is taken within each county, so a county's
# first row is never compared against the previous county's last row.
# Relies on the rows already being ordered by date within each county.
per_county = main.groupby('County Name')
main = main.assign(**{
    'New Cases Per Day': per_county['Num of Cases'].diff(),
    'New Deaths Per Day': per_county['Num of Deaths'].diff(),
})

# Keep only the columns used below.
main = main[['Date', 'County Name', 'New Cases Per Day', 'New Deaths Per Day', 'population']]

# The first row of every county has no previous day and is therefore NaN;
# drop those rows by value instead of by a hard-coded start date.
main = main.dropna(subset=['New Cases Per Day', 'New Deaths Per Day'])

# Display the per-day table.
main

In [None]:
# Express the daily counts per 100000 residents. assign() builds a new frame,
# so nothing is written onto the filtered slice produced by the previous cell
# (which is what triggers pandas' SettingWithCopyWarning).
main = main.assign(**{
    'New Cases Per Day (Normalized)': (main['New Cases Per Day'] / main['population']) * 100000,
    'New Deaths Per Day (Normalized)': (main['New Deaths Per Day'] / main['population']) * 100000,
})

# Keep only the identifying columns and the normalized rates.
main = main[['Date', 'County Name', 'New Cases Per Day (Normalized)', 'New Deaths Per Day (Normalized)']]

# Display the normalized table.
main

In [None]:
# Average the normalized daily rates per county. The columns are selected with
# a list: selecting several groupby columns with a bare tuple
# (groupby(...)['a', 'b']) is rejected by pandas 2.0 and later.
main = main.groupby('County Name', as_index=False)[
    ['New Cases Per Day (Normalized)', 'New Deaths Per Day (Normalized)']
].mean()

# Display one row per county.
main

### Results

In [None]:
# Poisson model of the new cases per day (per 100000 residents) for five
# North Carolina counties. The rates are hard-coded; presumably they were
# copied from the per-county means table computed above -- re-check them if
# the input data changes.
county_case_rates = {
    "Alamance County": 8.931579,
    "Alexander County": 5.322460,
    "Alleghany County": 8.256187,
    "Anson County": 8.839961,
    "Ashe County": 3.753948,
}

# A seeded generator makes the simulated samples, and therefore the plots,
# identical on every Restart & Run All.
rng = np.random.RandomState(0)

# One histogram of 1000 simulated draws per county.
for county_name, rate in county_case_rates.items():
    draws = stats.poisson.rvs(mu=rate, size=1000, random_state=rng)
    fig, ax = plt.subplots()
    ax.hist(draws)
    ax.set_title(county_name)
    ax.set_xlabel("Simulated new cases per day (per 100000)")
    ax.set_ylabel("Number of draws")
    plt.show()

In [None]:
# Poisson model of the new deaths per day (per 100000 residents) for five
# North Carolina counties. The rates are hard-coded; presumably they were
# copied from the per-county means table computed above -- re-check them if
# the input data changes.
county_death_rates = {
    "Alamance County": 0.119988,
    "Alexander County": 0.033901,
    "Alleghany County": 0.000000,  # a rate of zero makes every draw 0
    "Anson County": 0.052000,
    "Ashe County": 0.015577,
}

# A seeded generator makes the simulated samples, and therefore the plots,
# identical on every Restart & Run All.
rng = np.random.RandomState(0)

# One histogram of 1000 simulated draws per county.
for county_name, rate in county_death_rates.items():
    draws = stats.poisson.rvs(mu=rate, size=1000, random_state=rng)
    fig, ax = plt.subplots()
    ax.hist(draws)
    ax.set_title(county_name)
    ax.set_xlabel("Simulated new deaths per day (per 100000)")
    ax.set_ylabel("Number of draws")
    plt.show()

# Part 4

### Data Manipulation/Aggregation

In [None]:
# Load the housing data set used for Part 4.
housing_csv_path = '../data/housing_merge.csv'
main = pd.read_csv(housing_csv_path)

In [None]:
# Parse the 'YYYY-MM-DD' date strings in one vectorized call instead of
# invoking datetime.strptime once per row.
main['Date'] = pd.to_datetime(main['Date'], format='%Y-%m-%d')

# Same six states as in Part 2.
stateList = ['GA', 'IL', 'PA', 'OH', 'MI', 'NJ']

# Keep rows for the selected states, discarding the rows with countyFIPS == 0
# that this notebook treats as empty.
main = main[(main['countyFIPS'] != 0) & main['State'].isin(stateList)]

# Display the filtered table.
main

In [None]:
# Narrow the frame to the state, the date and the four housing measures.
housing_columns = [
    'State',
    'Date',
    'Owner-Occupied Unit',
    'Renter-Occupied Unit',
    'Average Household Size of Owner-Occupied Unit',
    'Average Household Size of Renter-Occupied Unit',
]
main = main.loc[:, housing_columns]

In [None]:
# Summarize the housing measures per state: unit counts are summed and
# average household sizes are averaged. The aggregation mapping itself picks
# the columns, which avoids selecting several groupby columns with a bare
# tuple (groupby(...)['a', 'b']) -- pandas 2.0 and later reject that form.
# NOTE(review): the frame still carries a Date column; if the file holds one
# row per county per date, the sums count every county once per date --
# confirm against the layout of housing_merge.csv.
main = main.groupby('State', as_index=False).agg({
    'Owner-Occupied Unit': 'sum',
    'Renter-Occupied Unit': 'sum',
    'Average Household Size of Owner-Occupied Unit': 'mean',
    'Average Household Size of Renter-Occupied Unit': 'mean',
})

# Display one row per state.
main