In [1]:
import pandas as pd
import numpy as np

# Import the database and add date columns

In [2]:
# Load the wrangled mortality CSV with 'date' parsed as datetime, derive
# year / month / day columns from it, then use 'date' as the index while also
# keeping it as a regular column (drop=False).
df = (
    pd.read_csv('mortality.csv', parse_dates=['date'])
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        day=lambda frame: frame['date'].dt.day,
    )
    .set_index('date', drop=False)
)

df.head()

FileNotFoundError: File b'mortality.csv' does not exist

In [None]:
import altair as alt
alt.renderers.enable('notebook')

# Base chart: one line per year, with every year laid over the same
# January-December axis (monthdate drops the year part of the date).
chart = alt.Chart(df).properties(width=800).mark_line().encode(
    alt.X('monthdate(date):T', axis=alt.Axis(title='Month', format='%B')),
    alt.Color('year(date):O', legend=alt.Legend(title='Year')),
    # Order the lines by year. The field is `date`; the previous `data` was a
    # typo that pointed the order channel at a column that does not exist.
    alt.Order('year(date):O')
).interactive()

# Add the y channel last so the base chart can be reused with other measures.
chart.encode(alt.Y('deaths:Q', axis=alt.Axis(title='Deaths')))

# Making a dictionary from Teralytics estimates and adding it to our data set


In [None]:
# Population figures keyed by ISO date string. The first group holds one value
# per year (dated July 2nd); the second holds the points sampled by the author
# from the Teralytics estimates.
yearly_population = {
    '2010-07-02': 3721525,
    '2011-07-02': 3678732,
    '2012-07-02': 3634488,
    '2013-07-02': 3593077,
    '2014-07-02': 3534874,
    '2015-07-02': 3473177,
    '2016-07-02': 3406520,
    '2017-07-02': 3337177,
}
teralytics_population = {
    '2017-09-19': 3337000,
    '2017-10-15': 3237000,
    '2017-11-15': 3202000,
    '2017-12-15': 3200000,
    '2018-01-15': 3223000,
    '2018-02-15': 3278000,
}
samples = {**yearly_population, **teralytics_population}

# Turn the dictionary into a dataframe so it can be added to the original
# data frame: the index is the (parsed) date and the single column is est_pop,
# stored as integers.
pdf = pd.DataFrame(
    {'est_pop': list(samples.values())},
    index=pd.to_datetime(list(samples.keys())),
).astype({'est_pop': int})

# Attach est_pop to the daily data by matching on the date index (left join:
# only dates already present in df receive a value, the rest stay NaN).
# Dropping a stale est_pop first keeps the cell re-runnable; a second plain
# join would fail with "columns overlap but no suffix specified".
df = df.drop(columns=['est_pop'], errors='ignore').join(pdf)

In [None]:
# Scatter of the estimated-population points, with both axes restricted to the
# Sept 2017 - Mar 2018 window and the 3.1M - 3.4M population band.
alt.Chart(df).mark_circle(size=50).encode(
    x=alt.X('date:T', scale=alt.Scale(domain=('2017-09-01', '2018-03-01'))),
    y=alt.Y('est_pop', scale=alt.Scale(domain=(3_100_000, 3_400_000))),
).interactive()

# Linear interpolation of estimated population

In [None]:
# Fill the gaps between the known population points by linear interpolation,
# i.e. by drawing straight lines between neighbouring known values.
# limit_direction='both' also fills rows before the first and after the last
# known point.
# NOTE(review): method='linear' treats rows as equally spaced; that matches
# calendar time only if df has exactly one row per day - confirm.
df['est_pop_lerped'] = df['est_pop'].interpolate(
    method='linear', limit_direction='both'
)

# TO GRAPH: the interpolated population over time
alt.Chart(df).mark_line().encode(
    x=alt.X('date:T'),
    y=alt.Y('est_pop_lerped', scale=alt.Scale(domain=(3_190_000, 3_510_000))),
).interactive()

# Calculating Rate

In [None]:
# Yearly median death rate, using only the days prior to Sept 20th as the
# article prescribes.
#
# The cut-off is written as (month, day) instead of a day-of-year number:
# Sept 20th is day 263 only in non-leap years (it is day 264 in leap years such
# as 2016), so the former `dayofyear < 263` test also discarded Sept 19th in
# leap years.
date_col = df['date']
before_sept_20 = (date_col.dt.month < 9) | (
    (date_col.dt.month == 9) & (date_col.dt.day < 20)
)

# .copy() makes rdf an independent frame, so adding a column below neither
# touches df nor triggers pandas' SettingWithCopyWarning.
rdf = df[before_sept_20].copy()

# calc_rate = deaths / population * 365 * 1000
# (an annualised rate per 1,000 people, assuming `deaths` is a daily count)
rdf['calc_rate'] = rdf.deaths / rdf.est_pop_lerped * 365 * 1000

# Median of every numeric column per year; calc_rate is the column of interest.
# numeric_only=True keeps non-numeric columns (e.g. the datetime `date`) out of
# the aggregation.
rdf.groupby('year').median(numeric_only=True)

# Calculating yearly rates

In [None]:
# Yearly death rates as given by the original website.
yearly_rates = {2015: 8.197106, 2016: 8.387468, 2017: 8.750266}
# 2018 is assumed to be the same as 2017 (see the yearly medians computed above)
yearly_rates[2018] = yearly_rates[2017]

# yrdf = yearly rates data frame: indexed by year, single column 'year_rate',
# ready to be added to the main dataframe.
yrdf = pd.Series(yearly_rates, name='year_rate').to_frame()
# Look up each row's yearly rate by its 'year' value (years missing from yrdf
# get NaN, exactly as the left join did). Using assign + map instead of join
# keeps the cell re-runnable: a second join would fail because 'year_rate'
# already exists in df.
df = df.assign(year_rate=df['year'].map(yrdf['year_rate']))

# Calculate day rate

In [None]:
# Death rate for every row of the full dataset (no date filter this time):
# deaths / interpolated population, scaled by 365 * 1000.
deaths_per_capita = df['deaths'] / df['est_pop_lerped']
df['day_rate'] = deaths_per_capita * 365 * 1000

# Clean up population columns

In [None]:
# Keep a single population column: discard the raw est_pop values and expose
# the interpolated series under the shorter name "pop".
# NOTE(review): a column called "pop" must be read as df['pop']; df.pop is the
# DataFrame.pop method.
df = df.drop(columns='est_pop').rename(columns={"est_pop_lerped": "pop"})
df.head()

# Calculate day rate and average rate

In [None]:
# Baseline rows: everything before 2017, leaving out Feb 29th so that every
# (month, day) pair exists in each remaining year.
adf = df.query('year < 2017 and not (month == 2 and day == 29)')

# For each calendar day, average how far the daily rate sits from that year's
# rate. This is a plain groupby mean on a derived column: the former
# groupby(as_index=False).apply(...) + .to_frame() depended on apply returning
# a Series, whereas on current pandas it returns a DataFrame and .to_frame fails.
ardf = (
    adf.assign(avg_rate=adf.day_rate - adf.year_rate)
    .groupby(['month', 'day'], as_index=False)['avg_rate']
    .mean()
)

# Bring the per-calendar-day average back onto every baseline row.
ajdf = adf.merge(ardf, on=['month', 'day'])

adf = ajdf
adf.plot(x='date', y='avg_rate', style='.')

# Average rate smoothed

In [None]:
# Smooth the per-day average rate with LOWESS so the graph is less jagged.
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

# return_sorted=False makes lowess hand back only the fitted values, in the
# same row order as the inputs, so they can be stored directly as a column.
smoothed_values = lowess(
    endog=adf['avg_rate'],
    exog=adf['date'],
    frac=0.15,
    it=3,
    return_sorted=False,
)

# smdf is a second name for adf (not a copy): the new column appears on both.
smdf = adf
smdf['avg_rate_smoothed'] = smoothed_values
smdf.plot(x='date', y=['avg_rate', 'avg_rate_smoothed'])

# Calculate trend

In [None]:
# Attach the per-day average and smoothed average rates to the full data set
# (left join: rows of df without a match keep NaN in the new columns).
#
# df carries 'date' both as its index name and as a column, and current pandas
# refuses to merge on such an ambiguous key ("'date' is both an index level and
# a column label"). The left frame's index is therefore dropped first; the
# merge result gets a fresh index regardless, and the 'date' column is kept.
full = pd.merge(
    df.reset_index(drop=True),
    smdf[['date', 'avg_rate', 'avg_rate_smoothed']],
    on='date',
    how='left',
)

# Restore a datetime index (the 'date' column is kept as well).
full = full.set_index(pd.to_datetime(full.date))

# Renaming: the smoothed average becomes "trend", the daily rate becomes "rate".
full = full.rename({"avg_rate_smoothed": "trend", "day_rate": "rate"}, axis=1)

# TODO: remove this
full.plot(x='date', y=['year_rate', 'rate', 'trend'])

# Annual trend

In [None]:
# Extract the 2015 portion of the trend as a one-column dataframe and plot it.
# TODO: still open - how to copy this annual trend onto 2017-18?
in_2015 = full["date"].dt.year == 2015
ann_trend = full.loc[in_2015, "trend"].to_frame()
ann_trend.plot()