In [None]:
import csv
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import norm, truncnorm

In [None]:
# Configuration for the weekly download of response-rate snapshots.
day_interval = 7
geo_granularity = 'state'  # column of the raw snapshot that rows are grouped by
time_interval = datetime.timedelta(days = day_interval)
start_date = datetime.date(2020, 3, 23)
end_date = datetime.date(2020, 10, 17)
base_url = 'https://raw.githubusercontent.com/stuartlynn/census_2020_response_rates/master/data/raw/'


def _read_snapshot(url):
    """Read one snapshot CSV, skipping malformed lines.

    `on_bad_lines='skip'` is the current spelling of the option; pandas releases
    that predate it reject the keyword with a TypeError, in which case the older
    `error_bad_lines=False` spelling is used instead.
    """
    try:
        return pd.read_csv(url, on_bad_lines='skip')
    except TypeError:
        return pd.read_csv(url, error_bad_lines=False)


date = start_date
weekly_means = {}  # row label -> Series of mean CRRALL per geography

while date <= end_date:
    label = str(date)
    print(label)
    try:
        df = _read_snapshot(base_url + label + '.csv')
    except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as error:
        # A missing or unreadable snapshot is skipped (HTTP/URL errors are
        # OSError subclasses). Anything else, e.g. a programming error, is
        # allowed to surface instead of silently producing an empty table.
        print('  skipped: ' + repr(error))
    else:
        subset = df[["CRRALL", "CRRINT", "DRRALL", "DRRINT", "state", "RESP_DATE", "county", "tract"]]
        averages = subset.groupby(geo_granularity)['CRRALL'].mean()
        if not weekly_means:
            # Leading all-zero row so that the first successive difference
            # equals the first observed value.
            weekly_means['baseline'] = pd.Series(0, index=averages.keys())
        weekly_means[label] = averages
    date += time_interval

# Build the table once: one row per snapshot (labelled by date), one column per
# geography. This replaces repeated DataFrame.append calls, which copy the whole
# frame on every iteration and are not available in current pandas.
results = pd.DataFrame(weekly_means).T


In [None]:
# Show the assembled table, then the labels of its columns.
print(results)
print(results.columns)

In [None]:
# Read the three local CSV inputs consumed by the cleaning cell that follows.
economic_indicators = pd.read_csv('data/Economic_Indicators.csv')
social_indicators = pd.read_csv('data/Social_Indicators.csv')
geo_information = pd.read_csv('data/states.csv')


In [None]:
def _promote_first_row_to_header(frame, columns):
    """Select `columns` from `frame` and relabel them with the first data row.

    The first row of the selection supplies the new column labels and is then
    removed from the data. A copy is returned so that later assignments do not
    write into a view of `frame`.
    """
    selected = frame[columns]
    promoted = selected[1:].copy()
    promoted.columns = selected.iloc[0]
    return promoted


# Keep only the last two characters of each GEO_ID.
geo_information["GEO_ID"] = geo_information["GEO_ID"].str[-2:]

social_indicators_cleaned = _promote_first_row_to_header(
    social_indicators,
    ["NAME", "GEO_ID", "DP02_0001E", "DP02_0068PE", "DP02_0113PE", "DP02_0153PE"],
)
economic_indicators_cleaned = _promote_first_row_to_header(
    economic_indicators,
    ["NAME", "GEO_ID", "DP03_0062E", "DP03_0119PE"],
)

# After header promotion the NAME / GEO_ID columns carry the labels
# 'Geographic Area Name' / 'id', which are the join keys.
indicators = pd.merge(economic_indicators_cleaned, social_indicators_cleaned, on = ['Geographic Area Name', "id"])
indicators['id'] = indicators['id'].str[-2:]
# Drop the row whose id ends in "US"; .copy() makes the filtered frame
# independent so the assignments below do not raise chained-assignment warnings.
indicators = indicators.query('id != "US"').copy()
indicators['id'] = indicators['id'].astype(int)

# Every remaining column except the two identifiers is converted to float.
value_columns = [col for col in indicators.columns if col != 'id' and col != 'Geographic Area Name']
indicators = indicators.astype({col: float for col in value_columns})

indicators.to_csv('data/indicators.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)




In [None]:
# Sanity check: dtype of the `id` column after the cast in the cleaning cell.
indicators.dtypes['id']

In [None]:
def _fit_truncated_normal(sample, lower=0, upper=100):
    """Return (loc, scale) of a truncated-normal fit to `sample`.

    A plain normal fit supplies mu/std, from which the standardized bounds
    a = (lower - mu) / std and b = (upper - mu) / std are computed. Those
    bounds are held fixed (fa/fb) while truncnorm.fit re-estimates loc and
    scale; `truncnorm.fit` returns (a, b, loc, scale).

    NOTE(review): because a and b stay fixed while loc/scale move, the
    truncation limits expressed in data units shift with the fitted loc/scale
    and are no longer exactly `lower`/`upper` — confirm this is intended.
    """
    mu, std = norm.fit(sample)
    a, b = (lower - mu) / std, (upper - mu) / std
    params = truncnorm.fit(sample, fa=a, fb=b)
    return params[2], params[3]


first_period_weeks = 7  # the first 7 differences form period 1, the rest period 2

# Fit everything first and write the file afterwards, so a failing fit cannot
# leave a partially written CSV behind.
distributions = []
for geo in results.keys():
    geo_result = results[geo]
    differences = [j - i for i, j in zip(geo_result[: -1], geo_result[1 :])]
    dist_1 = differences[:first_period_weeks]
    dist_2 = differences[first_period_weeks:]

    # Each period is fitted with its own bounds (the original passed the
    # undefined names `a`/`b` for the first period).
    mu1, std1 = _fit_truncated_normal(dist_1)
    mu2, std2 = _fit_truncated_normal(dist_2)

    distributions.append([geo, mu1, std1, mu2, std2])

with open('data/distributions.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["state", "mu1", "std1", "mu2", "std2"])
    writer.writerows(distributions)



In [None]:
# `state_1` was never assigned in any live cell, so this cell raised NameError
# on a fresh kernel. Assumes `results` has a column labelled 1 — TODO confirm
# against the geography codes printed above.
state_1 = results[1]
# Successive differences of the series: value at row k+1 minus value at row k.
differences = [j - i for i, j in zip(state_1[: -1], state_1[1 :])]
print(len(differences))

In [None]:
def plot_dist(mu, std, dist, a, b, color, filename):
    """Plot a density histogram of `dist` with a truncated-normal PDF on top.

    Parameters
    ----------
    mu, std : float
        Location and scale passed to `truncnorm.pdf`; also shown in the title.
    dist : sequence of float
        Sample drawn as a 20-bin density histogram.
    a, b : float
        Standardized lower/upper truncation bounds passed to `truncnorm.pdf`.
    color : matplotlib color
        Used for both the histogram bars and the PDF line.
    filename : str or path-like (or anything `Figure.savefig` accepts)
        Where the figure is saved. For a path, missing parent directories are
        created first.
    """
    fig, ax = plt.subplots()
    ax.hist(dist, bins=20, density=True, alpha=0.6, color=color)

    # Evaluate the PDF across the x-range the histogram ended up with.
    xmin, xmax = ax.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    p = truncnorm.pdf(x, a, b, mu, std)
    ax.plot(x, p, color=color, linewidth=2)

    ax.set_title("Fit results: mu = %.2f,  std = %.2f" % (mu, std))
    ax.set_xlabel("Response Rate Increases")
    ax.set_ylabel("Density")

    # savefig does not create directories; do it here so a fresh checkout works.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(filename))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    fig.savefig(filename)
    plt.show()

    

In [None]:
#Everything below this point is just to test / experiments.

# Split the differences into the first 7 entries and the remainder.
dist_1 = differences[:7]
dist_2 = differences[7:]

#first 7 weeks
mu1, std1 = norm.fit(dist_1)
# Standardized truncation bounds for the range [0, 100] under this period's fit.
a1, b1 = (0 - mu1) / std1, (100 - mu1) / std1
# Use this period's own bounds; the original passed `a`/`b`, which are only
# assigned further down in this cell (NameError on a fresh kernel).
params1 = truncnorm.fit(dist_1, fa=a1, fb=b1)
plot_dist(mu1, std1, dist_1, a1, b1, '#ababab', "Distribution Plots/first_7")

#last 21 weeks
mu2, std2 = norm.fit(dist_2)
a2, b2 = (0 - mu2) / std2, (100 - mu2) / std2
params2 = truncnorm.fit(dist_2, fa=a2, fb=b2)
plot_dist(mu2, std2, dist_2, a2, b2, '#000000', "Distribution Plots/last_21")

# All weeks together, using the plain normal fit with bounds for [0, 100].
mu, std = norm.fit(differences)
a, b = (0 - mu) / std, (100 - mu) / std
plot_dist(mu, std, differences, a, b, 'k', "Distribution Plots/all_weeks")



# Plot the PDF.


In [None]:
    # Stacked density histogram of both periods with each period's
    # truncated-normal PDF drawn on top, saved as a single figure.
    # An explicit Axes is created here: the original called methods on an
    # `ax` that was never defined.
    os.makedirs('Distribution Plots', exist_ok=True)  # savefig does not create it
    fig, ax = plt.subplots()
    ax.hist([dist_1, dist_2], bins=20, density=True, stacked=True, alpha=0.5, color=['#ababab', '#000000'])

    # Evaluate both PDFs across the x-range chosen by the histogram.
    xmin, xmax = ax.get_xlim()
    x = np.linspace(xmin, xmax, 1000)
    p1 = truncnorm.pdf(x, a1, b1, mu1, std1)
    p2 = truncnorm.pdf(x, a2, b2, mu2, std2)
    ax.plot(x, p1, color='#ababab', linewidth=2, label = "First 7 Weeks Truncated Fit")
    ax.plot(x, p2, color='#000000', linewidth=2, label = "Last 21 Weeks Truncated Fit")

    ax.set_xlabel("Response Rate Increases")
    ax.set_ylabel("Density")
    ax.legend()  # picks up the two labelled PDF lines
    fig.savefig('Distribution Plots/two_distributions.png')
    plt.show()


In [None]:
def get_fit(data, degree):
    """Least-squares polynomial fit of `data` against its positional index.

    Returns a pair: the coefficient array produced by `np.polyfit` for a
    polynomial of the given `degree`, and the list of x positions
    `[0, 1, ..., len(data) - 1]` the fit was computed on.
    Prints "completed" once the fit has been computed.
    """
    positions = [*range(len(data))]
    coefficients = np.polyfit(positions, data, degree)
    print("completed")
    return coefficients, positions

In [None]:
def plot_fit(x, fit, data):
    """Show the polynomial with coefficients `fit` as a line over a scatter of `data`.

    `x` supplies the horizontal positions for both the curve and the points.
    """
    fitted_values = np.polyval(fit, x)
    plt.plot(x, fitted_values)
    plt.scatter(x, data)
    plt.xlabel('Week Since Start')
    plt.ylabel('Cumulative Response')
    plt.show()


In [None]:
# Below was my attempt to fit kernelized linear regressions (it works, but the fits go negative,
# so it is not very good for our purposes)
# state_1 = results[1]
# print(state_1)
# fit, x = get_fit(state_1, 2)
# plot_fit(x, fit, state_1)

# Design matrix with columns [constant, week index, log(week index)].
x = np.asarray(range(len(differences)))
# np.log with `where=` only writes the entries where the mask is True. Without
# an explicit `out`, the masked entry (x == 0) is left as uninitialised memory,
# so the regressor would differ from run to run. Pre-filling with zeros makes
# the log term for week 0 a defined 0.0.
log_x = np.log(x, out=np.zeros(x.shape, dtype=float), where=x>0.1)
new_col = np.reshape(log_x, (x.shape[0],1))
x = sm.add_constant(x)
X = np.append(x, new_col, 1)
print(X)


In [None]:
# Degree-2 polynomial through the differences, drawn over the data points.
diff_fit, x = get_fit(data=differences, degree=2)
plot_fit(x=x, fit=diff_fit, data=differences)

In [None]:
# Ordinary least squares of the differences on the design matrix X.
# Note: this rebinds `results`, the name the download cell uses for the
# response-rate table, so that table is no longer reachable afterwards.
results = sm.OLS(endog=differences, exog=X).fit()

In [None]:
# Text summary of the fitted model, then the fit plotted against regressor column 1.
ols_summary = results.summary()
print(ols_summary)
fig = sm.graphics.plot_fit(results, exog_idx=1)

In [None]:
# Read back the CSV written by the indicator-cleaning cell.
test = pd.read_csv("data/indicators.csv")

In [None]:
# Plain-text dump of the frame read back from data/indicators.csv.
print(test)


In [None]:
# Number of columns in the frame read back from data/indicators.csv.
test.shape[1]