In [1]:
#import dependencies
import pandas as pd
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt

#presumably the seed intended for numpy.random (imported above as npr);
#no cell visible here passes it to a generator yet -- TODO confirm it is applied before any sampling
rseed = 7

In [19]:
#read in dataframes
#both paths are relative, so the CSV files must sit in the notebook's working directory
#oh_covid_df: per-zip-code covid summary table; dists_df: zip-to-zip distance table
oh_covid_df = pd.read_csv(r'OH_COVIDSummaryDataZIP_11_29_20___1.csv')
dists_df = pd.read_csv(r'distances.csv')

In [20]:
#list the column labels of the covid dataframe and print them
oh_covid_cols = list(oh_covid_df.columns)
print(oh_covid_cols)

['Zip Code ', 'Population ', 'Case Count - Cumulative', 'Case Count - Last 30 Days ', 'Case Count - Last 14 Days ', 'Case Count Per 100K - Cumulative', 'Case Count Per 100K - Last 30 Days', 'Case Count Per 100K - Last 14 Days']


In [12]:
#list the column labels of the zip code distance dataframe and print them
dists_cols = list(dists_df.columns)
print(dists_cols)

['Zip1', 'Zip2', 'Distance(miles)']


In [13]:
#convert both dataframes to numpy arrays, then report (rows, columns) of each on its own line
oh_covid = oh_covid_df.to_numpy()
dists = dists_df.to_numpy()
print(oh_covid.shape, dists.shape, sep='\n')

(1189, 7)
(2244004, 3)


In [14]:
#show the first five rows of the covid data array
oh_covid[:5]

array([[4.3001e+04, 8.5000e+01, 3.7000e+01, 1.9000e+01, 3.0249e+03,
        1.3167e+03, 6.7620e+02],
       [4.3002e+04,        nan,        nan,        nan,        nan,
               nan,        nan],
       [4.3003e+04, 5.2000e+01, 3.4000e+01, 1.9000e+01, 1.7304e+03,
        1.1314e+03, 6.3230e+02],
       [4.3004e+04, 1.3620e+03, 4.9400e+02, 2.1200e+02, 5.2106e+03,
        1.8899e+03, 8.1100e+02],
       [4.3005e+04,        nan,        nan,        nan,        nan,
               nan,        nan]])

In [15]:
#show the first five rows of the zip code distance array
dists[:5]

array([[45883.        , 45883.        ,     0.        ],
       [45883.        , 44331.        ,   156.78305048],
       [45883.        , 44615.        ,   187.33630467],
       [45883.        , 44665.        ,   194.30870068],
       [45883.        , 43837.        ,   165.59264031]])

In [16]:
#scratch check: filter the distance rows for one zip code, then order them by distance
#TO BE DELETED BEFORE SUBMISSION
temp = dists[dists[:, 0] == 45883]
temp_sorted = temp[np.argsort(temp[:, 2])]
print(temp.shape)
print(temp[:5])
print(temp_sorted[:5, 2])

(1498, 3)
[[45883.         45883.             0.        ]
 [45883.         44331.           156.78305048]
 [45883.         44615.           187.33630467]
 [45883.         44665.           194.30870068]
 [45883.         43837.           165.59264031]]
[0.         0.99789907 4.29792141 4.64255669 5.66799433]


In [24]:
def nn_regression(target_zip, dist_data, source_data, n_vals):
    """Accumulate distance-weighted sums of source features over a zip code's nearest neighbors.

    Parameters
    ----------
    target_zip : scalar
        Zip code matched against column 0 of ``dist_data``.
    dist_data : ndarray
        Rows of (zip1, zip2, distance); columns 0, 1 and 2 are read.
    source_data : ndarray
        Column 0 holds the zip code, every remaining column is a feature.
        When a zip code appears more than once, its first row is used.
    n_vals : int or sequence of int
        Neighbor counts to evaluate.

    Returns
    -------
    ndarray of shape (source_data.shape[1] - 1, 6, number of n_vals)
        ``totals[x, y, i]`` is the sum, over the ``n_vals[i]`` nearest usable
        neighbors, of feature ``x`` times weighting ``y``.  With ``d`` the neighbor
        distance and mean/std taken over all of the target's neighbor distances,
        the weightings are, in order: ``1/d``, ``exp(-d**2)``, ``mean/d``,
        ``exp(-(d/mean)**2)``, ``std/d``, ``exp(-(d/std)**2)``.
        The sums are not divided by the total weight.
        All zeros when the target has no neighbor rows.

    Neighbors that are missing from ``source_data`` or whose feature sum is nan are
    skipped and do not count toward ``n_vals``.
    """
    #accept a bare int as well as a list/array of neighbor counts
    n_arr = np.atleast_1d(n_vals)
    totals = np.zeros((source_data.shape[1] - 1, 6, n_arr.size))

    #limit zip code distances to only those that match the target zip code, sorted ascending
    dist_holder = dist_data[dist_data[:, 0] == target_zip]
    dist_sorted = dist_holder[dist_holder[:, 2].argsort()]

    #remove the first line of dist_sorted if it was the zip code itself
    if dist_sorted.shape[0] > 0 and dist_sorted[0, 2] < .1:
        dist_sorted = dist_sorted[1:, :]

    #nothing to accumulate without neighbors or without requested neighbor counts
    if dist_sorted.shape[0] == 0 or n_arr.size == 0:
        return totals

    #one row of six weights per neighbor, in the order listed in the docstring
    neighbor_dists = dist_sorted[:, 2]
    dist_mean = np.mean(neighbor_dists)
    dist_std = np.std(neighbor_dists)
    weights = np.zeros((dist_sorted.shape[0], 6))
    weights[:, 0] = 1/neighbor_dists
    weights[:, 1] = np.exp(-1*np.square(neighbor_dists))
    weights[:, 2] = dist_mean/neighbor_dists
    weights[:, 3] = np.exp(-1*np.square(neighbor_dists/dist_mean))
    weights[:, 4] = dist_std/neighbor_dists
    weights[:, 5] = np.exp(-1*np.square(neighbor_dists/dist_std))

    #map each source zip code to its first row so the lookup per neighbor is constant time
    first_row = {}
    for row, zip_code in enumerate(source_data[:, 0].tolist()):
        first_row.setdefault(zip_code, row)

    #since dist is sorted, only need to go as deep as the max n
    depth = np.amax(n_arr)

    #count of usable neighbors so far; must persist across iterations so that each
    #n only receives its n nearest neighbors and the loop can stop at the max n
    n_found = 0
    for neighbor_zip, neighbor_weights in zip(dist_sorted[:, 1].tolist(), weights):
        row = first_row.get(neighbor_zip)
        if row is None:
            continue
        features = source_data[row, 1:]
        #tolerate nan data: a neighbor with any nan feature is skipped entirely
        if np.isnan(np.sum(features)):
            continue
        n_found += 1

        #add this neighbor's feature x weight products to every n that still has room for it
        within = n_found <= n_arr
        totals[:, :, within] += np.outer(features, neighbor_weights)[:, :, np.newaxis]

        if n_found >= depth:
            break
    return totals