In [1]:
#import dependencies
import pandas as pd
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

rseed = 7 

In [2]:
#read in dataframes
oh_covid_df = pd.read_csv(r'OH_COVIDSummaryDataZIP_11_29_20___1.csv')
dists_df = pd.read_csv(r'distances.csv')

In [3]:
#show columns of covid data
oh_covid_cols = oh_covid_df.columns.tolist()
print(oh_covid_cols)

['Zip Code ', 'Population ', 'Case Count - Cumulative', 'Case Count - Last 30 Days ', 'Case Count - Last 14 Days ', 'Case Count Per 100K - Cumulative', 'Case Count Per 100K - Last 30 Days', 'Case Count Per 100K - Last 14 Days']


In [4]:
#show columns of zip code distance data
dists_cols = dists_df.columns.tolist()
print(dists_cols)

['Zip1', 'Zip2', 'Distance(miles)']


In [5]:
#build numpy arrays and display shapes
oh_covid = oh_covid_df.to_numpy()
dists = dists_df.to_numpy()
print(oh_covid.shape)
print(dists.shape)

(1189, 8)
(2244004, 3)


In [6]:
neighbor_vals = np.array([1,2,5,10,25,100])

In [7]:
#display head of covid data array
oh_covid[0:5,:]

array([[4.3001e+04, 2.8100e+03, 8.5000e+01, 3.7000e+01, 1.9000e+01,
        3.0249e+03, 1.3167e+03, 6.7620e+02],
       [4.3002e+04, 3.7440e+03,        nan,        nan,        nan,
               nan,        nan,        nan],
       [4.3003e+04, 3.0050e+03, 5.2000e+01, 3.4000e+01, 1.9000e+01,
        1.7304e+03, 1.1314e+03, 6.3230e+02],
       [4.3004e+04, 2.6139e+04, 1.3620e+03, 4.9400e+02, 2.1200e+02,
        5.2106e+03, 1.8899e+03, 8.1100e+02],
       [4.3005e+04, 2.0100e+02,        nan,        nan,        nan,
               nan,        nan,        nan]])

In [8]:
#display head of zip code dist array
dists[0:5,:]

array([[45883.        , 45883.        ,     0.        ],
       [45883.        , 44331.        ,   156.78305048],
       [45883.        , 44615.        ,   187.33630467],
       [45883.        , 44665.        ,   194.30870068],
       [45883.        , 43837.        ,   165.59264031]])

In [9]:
oh_covid_train, oh_covid_test = train_test_split(oh_covid,test_size = .25,random_state = rseed,shuffle = True)
print(oh_covid_train.shape)
print(oh_covid_train[0:2,:])
print(oh_covid_test.shape)
print(oh_covid_test[0:2,:])

(891, 8)
[[4.5885e+04 1.2741e+04 7.6100e+02 3.6700e+02 1.0600e+02 5.9728e+03
  2.8805e+03 8.3200e+02]
 [4.5352e+04 2.4200e+02 7.0000e+00        nan        nan 2.8926e+03
         nan        nan]]
(298, 8)
[[4.4685e+04 2.7842e+04 8.2400e+02 4.5600e+02 2.3500e+02 2.9596e+03
  1.6378e+03 8.4400e+02]
 [4.5123e+04 8.7360e+03 1.9600e+02 8.1000e+01 3.5000e+01 2.2436e+03
  9.2720e+02 4.0060e+02]]


In [10]:
#sorting test code
#TO BE DELETED BEFORE SUBMISSION
temp = dists[dists[:,0] == 45883,:];
temp_sorted = temp[temp[:,2].argsort()]
print(temp.shape)
print(temp[0:5])
print(temp_sorted[0:5,2])

(1498, 3)
[[45883.         45883.             0.        ]
 [45883.         44331.           156.78305048]
 [45883.         44615.           187.33630467]
 [45883.         44665.           194.30870068]
 [45883.         43837.           165.59264031]]
[0.         0.99789907 4.29792141 4.64255669 5.66799433]


In [11]:
#do nn_regression for a target zip code given copies of the zip code distances, covid data, 
# and number of neighbors to consider
# done for both quasi-normalized distance and raw distance
# returns nn_regression estimates done for each of the |source_data.cols-1| features
# in |n_vals|x6 estimates ways for each number of neighbors and distance type
# so result is an |source_data.cols-1|x6x|n_vals| array
def nn_regression(target_zip, dist_data, source_data, n_vals):
    #list zip code distances to only those that match the target zip code
    dist_holder = np.copy(dist_data[dist_data[:,0] == target_zip])
    
    #sort the distances by distance, ascending 
    dist_sorted = dist_holder[dist_holder[:,2].argsort()]
    
    #remove the first line of dist_sorted if it was the zip code itself
    dist_sorted[dist_sorted[:,2] < .1] = 10000000
    
    #build dist_vals to contain source zip codes, inverse distance, exponential decay distance, 
        #mean normalized inverse distance, mean normalized exponential decay distance, 
        #std normalized inverse distance, std normalized exponential decay distance
    dShape = dist_sorted.shape
    dist_vals = np.zeros((dist_sorted.shape[0], 7))
    dist_vals[:,0] = dist_sorted[:,1]
    dist_vals[:,1] = 1/dist_sorted[:,2]
    dist_vals[:,2] = np.exp(-1*np.square(dist_sorted[:,2]))
    dist_mean = np.mean(dist_sorted[:,2])
    dist_vals[:,3] = dist_mean/dist_sorted[:,2]
    dist_vals[:,4] = np.exp(-1*np.square(dist_sorted[:,2]/dist_mean))
    dist_std = np.std(dist_sorted[:,2])
    dist_vals[:,5] = dist_std/dist_sorted[:,2]
    dist_vals[:,6] = np.exp(-1*np.square(dist_sorted[:,2]/dist_std))
    
    #build total matrix to account for each option of dist_vals, source_vals, and n
    totals = np.zeros((source_data.shape[1]-1,dist_vals.shape[1]-1,np.size(n_vals)))
    weights = np.zeros((source_data.shape[1]-1,dist_vals.shape[1]-1,np.size(n_vals)))
    
    #since dist is sorted, only need to go as deep as the max n
    depth = np.amax(n_vals)
    
    #consider as many nearest neighbors as the max of the n options, but tolerate nan data
    for k in range(dShape[0]):
        #build holder for source data
        t = np.zeros(source_data.shape[1]-1)
        #no guarantee a match has been found yet
        t_found = False
        n_found = 0
        
        #go through source data to find matching source id (zip code)
        for j in range(source_data.shape[0]):
            #if match, assign to t and breack
            if source_data[j,0] == dist_vals[k,0]:
                t = source_data[j,:]
                t_found = True
                break
        #only proceed if source data found and does not contain nans
        if t_found and not np.isnan(np.sum(t)):
            #update found count
            n_found = n_found + 1
            
            #loop through n neighbor values given to find
            for i in range(np.size(n_vals)):
                #only if within limit for that neighbor class, proceed
                if n_found <= n_vals[i]:
                    #update the totals for the corresponding source and distance features
                    for x in range(source_data.shape[1]-1):
                        for y in range(dist_vals.shape[1]-1):
                            totals[x,y,i] = totals[x,y,i] + dist_vals[k,y+1]*t[x+1]
                            weights[x,y,i] = weights[x,y,i] + dist_vals[k,y+1]
        if n_found >= depth:
            break
    totals = np.divide(totals, weights)
    prediction = np.copy(totals)
    return prediction;

In [12]:
temp = nn_regression(45215, dists, oh_covid, neighbor_vals)
print(temp.shape)
print(oh_covid_test[0:2,0])

(7, 6, 6)
[44685. 45123.]


In [13]:
print(temp[:,:,5])

[[15539.01339563 11312.37583494 15539.01339563 12140.34176647
  15539.01339563 12140.21361973]
 [  579.41263172   409.32626059   579.41263172   421.26428711
    579.41263172   421.23891828]
 [  236.70732814   151.18273292   236.70732814   185.67718538
    236.70732814   185.67230783]
 [  106.09597085    74.29465426   106.09597085    84.2331595
    106.09597085    84.23255856]
 [ 3560.98335967  3603.90361071  3560.98335967  3286.68080182
   3560.98335967  3286.5162878 ]
 [ 1532.76688565  1374.50644574  1532.76688565  1564.20918067
   1532.76688565  1564.16218914]
 [  688.58231637   673.44999453   688.58231637   707.06246673
    688.58231637   707.0525439 ]]


In [14]:
def assess_error(train_data, test_data, dist_data, n_vals):
    testShape = test_data.shape
    err_rates = np.zeros((test_data.shape[1]-1,6,np.size(n_vals)))
    err_tot = np.zeros((test_data.shape[1]-1,6,np.size(n_vals)))
    r = 0
    for k in range(testShape[0]):
        if not np.isnan(np.sum(test_data[k,:])):
            r += 1
            sample = test_data[k,:]
            preds = nn_regression(sample[0], dist_data, train_data, n_vals)
            for i in range(np.size(n_vals)):
                for j in range(6):
                    err_tot[:,j,i] = err_tot[:,j,i] + np.divide(np.abs(np.subtract(preds[:,j,i],sample[1:])),sample[1:])
    err_rates = err_tot/r
    return err_rates;

In [15]:
oh_covid_err_rates = assess_error(oh_covid_train, oh_covid_test, dists, neighbor_vals)

In [16]:
print(oh_covid_err_rates[:,:,5])

[[3.88033877 2.05040497 3.88033877 3.96986114 3.88033877 3.96984101]
 [4.38267697 3.20055594 4.38267697 4.39719428 4.38267697 4.39718424]
 [3.48354375 2.11500889 3.48354375 3.53339594 3.48354375 3.5334063 ]
 [3.38599159 1.96566299 3.38599159 3.44540378 3.38599159 3.44540947]
 [0.29624693 0.31343003 0.29624693 0.34235531 0.29624693 0.34237185]
 [0.29667051 0.3075416  0.29667051 0.33123491 0.29667051 0.33124261]
 [0.35537633 0.41743317 0.35537633 0.36694988 0.35537633 0.36695165]]


In [17]:
detailed_neighbors = np.array(range(25))+1

In [18]:
oh_covid_err_rates_detailed = assess_error(oh_covid_train, oh_covid_test, dists, detailed_neighbors)

In [19]:
print(oh_covid_err_rates_detailed[:,:,5])

[[3.88033877 2.05040497 3.88033877 3.96986114 3.88033877 3.96984101]
 [4.38267697 3.20055594 4.38267697 4.39719428 4.38267697 4.39718424]
 [3.48354375 2.11500889 3.48354375 3.53339594 3.48354375 3.5334063 ]
 [3.38599159 1.96566299 3.38599159 3.44540378 3.38599159 3.44540947]
 [0.29624693 0.31343003 0.29624693 0.34235531 0.29624693 0.34237185]
 [0.29667051 0.3075416  0.29667051 0.33123491 0.29667051 0.33124261]
 [0.35537633 0.41743317 0.35537633 0.36694988 0.35537633 0.36695165]]


In [21]:
for i in range(np.size(neighbor_vals)):
    name_str = "oh_covid_err_rates_" + str(i) + .csv"
    np.savetxt(name_str, oh_covid_err_rates[:,:,i], delimiter=",")

np.savetxt("oh_covid_err_rates_detailed.csv", oh_covid_err_rates_detailed, delimiter=",")

ValueError: Expected 1D or 2D array, got 3D array instead