In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


# Apply seaborn's default theme to all following matplotlib figures;
# color_codes=True remaps the single-letter colour codes ("b", "g", "r", ...)
# to the seaborn palette.
sns.set(color_codes=True)

In [None]:
def mse(y_test, y_true):
    """Return the mean of the squared element-wise differences of the two inputs."""
    residual = y_test - y_true
    return np.mean(residual ** 2)

def expo_fit(y):
    """Extrapolate a series one step ahead with a log-linear (exponential) fit.

    The observations are placed at x = -len(y), ..., -1, a straight line is
    least-squares fitted to log(y), and the fitted value at x = 0 (the line's
    intercept) is mapped back through exp.
    """
    n_obs = y.shape[0]
    steps = np.arange(-n_obs, 0)
    _slope, intercept = np.polyfit(steps, np.log(y), 1)
    return np.exp(intercept)

### Step I: Import the data that has already been cleaned and merged

In [None]:
# Read the already cleaned and merged table.
data = pd.read_csv("all_in_1.csv")

# The '4/18/20' column is the prediction target; every other column stays in X.
y = data['4/18/20']
X = data.drop(columns=['4/18/20'])

### Step II: Visualizations (Feel free to skip this part if you want)

In [None]:
def _expo_mse(window, target):
    """MSE between the row-wise exponential extrapolation of `window` and `target`."""
    predicted = np.apply_along_axis(expo_fit, 1, window.to_numpy())
    predicted[np.isnan(predicted)] = 0  # rows whose fit came out as NaN are predicted as 0
    return mse(predicted, target)


# results[k] holds ten MSE values for target column j = 86 + k of X, one per
# history window X.iloc[:, start:j] with start = j-12, ..., j-3
# (i.e. window widths 12 down to 3 columns).
results = [
    [_expo_mse(X.iloc[:, start:j], X.iloc[:, j]) for start in range(j - 12, j - 2)]
    for j in range(86, 89)
]

# special handling for interpolating 4/18/20 data: the target is y, which is
# no longer a column of X.
# NOTE(review): these windows end at column 89 of X, one past the last target
# column used above (88) -- confirm that column 89 is meant to be included.
results.append([_expo_mse(X.iloc[:, start:90], y) for start in range(78, 88)])

In [None]:
# Every curve in `results` was computed from history windows that are
# 12, 11, ..., 3 columns wide (slice widths j - i in the cell that builds
# `results`), so the x axis has to run from 12 down to 3, not 11 down to 2.
x = np.arange(12, 2, -1)
labels = ['4/15/20', '4/16/20', '4/17/20', '4/18/20']
plt.figure(figsize=(8, 4))
for curve, label in zip(results, labels):
    plt.plot(x, curve, label=label)
plt.legend()
plt.ylabel("mse of interpolation")
plt.xlabel("# of days used in interpolation")
plt.title("performance of exponential interpolation v.s. # of previous days used", fontsize=14);

In [None]:
plt.figure(figsize=(16, 6))

# Figure A: histogram of the target restricted to counties below its 80th percentile.
plt.subplot(1, 2, 1)
# NOTE: sns.distplot is deprecated in recent seaborn releases (it still runs,
# with a warning); kept so the cell also works on older seaborn versions.
sns.distplot(y[y < np.percentile(y, 80)], kde=False)
plt.xlabel("Confirmed cases in 4/18/20")
plt.ylabel("Frequency")
plt.title("Figure A: distribution of confirmed cases for counties < 80 percentile");

# Figure B: confirmed cases against population -- this is a scatterplot, so the
# title says so (it previously read "distplot").
plt.subplot(1, 2, 2)
sns.scatterplot(x="4/18/20", y="PopulationEstimate2018", data=data)
plt.xlabel("Confirmed cases in 4/18/20")
plt.ylabel("PopulationEstimate2018")
plt.title("Figure B: scatterplot of confirmed cases v.s. population");

In [None]:
# Quick look at the dimensions (rows, columns) of the feature table.
X.shape

### 2.5 Feature Engineering using KNN (features built from neighbouring counties)

In [None]:
# The neighbour search only uses the two coordinate columns; the target is
# the '4/18/20' column.
X_KNN = data.loc[:, ['Lat_x', 'Long_']]
y_KNN = data['4/18/20']

In [None]:
def dist_metric(loc1, loc2):
    """Great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    loc1, loc2 : sequence
        ``(latitude, longitude)`` in decimal degrees; only the first two
        entries of each argument are read.

    Returns
    -------
    float
        Distance along a sphere of radius 6371 km (12742 / 2).
    """
    lat1, lon1, lat2, lon2 = loc1[0], loc1[1], loc2[0], loc2[1]
    p = 0.017453292519943295  # Pi / 180
    # Haversine term: hav(dlat) + cos(lat1) * cos(lat2) * hav(dlon)
    a = (0.5 - np.cos((lat2 - lat1) * p) / 2
         + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2)
    # Guard against rounding pushing `a` marginally outside [0, 1].
    a = np.clip(a, 0.0, 1.0)
    # d = 2 * R * arcsin(sqrt(a)); squaring `a` instead of taking its square
    # root (as the previous version did) does not give a distance.
    return 12742 * np.arcsin(np.sqrt(a))  # 12742 = 2 * R

In [None]:
# 2-nearest-neighbour regressor over the coordinate columns, using the custom
# dist_metric callable as the distance; weights='distance' weights neighbours
# by the inverse of their distance when predicting.
neigh = KNeighborsRegressor(n_neighbors=2, weights = 'distance', metric = dist_metric)
neigh.fit(X_KNN, y_KNN)


In [None]:
# testing: look up the nearest training rows of two hand-picked coordinates
neighbours = list(neigh.kneighbors([[32.539527, -86.644082]])[1][0]) + list(neigh.kneighbors([[45.898386, -91.790504]])[1][0])
# kneighbors returns positional row numbers of the training data, so select by position.
neighbour_locations = X.iloc[neighbours, :]

fig, ax = plt.subplots(figsize=(7, 5))
ax = sns.scatterplot(x="Long_", y="Lat_x", data=neighbour_locations, ax=ax)

# x carries longitude and y latitude; the labels were previously swapped.
plt.xlabel('long')
plt.ylabel('lat')
plt.title('counties by position');

In [None]:
# Query the fitted model with no argument: the neighbours of every training
# point are returned. Only the index array is needed, not the distances.
clusters = neigh.kneighbors(return_distance=False)

In [None]:
# Map every row label of X to the array of neighbour row numbers found for it.
neighbour_dict = {row: clusters[row] for row in X.index}

In [None]:
def normalize_by_pop(stat_mat, neigh_pops, ref_pop):
    """Population-rescaled average of neighbour statistics.

    Each neighbour's column of ``stat_mat`` is multiplied by
    ``ref_pop / neighbour_population``; the rescaled columns are summed per
    row and divided by the number of neighbours.

    stat_mat   -- matrix with one row per statistic and one column per neighbour
    neigh_pops -- populations of the neighbouring counties, one per column of stat_mat
    ref_pop    -- population of the county the neighbours are referenced to
    Returns a list with one value per row of stat_mat.
    """
    n_neigh = len(neigh_pops)
    scale = [ref_pop / pop for pop in neigh_pops]
    averaged = (stat_mat @ scale) / n_neigh
    return list(averaged)
 

# Daily case-count columns whose neighbour averages are turned into features,
# and the names given to those derived features.
f = ['4/15/20', '4/16/20', '4/17/20']
f_new = [day + '_neigh_nor' for day in f]

# Accumulator: one list of derived feature values per county.
new_feature_list = []


In [None]:
# Start from an empty accumulator inside this cell: without this, running the
# cell a second time would append a second copy of every county's row.
new_feature_list = []

for i in X.index:
    neigh_id = neighbour_dict.get(i)

    # Population of the county itself and of each of its neighbours.
    ref_pop = X.loc[i, 'PopulationEstimate2018']
    neigh_pops = X.loc[neigh_id, 'PopulationEstimate2018']
    stat_matrix = np.transpose(X.loc[neigh_id, f]) # a len(f) by num_neigh matrix

    new_feature_list.append(normalize_by_pop(stat_matrix, neigh_pops, ref_pop))

In [None]:
# One row per entry of new_feature_list, one column per name in f_new.
new_feature_df = pd.DataFrame(data=np.asarray(new_feature_list), columns=f_new)

# Append the derived columns to the original features (rows are matched on the index).
X_new = pd.concat([X, new_feature_df], axis="columns")

### 3. Simple Regressors (LinearRegression v.s. DecisionTree)

In [None]:
features = [
    # daily case-count columns
    '4/15/20',
    '4/16/20',
    '4/17/20',
    # county-level covariates
    'dem_to_rep_ratio',
    'PopulationDensityperSqMile2010',
    'public schools',
    'FracMale2017',
    'DiabetesPercentage',
    'People_Tested',
    'HeartDiseaseMortality',
    # neighbour-derived columns
    *f_new,
]

# Hold out 10% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_new[features],
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises a TypeError. Ordinary least-squares predictions do not
# depend on feature scaling, so the plain estimator is used instead.
linear_regr = LinearRegression()
linear_regr.fit(X_train, y_train)

tree_regr = DecisionTreeRegressor(max_depth=20, random_state=80)
tree_regr.fit(X_train, y_train)

# Case counts are non-negative integers: clip the linear output at 0 and round both predictions.
linear_pred = np.round(np.maximum(0, linear_regr.predict(X_test)))
linear_mse = mse(linear_pred, y_test)
tree_pred = np.round(tree_regr.predict(X_test))
tree_mse = mse(tree_pred, y_test)

linear_mse, tree_mse

#### Check the MSE of the predictions for counties with few confirmed cases

In [None]:
# Positions, within the test split, of counties with fewer than 50 confirmed cases.
# The mask is built on a plain ndarray because np.argwhere applied directly to
# a pandas Series can raise on some numpy/pandas version combinations.
small_indices = np.flatnonzero(y_test.to_numpy() < 50)
a = y_test.to_numpy()[small_indices]
# Linear
b = linear_pred[small_indices]
# Decision Tree
c = tree_pred[small_indices]

print(f"Linear mse: {mse(a, b)}, Tree mse: {mse(a, c)}")

In [None]:
# Positions, within the test split, of counties with at least 50 confirmed cases.
# The mask is built on a plain ndarray because np.argwhere applied directly to
# a pandas Series can raise on some numpy/pandas version combinations.
big_indices = np.flatnonzero(y_test.to_numpy() >= 50)
a = y_test.to_numpy()[big_indices]
# Linear
b = linear_pred[big_indices]
# Decision Tree
c = tree_pred[big_indices]

print(f"Linear mse: {mse(a, b)}, Tree mse: {mse(a, c)}")

In [None]:
class finalRegressor(object):
    """Switch between a tree-based and a linear regressor per sample.

    Both models are fitted on the same data. At prediction time the mean of
    the two predictions is compared with a cut-off (a percentile of the
    training target): below the cut-off the tree prediction is returned,
    otherwise the linear prediction (clipped at 0) is returned.

    Parameters
    ----------
    threshold : number
        Percentile (0-100) of ``y_train`` used as the cut-off.
    forest : bool
        If True use a 50-tree RandomForestRegressor, else a single DecisionTreeRegressor.
    randstate : int
        ``random_state`` forwarded to the tree-based model.
    """

    def __init__(self, threshold=50, forest=False, randstate=42):
        self.thres = threshold
        if forest:
            self.tree_regr = RandomForestRegressor(n_estimators=50, max_depth=20, random_state=randstate)
        else:
            self.tree_regr = DecisionTreeRegressor(max_depth=20, random_state=randstate)
        # `normalize=True` was removed in scikit-learn 1.2 (TypeError there);
        # least-squares predictions do not depend on feature scaling.
        self.linear_regr = LinearRegression()

    def fit(self, X_train, y_train):
        """Fit both models, derive the cut-off value and return ``self``."""
        self.tree_regr.fit(X_train, y_train)
        self.linear_regr.fit(X_train, y_train)
        self.thres_val = np.percentile(y_train, self.thres)
        return self

    def predict(self, X_test):
        """Return rounded predictions; intermediate arrays are kept on the instance."""
        self.linear_pred = np.maximum(self.linear_regr.predict(X_test), 0)
        self.tree_pred = self.tree_regr.predict(X_test)
        # True where the averaged prediction is below the cut-off -> use the tree.
        self.cond_vec = ((self.linear_pred + self.tree_pred) / 2) < self.thres_val
        # np.where picks one value per sample, so a NaN/inf in the branch that
        # is not selected cannot leak into the result.
        return np.round(np.where(self.cond_vec, self.tree_pred, self.linear_pred))


In [None]:
# Sweep the percentile threshold of finalRegressor over 20..99 and record the
# test-set MSE obtained with each setting (thresh_results[k] <-> threshold 20 + k).
thresh_results = []

for thres in range(20, 100):
    final_regr = finalRegressor(thres)
    final_regr.fit(X_train, y_train)
    final_pred = final_regr.predict(X_test)
    thresh_results.append(mse(final_pred, y_test))

In [None]:
# Best (lowest), mean and standard deviation of the MSE over all thresholds tried.
min(thresh_results), np.mean(thresh_results),  np.std(thresh_results)

In [None]:
# Show the full list of MSE values, one per threshold tried.
thresh_results

In [None]:
# thresh_results[k] was obtained with percentile threshold 20 + k (the sweep
# runs over range(20, 100)), so plot against the threshold itself rather than
# the list position, and label the axes.
thresholds = np.arange(20, 20 + len(thresh_results))
plt.plot(thresholds, thresh_results)
plt.xlabel("threshold percentile")
plt.ylabel("test mse")
plt.title("test mse of finalRegressor v.s. threshold percentile");

In [None]:
# NOTE(review): this cell plots seaborn's "tips" example dataset and uses none
# of the notebook's own variables -- it looks like a leftover demo; confirm
# whether it should stay. It also switches the global seaborn style to
# "whitegrid", and load_dataset may need network access to fetch the data.
sns.set(style="whitegrid")
tips = sns.load_dataset("tips")
ax = sns.barplot(x="day", y="total_bill", data=tips)