In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# sklearn utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error as MSE 
from sklearn.preprocessing import robust_scale

# sklearn models
from sklearn import linear_model
from sklearn.svm import SVR

# Inputing Files

In [None]:
county_area = pd.read_csv('county_area.csv')
county_area = county_area.drop(["Areaname"], axis=1)
county_area = county_area.rename(columns={"STCOU": "fips"})
county_area

In [None]:
county_mask_use = pd.read_csv('mask_use.csv')
county_mask_use = county_mask_use.rename(columns={"COUNTYFP": "fips"})
county_mask_use

In [None]:
county_cases = pd.read_csv('county_cases.csv')
county_cases = county_cases.drop(["county","state"], axis=1)
county_cases

In [None]:
county_pop = pd.read_csv('county_pop.csv', encoding='latin-1')
county_pop = county_pop.drop(['STNAME','CTYNAME'], axis=1)
county_pop = county_pop.rename(columns={"COUNTY": "fips"})
county_pop

## Merging Dataframes Together

In [None]:
df = county_pop.merge(county_mask_use, how='left', on="fips")
df = df.merge(county_area, how="left", on="fips")
df['POP_DEN'] = df['POPESTIMATE2019'] / df['AREA']
df['POP_DENxALWAYS'] = df['POP_DEN'] * df['ALWAYS']
df['POP_DENxFREQUENTLY'] =df['POP_DEN'] * df['FREQUENTLY']
df['POP_DENxSOMETIMES'] = df['POP_DEN'] * df['SOMETIMES']
df['POP_DENxRARELY'] = df['POP_DEN'] * df['RARELY']
df['POP_DENxNEVER'] = df['POP_DEN'] * df['NEVER']
df['POPxALWAYS'] = df['POPESTIMATE2019'] * df['ALWAYS']
df['POPxFREQUENTLY'] =df['POPESTIMATE2019'] * df['FREQUENTLY']
df['POPxSOMETIMES'] = df['POPESTIMATE2019'] * df['SOMETIMES']
df['POPxRARELY'] = df['POPESTIMATE2019'] * df['RARELY']
df['POPxNEVER'] = df['POPESTIMATE2019'] * df['NEVER']
df

## Normalize Data

In [None]:
def normalize(column):
    normal = []
    for value in column:
        value = (value - min(column)) / (max(column) - min(column))
        normal.append(value)
    return normal

In [None]:
def new_normal(dataframe):
    normal = robust_scale(dataframe)
    return normal

In [None]:
df = df.merge(county_cases, how="left", on="fips")
df.insert(0, 'Ones', 1)
column_titles = ["POPESTIMATE2019","NEVER","RARELY","SOMETIMES","FREQUENTLY","ALWAYS","AREA","POP_DEN","POP_DENxALWAYS","POP_DENxFREQUENTLY","POP_DENxSOMETIMES","POP_DENxRARELY","POP_DENxNEVER",'POPxALWAYS','POPxFREQUENTLY','POPxSOMETIMES','POPxRARELY','POPxNEVER']
new_df = df.reindex(columns=column_titles)
normal_all = new_normal(new_df)
normal_df = pd.DataFrame(normal_all)
normal_df.columns = column_titles
normal_df['fips'] = df['fips']
normal_df = normal_df.merge(county_cases, how="left", on="fips")
column_titles = ["fips","cases","deaths","POPESTIMATE2019","NEVER","RARELY","SOMETIMES","FREQUENTLY","ALWAYS","AREA","POP_DEN","POP_DENxALWAYS","POP_DENxFREQUENTLY","POP_DENxSOMETIMES","POP_DENxRARELY","POP_DENxNEVER",'POPxALWAYS','POPxFREQUENTLY','POPxSOMETIMES','POPxRARELY','POPxNEVER']
normal_df = normal_df.reindex(columns=column_titles)
normal_df

In [None]:
normalize_pop = normalize(df['POPESTIMATE2019'])
normal_pop_df = pd.DataFrame(normalize_pop)
normal_pop_df

## Graphs

fig, ax = plt.subplots(figsize=(16,10))
for i in range(df.shape[0]):
    plt.scatter(df['POPESTIMATE2019'][i], df['cases'][i])
plt.xlabel('Population')
plt.ylabel('Positive Cases')
plt.ylim(0,60000)
plt.xlim(0,2000000)
plt.title('Population vs. Positive Cases')
plt.show()

fig, ax = plt.subplots(figsize=(16,10))
for i in range(df.shape[0]):
    plt.scatter(df['ALWAYS'][i], df['cases'][i])
plt.xlabel('Percentage of Mask Use')
plt.ylabel('Positive Cases')
plt.title('Percentage Mask Use vs. Positive Cases')
plt.show()

fig, ax = plt.subplots(figsize=(16,10))
for i in range(df.shape[0]):
    plt.scatter(df['ALWAYS'][i], df['NEVER'][i])
plt.xlabel('Percentage of Mask Use')
plt.ylabel('Percentage of NO Mask Use')
plt.title('Percentage Mask Use vs. Percent NO Mask Use')
plt.show()

fig, ax = plt.subplots(figsize=(16,10))
for i in range(df.shape[0]):
    plt.scatter(df['POP_DEN'][i], df['cases'][i])
plt.xlabel('Population density')
plt.ylabel('Cases')
plt.title('Population Density vs. Cases')
plt.show()

fig, ax = plt.subplots(figsize=(16,10))
for i in range(df.shape[0]):
    plt.scatter(df['POP_DEN'][i], df['deaths'][i])
plt.xlabel('Population Density')
plt.ylabel('Deaths')
plt.title('Population Density vs. Deaths')
plt.show()

fig, ax = plt.subplots(figsize=(16,10))
for i in range(df.shape[0]):
    plt.scatter(df['POP_DENxALWAYS'][i], df['cases'][i])
plt.xlabel('Population Density x Mask use')
plt.ylabel('Cases')
plt.title('Population Density x Mask use vs. Cases')
plt.show()

## Split Data

In [None]:
def split(X, Y, size):
    return train_test_split(X, Y, test_size=size)

# Models

## Linear Regression

In [None]:
x_train, x_test, y_train, y_test = split(normal_pop_df, df['cases'], 0.2)
x_train_lin = x_train.copy()
print(type(x_train_lin))
x_train_lin.insert(0, 'Ones', 1)
x = np.matrix(x_train_lin.values)
y = np.matrix(y_train.values)

In [None]:
lin_regr = linear_model.LinearRegression(fit_intercept = False)
lin_regr.fit(x, y.T)
lin_regr.coef_ 

In [None]:
X = np.array(x[:, 1].A1)
f = lin_regr.predict(x).flatten()

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(X, f, color = "red", label='Prediction')
ax.plot(normal_pop_df[0], df['cases'], "r+",color = "blue", label='Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Positive Cases')
ax.set_title('Predicted Positive Cases vs. Population')
plt.show()

## SVM Regression

## Linear Regression using all of the given variables

In [254]:
#Target Variable
y = np.log(df['cases']) 
x_train, x_test, y_train, y_test = train_test_split(normal_df,y, test_size = .2, shuffle = True)

In [255]:
all_var_reg = linear_model.LinearRegression(fit_intercept = False)
all_var_reg.fit(x_train,y_train)
y_predicted = all_var_reg.predict(x_test)
print('This model includes cases as a feature so there has to be overfitting')
print("The Mean Squared Error given normalized data:" , MSE(y_test, y_predicted))
print('Because the MSE is so small, this is an example of overfitting.')

This model includes cases as a feature so there has to be overfitting
The Mean Squared Error given normalized data: 1.4294280693351769
Because the MSE is so small, this is an example of overfitting.


## Linear Regression using mask use and population ONLY

In [321]:
y = np.log(df['cases']) 
x_train, x_test, y_train, y_test = train_test_split(normal_df[['POPESTIMATE2019','NEVER','RARELY','SOMETIMES','FREQUENTLY','ALWAYS']],y, test_size = .2, shuffle = True)

In [322]:
multi_var_reg = linear_model.LinearRegression(fit_intercept = False)
multi_var_reg.fit(x_train,y_train)
y_predicted = multi_var_reg.predict(x_test)

In [323]:
print("The Mean Squared Error given normalized data:" , MSE(y_test, y_predicted))
print("This very high MSE means that this particular variable configuration is not important")

The Mean Squared Error given normalized data: 1.8007694500198304
This very high MSE means that this particular variable configuration is not important


## Linear Regression using Population density and mask use

In [294]:
y = np.log(df['cases']) 
x_train, x_test, y_train, y_test = train_test_split(normal_df[['POP_DENxSOMETIMES','POP_DENxRARELY','POP_DENxALWAYS','POP_DENxFREQUENTLY','POP_DENxNEVER']],y, test_size = .2, shuffle = True)

In [295]:
multi_var_reg = linear_model.LinearRegression(fit_intercept = False)
multi_var_reg.fit(x_train,y_train)
y_predicted = multi_var_reg.predict(x_test)

In [296]:
print("The Mean Squared Error given normalized data:" , MSE(y_test, y_predicted))
print("THE MSE is still very high, continue to look for suitable features")

The Mean Squared Error given normalized data: 38.65759844315964
THE MSE is still very high, continue to look for suitable features


## Linear Regression using Area and POP x Mask use


In [291]:
y = np.log(df['cases']) 
x_train, x_test, y_train, y_test = train_test_split(normal_df[['AREA','POPxSOMETIMES','POPxRARELY','POPxALWAYS','POPxFREQUENTLY','POPxNEVER']],y, test_size = .2, shuffle = True)

In [292]:
multi_var_reg = linear_model.LinearRegression(fit_intercept = False)
multi_var_reg.fit(x_train,y_train)
y_predicted = multi_var_reg.predict(x_test)

In [293]:
print("The Mean Squared Error given normalized data:" , MSE(y_test, y_predicted))

The Mean Squared Error given normalized data: 37.53605151790012


## Linear Regression using Population and Population Density x Mask Use

In [288]:
y = np.log(df['cases']) 
x_train, x_test, y_train, y_test = train_test_split(normal_df[['POPESTIMATE2019','POP_DENxSOMETIMES','POP_DENxRARELY','POP_DENxALWAYS','POP_DENxFREQUENTLY','POP_DENxNEVER']],y, test_size = .2, shuffle = True)

In [289]:
multi_var_reg = linear_model.LinearRegression(fit_intercept = False)
multi_var_reg.fit(x_train,y_train)
y_predicted = multi_var_reg.predict(x_test)

In [290]:
print("The Mean Squared Error given normalized data:" , MSE(y_test, y_predicted))

The Mean Squared Error given normalized data: 35.41038545603391


## Linear Regression using Population, Population Density x Mask Use, and Area

In [262]:
y = np.log(df['cases']) 
x_train, x_test, y_train, y_test = train_test_split(normal_df[['POPESTIMATE2019','POP_DENxSOMETIMES','POP_DENxRARELY','POP_DENxALWAYS','POP_DENxFREQUENTLY','POP_DENxNEVER','AREA']],y, test_size = .2, shuffle = True)

In [263]:
multi_var_reg = linear_model.LinearRegression(fit_intercept = False)
multi_var_reg.fit(x_train,y_train)
y_predicted = multi_var_reg.predict(x_test)

In [264]:
print("The Mean Squared Error given normalized data:" , MSE(y_test, y_predicted))
multi_var_reg.score(x_test,y_test)

The Mean Squared Error given normalized data: 37.23378271612027


-12.853259303172514

In [253]:
normal_df

Unnamed: 0,fips,cases,deaths,POPESTIMATE2019,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,AREA,...,POP_DENxALWAYS,POP_DENxFREQUENTLY,POP_DENxSOMETIMES,POP_DENxRARELY,POP_DENxNEVER,POPxALWAYS,POPxFREQUENTLY,POPxSOMETIMES,POPxRARELY,POPxNEVER
0,1001,2059,31,0.531522,-0.189873,0.013333,0.230769,1.096386,-0.241733,-0.087260,...,0.392066,0.986415,0.686119,0.574735,0.322373,0.388149,0.963127,0.700074,0.560736,0.323106
1,1003,6658,69,3.482726,0.189873,-0.186667,-0.230769,1.433735,-0.278221,2.556161,...,0.524723,1.416847,0.544150,0.524210,0.961035,2.663221,5.688549,2.873971,2.793491,4.353203
2,1005,1033,9,-0.018339,-0.012658,0.640000,0.051282,-0.036145,-0.027366,0.470378,...,-0.133261,-0.142277,-0.126075,0.052016,-0.139840,-0.009778,-0.015499,0.016638,0.277259,-0.015276
3,1007,840,14,-0.058754,-0.607595,-0.520000,-0.256410,0.891566,0.342075,-0.048220,...,0.001306,0.089251,-0.111614,-0.256064,-0.307315,0.011821,0.091853,-0.106102,-0.272649,-0.327508
4,1009,1932,25,0.566030,-0.189873,0.546667,0.820513,-0.120482,-0.173318,-0.001487,...,0.387452,0.466975,1.008201,1.061257,0.294052,0.442616,0.516004,1.141521,1.168028,0.349958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3123,56037,462,2,0.293013,-0.088608,2.960000,1.461538,-0.698795,-1.044470,18.284960,...,-0.367398,-0.396004,-0.335278,-0.260007,-0.378054,-0.034024,0.088161,1.040339,2.625569,0.225214
3124,56039,747,1,-0.039887,0.341772,1.120000,0.564103,0.518072,-0.716078,6.634632,...,-0.352040,-0.355401,-0.338757,-0.307004,-0.335629,-0.139745,0.055329,0.136269,0.449436,0.133611
3125,56041,431,3,-0.096983,0.379747,2.733333,0.487179,0.036145,-1.062714,2.668729,...,-0.339340,-0.322609,-0.285016,-0.037933,-0.272172,-0.222502,-0.081348,0.039673,0.928522,0.069685
3126,56043,144,7,-0.316007,1.721519,1.093333,-0.602564,0.975904,-0.957811,2.957099,...,-0.368973,-0.375142,-0.396628,-0.356235,-0.308237,-0.319737,-0.248058,-0.349510,-0.161863,-0.031260


## Linear Regression using Population x Mask Use and Population Desnsity x Mask use

In [274]:
y = np.log(df['cases']) 
x_train, x_test, y_train, y_test = train_test_split(normal_df[['POPESTIMATE2019','AREA','POPESTIMATE2019','POP_DENxSOMETIMES','POP_DENxRARELY','POP_DENxALWAYS','POP_DENxFREQUENTLY','POP_DENxNEVER','POPxSOMETIMES']],y, test_size = size, shuffle = True)

In [284]:
y = np.log(df['cases']) 
sizes = [.05,.1,.15,.2,.25,.3,.35,.4]
outs = []
for size in sizes:
    avg = []
    for i in range(20):
        x_train, x_test, y_train, y_test = train_test_split(normal_df[['POPESTIMATE2019','POP_DENxSOMETIMES','POP_DENxRARELY','POP_DENxALWAYS','POP_DENxFREQUENTLY','POP_DENxNEVER','POPxSOMETIMES','POPxRARELY','POPxALWAYS','POPxFREQUENTLY','POPxNEVER',]],y, test_size = size, shuffle = True)
        multi_var_reg = linear_model.LinearRegression(fit_intercept = False)
        multi_var_reg.fit(x_train,y_train)
        y_predicted = multi_var_reg.predict(x_test)
        avg.append(MSE(y_test, y_predicted))
    outs.append(sum(avg) / len(avg))
train_splits = sizes[outs.index(min(outs))]
print('The best Split: ', train_split)
print('With an avg MSE of: ', min(outs))

The best Split:  0.2
With an avg MSE of:  2.0177172454522405


In [285]:
print("The Mean Squared Error given normalized data:" , MSE(y_test, y_predicted))

The Mean Squared Error given normalized data: 2.050389461387641


In [286]:
#Predictions
multi_var_reg.score(x_test,y_test)

0.2683525334970642

## Optimized Linear Regression

In [344]:
y = np.log(df['cases']) 
sizes = [.05,.1,.15,.2,.25,.3,.35,.4]
outs = []
for size in sizes:
    avg = []
    for i in range(20):
        x_train, x_test, y_train, y_test = train_test_split(normal_df[['POPESTIMATE2019','AREA','NEVER','RARELY','SOMETIMES','FREQUENTLY','ALWAYS']],y, test_size = size, shuffle = True)
        multi_var_reg = linear_model.LinearRegression(fit_intercept = False)
        multi_var_reg.fit(x_train,y_train)
        y_predicted = multi_var_reg.predict(x_test)
        avg.append(MSE(y_test, y_predicted))
    outs.append(sum(avg) / len(avg))
train_splits = sizes[outs.index(min(outs))]
print('The best Split: ', train_split)
print('With an avg MSE of: ', min(outs))

The best Split:  0.2
With an avg MSE of:  1.9216489462200432


In [345]:
multi_var_reg.score(x_test,y_test)

0.1062378521882349

## SVM Regression

In [None]:
normal_svm_df = normal_df.drop(['fips','cases','deaths'], axis=1)
sizes = [0.2,0.3]
outs = []
for size in sizes:
    avg = []
    for i in range(20):
        x_train, x_test, y_train, y_test = split(normal_svm_df, df['cases'], size)
        svr = SVR()
        svr.fit(x_train, y_train)
        svr_predict = svr.predict(x_test)
        avg.append(MSE(y_test, svr_predict))
    outs.append(sum(avg) / len(avg))
train_split = sizes[outs.index(min(outs))]
print('Best split is:', train_split)
print('With average MSE of:', min(outs))

In [None]:
x_train, x_test, y_train, y_test = split(normal_svm_df, df['cases'], train_split)
kernels = ['rbf', 'linear', 'poly']
outs = []
for k in kernels:
    avg = []
    for i in range(60):
        svr = SVR(kernel=k)
        svr.fit(x_train, y_train)
        svr_predict = svr.predict(x_test)
        avg.append(MSE(y_test, svr_predict))
    outs.append(sum(avg) / len(avg))
optimal_kernel = kernels[outs.index(min(outs))]
print('Best kernel is:', ideal_kernel)
print('With average MSE of:', min(outs))

In [None]:
x_train, x_test, y_train, y_test = split(normal_svm_df, df['cases'], train_split)
regs = [1,2,5,25,125]
outs = []
for reg in regs:
    avg = []
    for i in range(20):
        svr = SVR(kernel=optimal_kernel, C=reg)
        svr.fit(x_train, y_train)
        svr_predict = svr.predict(x_test)
        avg.append(MSE(y_test, svr_predict))
    outs.append(sum(avg) / len(avg))
optimal_C = regs[outs.index(min(outs))]
print('Best C value is:', optimal_C)
print('With average MSE of:', min(outs))
outs