### Preprocessing

In [141]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression

#  Import and read the charity_data.csv.
import pandas as pd 
vax_df = pd.read_csv("sample_data/county_data.csv")
demo_df = pd.read_csv("sample_data/NYS_demographics")
income_df = pd.read_csv("sample_data/NYS_income")
vax_df.head()
demo_df.head()
income_df.head()

Unnamed: 0,index,County / County Group,Households with Elderly,Households with Children,Economic Development Region,Income Groups,Percent of Poverty Level,Low-to-Moderate Income (LMI) Group,Race / Ethnicity,Housing Unit Type,LMI Population Segment,Education Level,Head of Household Age
0,0,Albany,No,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",4 - Moderate Multi-Family (5-50 units),#1 – Low-Income Renters in Multifamily (5+ Uni...,3 - Some College,<30
1,1,Albany,Yes,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",4 - Moderate Multi-Family (5-50 units),#1 – Low-Income Renters in Multifamily (5+ Uni...,4 - Associate's,70+
2,2,Albany,No,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",5 - Large Multi-Family (50+ units),#1 – Low-Income Renters in Multifamily (5+ Uni...,4 - Associate's,<30
3,3,Albany,No,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",4 - Moderate Multi-Family (5-50 units),#1 – Low-Income Renters in Multifamily (5+ Uni...,3 - Some College,<30
4,4,Albany,Yes,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",2 - Single Family Detached,#3 – Low-Income Owners in Single-Family & Smal...,5 - Bachelor's,70+


In [142]:
#Only keep necessary columns
vax_df = vax_df[['county','population','metrics.vaccinationsCompletedRatio']]

#Drop rows with NA
vax_df = vax_df.dropna()

#Clean county names
vax_df['county'] = vax_df['county'].str.replace(' County', '')

#Rename columns
vax_df.columns = ['County', 'Total_Pop', 'Vax_Rate']

vax_df

Unnamed: 0,County,Total_Pop,Vax_Rate
0,Albany,305506,0.672
1,Allegany,46091,0.389
2,Bronx,1418207,0.614
3,Broome,190488,0.580
4,Cattaraugus,76117,0.500
...,...,...,...
57,Washington,61204,0.598
58,Wayne,89918,0.566
59,Westchester,967506,0.705
60,Wyoming,39859,0.464


In [143]:
#Only keep necessary columns
demo_df = demo_df[['County', 'White', 'AfricanAmerican', 'Asian', 'Other']]

#Drop rows with NA
demo_df = demo_df.dropna()

#Clean percentages
demo_df['White'] = demo_df['White'].str.replace('%', '').astype(float)
demo_df['AfricanAmerican'] = demo_df['AfricanAmerican'].str.replace('%', '').astype(float)
demo_df['Asian'] = demo_df['Asian'].str.replace('%', '').astype(float)
demo_df['Other'] = demo_df['Other'].str.replace('%', '').astype(float)

demo_df.head()


Unnamed: 0,County,White,AfricanAmerican,Asian,Other
1,Albany,77.8,13.2,6.6,2.5
2,Columbia,91.1,5.1,1.9,2.0
3,Greene,90.2,6.4,1.3,2.1
4,Rensselaer,87.8,7.2,2.8,2.2
5,Saratoga,93.6,1.9,2.7,1.7


In [144]:
#Only keep necessary columns
income_df = income_df[['County / County Group', 'Low-to-Moderate Income (LMI) Group']]

#Drop rows with NA
income_df = income_df.dropna()

#Recode Income
income_df[['Income - Very Low', 'Income - Low', 'Income - Moderate', 'Income - High']] = pd.get_dummies(income_df['Low-to-Moderate Income (LMI) Group']).astype(int)

#Drop LMI Group
income_df = income_df.drop('Low-to-Moderate Income (LMI) Group', axis=1)

#Rename County
income_df.rename(columns={'County / County Group':'County'}, inplace=True)

#Group by county
income_df = income_df.groupby('County').sum()

In [145]:
# join files
df_1 = vax_df.merge(demo_df, how='inner',on='County')
df = df_1.merge(income_df, how='inner', on='County')
df.head()

Unnamed: 0,County,Total_Pop,Vax_Rate,White,AfricanAmerican,Asian,Other,Income - Very Low,Income - Low,Income - Moderate,Income - High
0,Albany,305506,0.672,77.8,13.2,6.6,2.5,896,792,1152,4236
1,Albany,305506,0.672,77.8,13.2,6.6,2.5,896,792,1152,4236
2,Bronx,1418207,0.614,44.7,43.7,4.7,6.8,8208,3870,4004,7196
3,Bronx,1418207,0.614,44.7,43.7,4.7,6.8,8208,3870,4004,7196
4,Chautauqua,126903,0.53,94.3,2.8,0.7,2.2,980,1000,672,2350


In [146]:
# Drop county
df = df.drop(['County'], axis=1)

df.head()

Unnamed: 0,Total_Pop,Vax_Rate,White,AfricanAmerican,Asian,Other,Income - Very Low,Income - Low,Income - Moderate,Income - High
0,305506,0.672,77.8,13.2,6.6,2.5,896,792,1152,4236
1,305506,0.672,77.8,13.2,6.6,2.5,896,792,1152,4236
2,1418207,0.614,44.7,43.7,4.7,6.8,8208,3870,4004,7196
3,1418207,0.614,44.7,43.7,4.7,6.8,8208,3870,4004,7196
4,126903,0.53,94.3,2.8,0.7,2.2,980,1000,672,2350


In [147]:
# Check all values are numeric
df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 0 to 45
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Total_Pop          46 non-null     int64  
 1   Vax_Rate           46 non-null     float64
 2   White              46 non-null     float64
 3   AfricanAmerican    46 non-null     float64
 4   Asian              46 non-null     float64
 5   Other              46 non-null     float64
 6   Income - Very Low  46 non-null     int64  
 7   Income - Low       46 non-null     int64  
 8   Income - Moderate  46 non-null     int64  
 9   Income - High      46 non-null     int64  
dtypes: float64(5), int64(5)
memory usage: 4.0 KB


In [148]:
# Export to CSV for multiple regression
df.to_csv('sample_data/clean_df.csv')

In [149]:
# Split our preprocessed data into our features and target arrays
y = df["Vax_Rate"].values
X = df.drop(["Vax_Rate"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [150]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Linear Regression

In [151]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [152]:
y_pred = model.predict(X_test)
print(y_pred.shape)

(12,)


In [153]:
print(model.coef_)
print(model.intercept_)

[-1.10873397e-10 -1.73933863e-01 -1.75468284e-01 -1.69163031e-01
 -1.54979827e-01  4.07509366e-07 -3.27910282e-05 -3.32032577e-06
  9.71618633e-06]
17.9592929880453


### Neural Network

In [154]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 100
hidden_nodes_layer2 = 100

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, 
                          input_dim=number_input_features, 
                          activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, 
                             activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               1000      
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 11,201
Trainable params: 11,201
Non-trainable params: 0
_________________________________________________________________


In [155]:
# Compile the model
nn.compile(loss="mean_absolute_error", 
           optimizer="adam")

In [156]:
# Create a callback that saves the model's weights every 5 epochs
callbacks = tf.keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', 
                                     save_weights_only=True, save_freq=5)

In [157]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[callbacks])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [158]:
# Evaluate the model using the test data
model_loss = nn.evaluate(X_test_scaled,y_test,verbose=1)
print(f"Loss: {model_loss}")

Loss: 0.05741020664572716


In [159]:
#Save 
nn.save('COVID_Vax_Predictions.h5')