### Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression

#  Import and read the charity_data.csv.
import pandas as pd 
vax_df = pd.read_csv("sample_data/county_data.csv")
demo_df = pd.read_csv("sample_data/NYS_demographics")
income_df = pd.read_csv("sample_data/NYS_income")
vax_df.head()
demo_df.head()
income_df.head()

Unnamed: 0,index,County / County Group,Households with Elderly,Households with Children,Economic Development Region,Income Groups,Percent of Poverty Level,Low-to-Moderate Income (LMI) Group,Race / Ethnicity,Housing Unit Type,LMI Population Segment,Education Level,Head of Household Age
0,0,Albany,No,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",4 - Moderate Multi-Family (5-50 units),#1 – Low-Income Renters in Multifamily (5+ Uni...,3 - Some College,<30
1,1,Albany,Yes,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",4 - Moderate Multi-Family (5-50 units),#1 – Low-Income Renters in Multifamily (5+ Uni...,4 - Associate's,70+
2,2,Albany,No,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",5 - Large Multi-Family (50+ units),#1 – Low-Income Renters in Multifamily (5+ Uni...,4 - Associate's,<30
3,3,Albany,No,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",4 - Moderate Multi-Family (5-50 units),#1 – Low-Income Renters in Multifamily (5+ Uni...,3 - Some College,<30
4,4,Albany,Yes,No,Capital District,"$0 to <$10,000",1 - Income at or below 100% HHSPG,Group 1 - Very Low Income,"Asian, non-Hispanic",2 - Single Family Detached,#3 – Low-Income Owners in Single-Family & Smal...,5 - Bachelor's,70+


In [2]:
#Only keep necessary columns
vax_df = vax_df[['county','population','metrics.vaccinationsCompletedRatio']]

#Drop rows with NA
vax_df = vax_df.dropna()

#Clean county names
vax_df['county'] = vax_df['county'].str.replace(' County', '')

#Rename columns
vax_df.columns = ['County', 'Total_Pop', 'Vax_Rate']

#Drop duplicate rows
vax_df.drop_duplicates()

vax_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62 entries, 0 to 61
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   County     62 non-null     object 
 1   Total_Pop  62 non-null     int64  
 2   Vax_Rate   62 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.9+ KB


In [3]:
#Only keep necessary columns
demo_df = demo_df[['County', 'White', 'AfricanAmerican', 'Asian', 'Other']]

#Drop rows with NA
demo_df = demo_df.dropna()

#Clean percentages
demo_df['White'] = demo_df['White'].str.replace('%', '').astype(float)
demo_df['AfricanAmerican'] = demo_df['AfricanAmerican'].str.replace('%', '').astype(float)
demo_df['Asian'] = demo_df['Asian'].str.replace('%', '').astype(float)
demo_df['Other'] = demo_df['Other'].str.replace('%', '').astype(float)

demo_df.head()


Unnamed: 0,County,White,AfricanAmerican,Asian,Other
1,Albany,77.8,13.2,6.6,2.5
2,Columbia,91.1,5.1,1.9,2.0
3,Greene,90.2,6.4,1.3,2.1
4,Rensselaer,87.8,7.2,2.8,2.2
5,Saratoga,93.6,1.9,2.7,1.7


In [4]:
# Drop non-beneficial columns
income_df = income_df.drop(['index','Economic Development Region', 'LMI Population Segment', 'Race / Ethnicity'], axis=1)

#Drop rows with NA
income_df = income_df.dropna()

#Recode categorical features
income_cat = ['Households with Elderly',
       'Households with Children', 'Income Groups', 'Percent of Poverty Level',
       'Low-to-Moderate Income (LMI) Group', 'Housing Unit Type',
       'Education Level', 'Head of Household Age']

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(income_df[income_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(income_cat)

# Merge one-hot encoded features and drop the originals
income_df = income_df.merge(encode_df,left_index=True, 
                                  right_index=True)
income_df = income_df.drop(income_cat,1)

#Rename County
income_df.rename(columns={'County / County Group':'County'}, inplace=True)

#Group by county
income_df = income_df.groupby('County').sum()
income_df.head()

Unnamed: 0_level_0,Households with Elderly_No,Households with Elderly_Yes,Households with Children_No,Households with Children_Yes,"Income Groups_$0 to <$10,000","Income Groups_$10,000-<$20,000","Income Groups_$20,000-<$30,000","Income Groups_$30,000-<$40,000","Income Groups_$40,000-<$50,000","Income Groups_$50,000+",Percent of Poverty Level_1 - Income at or below 100% HHSPG,Percent of Poverty Level_2 - Income 101%-150% HHSPG,Percent of Poverty Level_3 - Income 151%-200% HHSPG,Percent of Poverty Level_4 - Income 201% HHSPG or more,Low-to-Moderate Income (LMI) Group_Group 1 - Very Low Income,Low-to-Moderate Income (LMI) Group_Group 2 - Low Income,Low-to-Moderate Income (LMI) Group_Group 3 - Moderate Income,Low-to-Moderate Income (LMI) Group_Non-LMI Household,Housing Unit Type_1 - Single Family Attached,Housing Unit Type_2 - Single Family Detached,Housing Unit Type_3 - Small Multi-Family (2-4 units),Housing Unit Type_4 - Moderate Multi-Family (5-50 units),Housing Unit Type_5 - Large Multi-Family (50+ units),Housing Unit Type_6 - Mobile Homes & Other,Education Level_1 - Less than High School Diploma,Education Level_2 - High School Diploma,Education Level_3 - Some College,Education Level_4 - Associate's,Education Level_5 - Bachelor's,Education Level_6 - Graduate Degree,Head of Household Age_30-39,Head of Household Age_40-49,Head of Household Age_50-59,Head of Household Age_60-69,Head of Household Age_70+,Head of Household Age_<30
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
Albany,4142.0,2934.0,5458.0,1618.0,374.0,584.0,626.0,606.0,562.0,4324.0,640.0,436.0,456.0,5544.0,896.0,792.0,1152.0,4236.0,290.0,4124.0,1450.0,792.0,330.0,90.0,478.0,1574.0,1228.0,814.0,1554.0,1428.0,984.0,1164.0,1554.0,1380.0,1282.0,712.0
Bronx,14442.0,8836.0,14960.0,8318.0,3606.0,3732.0,2634.0,2404.0,1840.0,9062.0,6372.0,2838.0,2338.0,11730.0,8208.0,3870.0,4004.0,7196.0,1442.0,1720.0,4336.0,7824.0,7884.0,72.0,5966.0,5826.0,4440.0,1754.0,3174.0,2118.0,3946.0,4716.0,5024.0,3858.0,3648.0,2086.0
"Broome, Chenango, Delaware, & Tioga",5834.0,5616.0,8732.0,2718.0,786.0,1474.0,1342.0,1220.0,1098.0,5530.0,1446.0,1232.0,1126.0,7646.0,2200.0,1964.0,1538.0,5748.0,144.0,7852.0,1260.0,746.0,156.0,1292.0,1178.0,3692.0,2218.0,1480.0,1610.0,1272.0,1326.0,1662.0,2550.0,2530.0,2646.0,736.0
Cattaraugus & Allegany,3790.0,3288.0,5156.0,1922.0,462.0,932.0,896.0,880.0,754.0,3154.0,922.0,856.0,802.0,4498.0,1406.0,1428.0,1008.0,3236.0,52.0,5324.0,586.0,258.0,28.0,830.0,706.0,2724.0,1352.0,870.0,768.0,658.0,890.0,1060.0,1582.0,1482.0,1518.0,546.0
Cayuga & Onondaga,9460.0,7046.0,12010.0,4496.0,1146.0,1696.0,1630.0,1588.0,1390.0,9056.0,1894.0,1394.0,1436.0,11782.0,2706.0,2460.0,2126.0,9214.0,600.0,11442.0,1716.0,1600.0,596.0,552.0,1356.0,4172.0,3198.0,1990.0,3160.0,2630.0,2120.0,2710.0,3780.0,3312.0,3144.0,1440.0


In [5]:
# join files
df_1 = vax_df.merge(income_df, how='inner',on='County')
df = df_1.merge(demo_df, how='inner', on='County')
df

Unnamed: 0,County,Total_Pop,Vax_Rate,Households with Elderly_No,Households with Elderly_Yes,Households with Children_No,Households with Children_Yes,"Income Groups_$0 to <$10,000","Income Groups_$10,000-<$20,000","Income Groups_$20,000-<$30,000","Income Groups_$30,000-<$40,000","Income Groups_$40,000-<$50,000","Income Groups_$50,000+",Percent of Poverty Level_1 - Income at or below 100% HHSPG,Percent of Poverty Level_2 - Income 101%-150% HHSPG,Percent of Poverty Level_3 - Income 151%-200% HHSPG,Percent of Poverty Level_4 - Income 201% HHSPG or more,Low-to-Moderate Income (LMI) Group_Group 1 - Very Low Income,Low-to-Moderate Income (LMI) Group_Group 2 - Low Income,Low-to-Moderate Income (LMI) Group_Group 3 - Moderate Income,Low-to-Moderate Income (LMI) Group_Non-LMI Household,Housing Unit Type_1 - Single Family Attached,Housing Unit Type_2 - Single Family Detached,Housing Unit Type_3 - Small Multi-Family (2-4 units),Housing Unit Type_4 - Moderate Multi-Family (5-50 units),Housing Unit Type_5 - Large Multi-Family (50+ units),Housing Unit Type_6 - Mobile Homes & Other,Education Level_1 - Less than High School Diploma,Education Level_2 - High School Diploma,Education Level_3 - Some College,Education Level_4 - Associate's,Education Level_5 - Bachelor's,Education Level_6 - Graduate Degree,Head of Household Age_30-39,Head of Household Age_40-49,Head of Household Age_50-59,Head of Household Age_60-69,Head of Household Age_70+,Head of Household Age_<30,White,AfricanAmerican,Asian,Other
0,Albany,305506,0.672,4142.0,2934.0,5458.0,1618.0,374.0,584.0,626.0,606.0,562.0,4324.0,640.0,436.0,456.0,5544.0,896.0,792.0,1152.0,4236.0,290.0,4124.0,1450.0,792.0,330.0,90.0,478.0,1574.0,1228.0,814.0,1554.0,1428.0,984.0,1164.0,1554.0,1380.0,1282.0,712.0,77.8,13.2,6.6,2.5
1,Albany,305506,0.672,4142.0,2934.0,5458.0,1618.0,374.0,584.0,626.0,606.0,562.0,4324.0,640.0,436.0,456.0,5544.0,896.0,792.0,1152.0,4236.0,290.0,4124.0,1450.0,792.0,330.0,90.0,478.0,1574.0,1228.0,814.0,1554.0,1428.0,984.0,1164.0,1554.0,1380.0,1282.0,712.0,77.8,13.2,6.6,2.5
2,Bronx,1418207,0.614,14442.0,8836.0,14960.0,8318.0,3606.0,3732.0,2634.0,2404.0,1840.0,9062.0,6372.0,2838.0,2338.0,11730.0,8208.0,3870.0,4004.0,7196.0,1442.0,1720.0,4336.0,7824.0,7884.0,72.0,5966.0,5826.0,4440.0,1754.0,3174.0,2118.0,3946.0,4716.0,5024.0,3858.0,3648.0,2086.0,44.7,43.7,4.7,6.8
3,Bronx,1418207,0.614,14442.0,8836.0,14960.0,8318.0,3606.0,3732.0,2634.0,2404.0,1840.0,9062.0,6372.0,2838.0,2338.0,11730.0,8208.0,3870.0,4004.0,7196.0,1442.0,1720.0,4336.0,7824.0,7884.0,72.0,5966.0,5826.0,4440.0,1754.0,3174.0,2118.0,3946.0,4716.0,5024.0,3858.0,3648.0,2086.0,44.7,43.7,4.7,6.8
4,Chautauqua,126903,0.53,2654.0,2348.0,3806.0,1196.0,332.0,682.0,610.0,598.0,542.0,2238.0,640.0,618.0,560.0,3184.0,980.0,1000.0,672.0,2350.0,68.0,3754.0,570.0,202.0,40.0,368.0,504.0,1740.0,980.0,694.0,604.0,480.0,600.0,710.0,1200.0,1054.0,1130.0,308.0,94.3,2.8,0.7,2.2
5,Chautauqua,126903,0.53,2654.0,2348.0,3806.0,1196.0,332.0,682.0,610.0,598.0,542.0,2238.0,640.0,618.0,560.0,3184.0,980.0,1000.0,672.0,2350.0,68.0,3754.0,570.0,202.0,40.0,368.0,504.0,1740.0,980.0,694.0,604.0,480.0,600.0,710.0,1200.0,1054.0,1130.0,308.0,94.3,2.8,0.7,2.2
6,Dutchess,294218,0.626,3802.0,3144.0,4984.0,1962.0,268.0,484.0,524.0,514.0,444.0,4712.0,498.0,384.0,418.0,5646.0,718.0,728.0,1124.0,4376.0,384.0,4792.0,756.0,648.0,166.0,200.0,458.0,1606.0,1342.0,680.0,1468.0,1392.0,800.0,1286.0,1716.0,1470.0,1338.0,336.0,82.0,11.7,3.7,2.5
7,Dutchess,294218,0.626,3802.0,3144.0,4984.0,1962.0,268.0,484.0,524.0,514.0,444.0,4712.0,498.0,384.0,418.0,5646.0,718.0,728.0,1124.0,4376.0,384.0,4792.0,756.0,648.0,166.0,200.0,458.0,1606.0,1342.0,680.0,1468.0,1392.0,800.0,1286.0,1716.0,1470.0,1338.0,336.0,82.0,11.7,3.7,2.5
8,Erie,918702,0.625,12548.0,9398.0,16490.0,5456.0,1672.0,2358.0,2324.0,2182.0,1894.0,11516.0,2686.0,1904.0,1954.0,15402.0,3838.0,3310.0,2814.0,11984.0,686.0,14078.0,4478.0,1668.0,724.0,312.0,1884.0,5734.0,4288.0,2614.0,3972.0,3454.0,2818.0,3550.0,4826.0,4098.0,4618.0,2036.0,81.1,13.1,3.6,2.2
9,Erie,918702,0.625,12548.0,9398.0,16490.0,5456.0,1672.0,2358.0,2324.0,2182.0,1894.0,11516.0,2686.0,1904.0,1954.0,15402.0,3838.0,3310.0,2814.0,11984.0,686.0,14078.0,4478.0,1668.0,724.0,312.0,1884.0,5734.0,4288.0,2614.0,3972.0,3454.0,2818.0,3550.0,4826.0,4098.0,4618.0,2036.0,81.1,13.1,3.6,2.2


In [6]:
# Drop county
df = df.drop(['County'], axis=1)

In [7]:
# Check all values are numeric
df.info()
df.columns


<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 0 to 45
Data columns (total 42 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Total_Pop                                                     46 non-null     int64  
 1   Vax_Rate                                                      46 non-null     float64
 2   Households with Elderly_No                                    46 non-null     float64
 3   Households with Elderly_Yes                                   46 non-null     float64
 4   Households with Children_No                                   46 non-null     float64
 5   Households with Children_Yes                                  46 non-null     float64
 6   Income Groups_$0 to <$10,000                                  46 non-null     float64
 7   Income Groups_$10,000-<$20,000                                46 non-null

Index(['Total_Pop', 'Vax_Rate', 'Households with Elderly_No',
       'Households with Elderly_Yes', 'Households with Children_No',
       'Households with Children_Yes', 'Income Groups_$0 to <$10,000',
       'Income Groups_$10,000-<$20,000', 'Income Groups_$20,000-<$30,000',
       'Income Groups_$30,000-<$40,000', 'Income Groups_$40,000-<$50,000',
       'Income Groups_$50,000+',
       'Percent of Poverty Level_1 - Income at or below 100% HHSPG',
       'Percent of Poverty Level_2 - Income 101%-150% HHSPG',
       'Percent of Poverty Level_3 - Income 151%-200% HHSPG',
       'Percent of Poverty Level_4 - Income 201% HHSPG or more',
       'Low-to-Moderate Income (LMI) Group_Group 1 - Very Low Income',
       'Low-to-Moderate Income (LMI) Group_Group 2 - Low Income',
       'Low-to-Moderate Income (LMI) Group_Group 3 - Moderate Income',
       'Low-to-Moderate Income (LMI) Group_Non-LMI Household',
       'Housing Unit Type_1 - Single Family Attached',
       'Housing Unit Type_2 - S

In [8]:
# Export to CSV for multiple regression
df.to_csv('sample_data/clean_df.csv')

In [9]:
# Split our preprocessed data into our features and target arrays
y = df["Vax_Rate"].values
X = df.drop(["Vax_Rate"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Neural Network

In [11]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 100
hidden_nodes_layer2 = 100

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, 
                          input_dim=number_input_features, 
                          activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, 
                             activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               4200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 14,401
Trainable params: 14,401
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Compile the model
nn.compile(loss="mean_absolute_error", 
           optimizer="adam")

In [13]:
# Create a callback that saves the model's weights every 5 epochs
callbacks = tf.keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', 
                                     save_weights_only=True, save_freq=5)

In [14]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[callbacks])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [15]:
# Evaluate the model using the test data
model_loss = nn.evaluate(X_test_scaled,y_test,verbose=1)
print(f"Loss: {model_loss}")

Loss: 0.09108620136976242


In [16]:
#Save 
nn.save('COVID_Vax_Predictions.h5')