# Data Preprocessing

In [1]:
# Import dependencies
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
# Import PLACES data
places_df = pd.read_csv("./Resources/processed_PLACES_COPD.csv")
places_df = places_df.drop(["State_County"], axis=1)
places_df.head()

Unnamed: 0,State,County,Levels_Smokers,Levels_COPD
0,Iowa,Cass,19.3,8.3
1,Iowa,Monona,18.5,8.3
2,Alaska,Dillingham,31.1,8.9
3,Colorado,Custer,13.8,7.5
4,Alaska,Ketchikan Gateway,19.4,6.6


In [3]:
# Import Coal Mines data
coal_df = pd.read_csv("./Resources/processed_Coal_Mines.csv")
coal_df = coal_df.drop(["State_County"], axis=1)
coal_df.head()

Unnamed: 0,State,County,Surface_Mines,Underground_Mines
0,Alabama,Walker,22,2.0
1,Alabama,Jefferson,22,5.0
2,Alabama,Jefferson,13,5.0
3,Alabama,Tuscaloosa,13,12.0
4,Alabama,Jefferson,13,14.0


In [4]:
# Import Air Quality data
aqi_df = pd.read_csv("./Resources/processed_Decade_Air_Quality.csv")
aqi_df = aqi_df.drop(["State_County"], axis=1)
aqi_df.head()

Unnamed: 0,State,County,Days_with_AQI,Good_Days,Moderate_Days,Unhealthy_for_Sensitive_Groups_Days,Unhealthy_Days,Very_Unhealthy_Days,Hazardous_Days
0,Alabama,Baldwin,523.0,455.0,66.0,2.0,0.0,0.0,0.0
1,Alabama,Clay,226.0,194.0,32.0,0.0,0.0,0.0,0.0
2,Alabama,Colbert,586.0,472.0,114.0,0.0,0.0,0.0,0.0
3,Alabama,DeKalb,724.0,635.0,89.0,0.0,0.0,0.0,0.0
4,Alabama,Elmore,472.0,436.0,36.0,0.0,0.0,0.0,0.0


In [5]:
# Import Census 2019 data
census_df = pd.read_csv("./Resources/processed_census_data.csv")
census_df = census_df.drop(["State_County"], axis=1)
census_df.head()

Unnamed: 0,State,County,PERC_TOT_MALE_0,PERC_TOT_FEMALE_0,PERC_WA_MALE_0,PERC_WA_FEMALE_0,PERC_BA_MALE_0,PERC_BA_FEMALE_0,PERC_IA_MALE_0,PERC_IA_FEMALE_0,...,PERC_HWAC_MALE_18,PERC_HWAC_FEMALE_18,PERC_HBAC_MALE_18,PERC_HBAC_FEMALE_18,PERC_HIAC_MALE_18,PERC_HIAC_FEMALE_18,PERC_HAAC_MALE_18,PERC_HAAC_FEMALE_18,PERC_HNAC_MALE_18,PERC_HNAC_FEMALE_18
0,Alabama,Autauga County,48.492008,51.507992,37.369561,38.892767,9.373714,10.739408,0.216578,0.259536,...,0.747863,1.282051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alabama,Baldwin County,48.490373,51.509627,42.471129,44.969852,4.249353,4.527536,0.404508,0.375839,...,0.42735,0.899685,0.0,0.0,0.0,0.022492,0.0,0.0,0.0,0.0
2,Alabama,Barbour County,52.920684,47.079316,25.881066,23.2723,25.565098,22.664668,0.417241,0.271409,...,0.639659,0.426439,0.0,0.21322,0.0,0.21322,0.0,0.0,0.0,0.0
3,Alabama,Bibb County,53.268733,46.731267,39.144414,37.621684,13.132982,8.136108,0.236671,0.223274,...,1.028278,0.0,0.0,0.514139,0.0,0.0,0.0,0.0,0.0,0.0
4,Alabama,Blount County,49.237367,50.762633,47.137966,48.687442,0.892332,0.798949,0.332031,0.30782,...,1.818182,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0


In [6]:
# Merge places and coal dataframes
merged_df = pd.merge(places_df, coal_df, on=["State", "County"], how="left")
merged_df

Unnamed: 0,State,County,Levels_Smokers,Levels_COPD,Surface_Mines,Underground_Mines
0,Iowa,Cass,19.3,8.3,,
1,Iowa,Monona,18.5,8.3,,
2,Alaska,Dillingham,31.1,8.9,,
3,Colorado,Custer,13.8,7.5,,
4,Alaska,Ketchikan Gateway,19.4,6.6,,
...,...,...,...,...,...,...
3176,Wisconsin,Sauk,16.8,6.5,,
3177,Wisconsin,Brown,15.1,5.5,,
3178,West Virginia,Webster,26.8,15.3,,
3179,Wisconsin,Winnebago,17.1,5.8,,


In [7]:
# View null values
merged_df.isnull().sum()

State                   0
County                  1
Levels_Smokers          0
Levels_COPD             0
Surface_Mines        3078
Underground_Mines    3078
dtype: int64

In [8]:
# View the county with a NaN value
merged_df[merged_df['County'].isnull()]

Unnamed: 0,State,County,Levels_Smokers,Levels_COPD,Surface_Mines,Underground_Mines
33,United States,,15.3,6.6,,


In [9]:
# Drop the row containing "United States"
merged_df = merged_df[merged_df.State != "United States"]
merged_df.isnull().sum()

State                   0
County                  0
Levels_Smokers          0
Levels_COPD             0
Surface_Mines        3077
Underground_Mines    3077
dtype: int64

In [10]:
# Fill Nulls from states without mines to be 0
merged_df = merged_df.fillna(0)
merged_df.isnull().sum()

State                0
County               0
Levels_Smokers       0
Levels_COPD          0
Surface_Mines        0
Underground_Mines    0
dtype: int64

In [11]:
# Drop County name as it might confuse the model
merged_df = merged_df.drop(["County"], axis=1)

# Encode the Data

In [12]:
# Prepare for label encoding
le = LabelEncoder()
encoded_df = merged_df.copy()

In [13]:
# Filter all string object data types into a dataframe_cat for encoding
# dataframe_cat = merged_df.dtypes[merged_df.dtypes == "object"].index.tolist()
dataframe_cat = places_df.dtypes[places_df.dtypes == "object"].index.tolist()

dataframe_cat

['State', 'County']

In [14]:
# For loop to encode text columns to numerical values
for column in dataframe_cat:
    places_df[column] = le.fit_transform(places_df[column])
#     encoded_df[column] = le.fit_transform(merged_df[column])


# encoded_df.head()
places_df.head()

Unnamed: 0,State,County,Levels_Smokers,Levels_COPD
0,15,270,19.3,8.3
1,15,1104,18.5,8.3
2,1,477,31.1,8.9
3,5,426,13.8,7.5
4,1,869,19.4,6.6


# Split, Train, Test

In [17]:
# Split our preprocessed data into our features and target arrays
# X = encoded_df.drop(columns = "Levels_COPD").values
# y = encoded_df["Levels_COPD"]

X = places_df.drop(columns = "Levels_COPD").values
y = places_df["Levels_COPD"]

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [18]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
len(X_train[0])

3

In [20]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = len(X_train[0]) / 2
# hidden_nodes_layer2 = len(X_train[0]) / 3
# hidden_nodes_layer3 = 210
# hidden_nodes_layer4 = 10
# hidden_nodes_layer5 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)
# # Second hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="gelu"))

# # Third hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="tanh"))

# # Fourth hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="gelu"))

# # Fifth hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

Metal device set to: Apple M1
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 4         
                                                                 
 dense_1 (Dense)             (None, 1)                 2         
                                                                 
Total params: 6
Trainable params: 6
Non-trainable params: 0
_________________________________________________________________


2022-11-04 13:52:47.206293: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-04 13:52:47.206751: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [21]:
# # Import checkpoint dependencies
# import os
# from tensorflow.keras.callbacks import ModelCheckpoint

# # Define the checkpoint path and filenames
# os.makedirs("optimized_checkpoints/",exist_ok=True)
# checkpoint_path = "optimized_checkpoints/weights.{epoch:02d}.hdf5"

In [22]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# # Create a callback that saves the model's weights every 5 epochs
# cp_callback = ModelCheckpoint(
#     filepath = checkpoint_path,
#     verbose = 1,
#     save_weights_only = True,
#     save_freq = 5
# )

In [23]:
# Train the model
# fit_model = nn.fit(X_train_scaled, y_train, epochs=1, callbacks=[cp_callback])
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50


2022-11-04 13:52:51.371793: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-11-04 13:52:51.637503: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

25/25 - 0s - loss: 134.6335 - accuracy: 0.0000e+00 - 171ms/epoch - 7ms/step
Loss: 134.63351440429688, Accuracy: 0.0


2022-11-04 13:53:11.476746: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [None]:
# # Export our model to HDF5 file
# nn.save('COPD_DeepLearning.h5')