# Data Preprocessing

In [1]:
# Import dependencies
import pandas as pd
import tensorflow as tf
# from matplotlib import pyplot as plt
# import seaborn as sb
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler,OneHotEncoder
# from sklearn.preprocessing import LabelEncoder

AttributeError: module 'numpy' has no attribute 'ndarray'

In [None]:
# Import PLACES data
places_df = pd.read_csv("./Resources/processed_PLACES_COPD.csv")
places_df = places_df.drop(["State_County"], axis=1)
places_df.head()

In [None]:
# Import Coal Mines data
coal_df = pd.read_csv("./Resources/processed_Coal_Mines.csv")
coal_df = coal_df.drop(["State_County"], axis=1)
coal_df.head()

In [None]:
# Import Air Quality data
aqi_df = pd.read_csv("./Resources/processed_Decade_Air_Quality.csv")
aqi_df = aqi_df.drop(["State_County"], axis=1)
aqi_df.head()

In [None]:
# Import Census 2019 data
census_df = pd.read_csv("./Resources/processed_census_data.csv")
census_df = census_df.drop(["State_County"], axis=1)
census_df.head()

In [None]:
# Merge places and coal dataframes
merged_df = pd.merge(places_df, coal_df, on=["State", "County"], how="left")
merged_df

In [None]:
# Merge in the Air Quality data
merged_df = pd.merge(merged_df, aqi_df, on=["State", "County"], how="left")
merged_df

In [None]:
# Merge in the census data
merged_df = pd.merge(merged_df, census_df, on=["State", "County"], how="left")
merged_df

In [None]:
# View null values
merged_df.isnull().sum()

In [None]:
# View the county with a NaN value
merged_df[merged_df['County'].isnull()]

In [None]:
# Drop the row containing "United States"
merged_df = merged_df[merged_df.State != "United States"]
merged_df.isnull().sum()

In [None]:
# Fill Nulls from states without mines to be 0
merged_df = merged_df.fillna(0)
merged_df.isnull().sum()

In [None]:
# Drop County name as it might confuse the model
merged_df = merged_df.drop(["County"], axis=1)

# Encode the Data

In [None]:
# Prepare for label encoding
le = LabelEncoder()
encoded_df = merged_df.copy()

In [None]:
# Filter all string object data types into a dataframe_cat for encoding
dataframe_cat = merged_df.dtypes[merged_df.dtypes == "object"].index.tolist()

dataframe_cat

In [None]:
# For loop to encode text columns to numerical values
for column in dataframe_cat:
    encoded_df[column] = le.fit_transform(merged_df[column])


encoded_df.head()

# Split, Train, Test

In [None]:
# Split our preprocessed data into our features and target arrays
X = encoded_df.drop("Levels_COPD", axis=1)
y = encoded_df["Levels_COPD"]


# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Find Correlation Between Features and Target

In [None]:
correlation_df = encoded_df[["State", "Levels_Smokers", "Surface_Mines", 
                             "Underground_Mines", "Good_Days", "Levels_COPD"]]

In [None]:
plt.figure(figsize=(10, 5))
heatmap = sb.heatmap(correlation_df.corr(), vmin=-1, vmax=1, annot=True, cmap='RdBu')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
# save heatmap as .png file
# dpi - sets the resolution of the saved image in dots/inches
# bbox_inches - when set to 'tight' - does not allow the labels to be cropped
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')

In [None]:
len(X_train[0])

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = len(X_train[0]) / 2
hidden_nodes_layer2 = len(X_train[0]) / 3
hidden_nodes_layer3 = len(X_train[0]) / 4
# hidden_nodes_layer4 = len(X_train[0]) / 5
# hidden_nodes_layer5 = len(X_train[0]) / 6

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# # Fourth hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="relu"))

# # Fifth hidden layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

In [None]:
# # Import checkpoint dependencies
# import os
# from tensorflow.keras.callbacks import ModelCheckpoint

# # Define the checkpoint path and filenames
# os.makedirs("optimized_checkpoints/",exist_ok=True)
# checkpoint_path = "optimized_checkpoints/weights.{epoch:02d}.hdf5"

In [None]:
# Compile the model
nn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])

# # Create a callback that saves the model's weights every 5 epochs
# cp_callback = ModelCheckpoint(
#     filepath = checkpoint_path,
#     verbose = 1,
#     save_weights_only = True,
#     save_freq = 5
# )

In [None]:
# Train the model
# fit_model = nn.fit(X_train_scaled, y_train, epochs=1, callbacks=[cp_callback])
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=75)

In [None]:
# # Evaluate the model using the test data
# model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# # Export our model to HDF5 file
# nn.save('COPD_DeepLearning.h5')

# Determine Feature Importances

In [31]:
features = sorted(zip(X.columns, nn.feature_importances_), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10,200)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)

plt.show()

AttributeError: 'Sequential' object has no attribute 'feature_importances_'