In [1]:
# Import our dependencies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
# Relative location of the processed PLACES COPD table that the loading cell reads
file_path = "./Resources/processed_PLACES_COPD.csv"

In [3]:
# Importing the dataset: read the CSV at `file_path` into a DataFrame
dataset = pd.read_csv(file_path)


In [4]:
# Preview the first rows of the loaded table
dataset.head()

Unnamed: 0,State_County,State,County,Levels_Smokers,Levels_COPD
0,"Cass, Iowa",Iowa,Cass,19.3,8.3
1,"Monona, Iowa",Iowa,Monona,18.5,8.3
2,"Dillingham, Alaska",Alaska,Dillingham,31.1,8.9
3,"Custer, Colorado",Colorado,Custer,13.8,7.5
4,"Ketchikan Gateway, Alaska",Alaska,Ketchikan Gateway,19.4,6.6


In [6]:
# dataset['Total_Population']= dataset['Total_Population'].astype(float)

In [7]:
# Work on a copy so the loaded `dataset` frame is left as read from disk,
# and create the label encoder used to turn text columns into integer codes
df2 = dataset.copy()
le = LabelEncoder()

In [8]:
# Collect the names of the object-dtype (text) columns, in column order
dataset_cat = [
    column_name
    for column_name, column_dtype in dataset.dtypes.items()
    if column_dtype == "object"
]
dataset_cat

['State_County', 'State', 'County']

In [9]:
# Replace every text column of df2 with integer codes.
# The single encoder `le` is refitted on each column in turn, so after the
# loop it only holds the mapping of the last column processed.
for textColumn in dataset_cat:
    encoded_values = le.fit_transform(df2[textColumn])
    df2[textColumn] = encoded_values

In [10]:
# Display the dataframe after the text columns have been label-encoded
df2

Unnamed: 0,State_County,State,County,Levels_Smokers,Levels_COPD
0,414,15,270,19.3,8.3
1,1938,15,1104,18.5,8.3
2,778,1,477,31.1,8.9
3,688,5,426,13.8,7.5
4,1484,1,869,19.4,6.6
...,...,...,...,...,...
3117,2515,49,1462,16.8,6.5
3118,290,49,209,15.1,5.5
3119,2994,48,1745,26.8,15.3
3120,3058,49,1785,17.1,5.8


In [11]:
#X = dataset.drop(['Levels_COPD'], axis = 1).values
#y = dataset['Levels_COPD']

In [12]:
# Create our features: every column except the target.
# Names are compared with `!=` instead of `in ('Levels_COPD')`: the parentheses
# do not build a tuple, so `in` performed a substring test against the string
# 'Levels_COPD' and would silently drop any column whose name is contained in
# it (for example a column called 'COPD' or 'Levels').
a_cols = [i for i in df2.columns if i != 'Levels_COPD']
X = df2[a_cols]

# Create our target
y = df2['Levels_COPD']

In [13]:
# Step 3: Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for testing. The shuffle is seeded so the split, and
# every metric computed from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=0
)

In [14]:
# Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(dataset[dataset_cat]))

# Add the encoded variable names to the dataframe
# encode_df.columns = enc.get_feature_names_out(dataset_cat)

# Set options to show all columns
# pd.options.display.max_columns = 50

# View the dataframe
# encode_df.head()

In [15]:
# Step 4: Training the Random Forest Regression model on the training set
from sklearn.ensemble import RandomForestRegressor

# Ten trees, seeded so the fitted forest is repeatable for a given split
forest_settings = {"n_estimators": 10, "random_state": 0}
regressor = RandomForestRegressor(**forest_settings)
regressor.fit(X_train, y_train)

In [17]:
# Step 5: Predicting the Results
# Predict the target for the held-out rows; the bare name echoes the array
y_pred = regressor.predict(X_test)
y_pred

array([10.84, 12.47,  5.25, 10.32,  9.75, 10.75,  7.89,  9.6 , 10.35,
        9.54,  8.38,  6.41,  8.83,  7.74,  8.18,  5.56, 10.35,  5.96,
        8.02,  7.  ,  8.99,  6.62,  8.98,  7.95,  9.08,  6.79,  8.3 ,
        9.3 , 12.56,  6.92, 11.69,  6.74,  7.44,  8.94,  8.14,  7.05,
        6.37,  5.94, 10.87,  7.97,  9.48,  6.71, 11.51,  9.91,  6.41,
       10.89, 11.48,  7.14,  8.69, 10.1 ,  7.4 ,  9.13,  7.63,  6.62,
        8.88, 10.34,  4.69,  5.3 , 10.33, 10.77,  7.61,  7.48, 11.12,
        8.05,  7.6 ,  8.34,  7.33, 10.62,  9.41,  9.7 , 10.04,  8.79,
       10.43, 10.04, 10.04,  4.86,  8.52, 14.61,  7.26,  8.65,  9.26,
        6.19,  5.49,  5.83,  5.9 ,  5.73,  8.21,  7.58,  9.88,  7.63,
        9.38,  8.73,  9.14, 12.43,  6.38, 11.16, 12.68, 11.56,  7.65,
       11.3 ,  6.9 ,  8.61, 10.27,  6.24, 10.26,  9.81,  9.93,  7.74,
        7.37, 11.83,  8.66, 10.6 ,  9.7 ,  8.4 ,  7.3 ,  9.77, 10.03,
        7.28, 12.13,  7.62, 10.3 ,  8.4 , 10.27,  8.35,  8.75,  5.75,
        7.68,  5.58,

In [18]:
# Step 6: Evaluating the Algorithm
from sklearn import metrics

# RMSE: square root of the mean squared error between held-out truth and predictions
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 1.1067399190449594


In [None]:
# Show the importance the fitted forest assigns to each training column
regressor.feature_importances_

In [None]:
# Horizontal bar chart of feature importances, sorted by ascending importance
features = sorted(
    zip(X.columns, regressor.feature_importances_),
    key=lambda pair: pair[1],
)
cols = [pair[0] for pair in features]
width = [pair[1] for pair in features]

# Large square canvas with almost no vertical padding around the bars
fig, ax = plt.subplots(figsize=(20, 20))
ax.margins(y=0.001)
ax.barh(y=cols, width=width)
plt.show()

In [None]:
# Score the fitted regressor on the held-out split
# (for scikit-learn regressors, `score` reports the R^2 coefficient)
regressor.score(X_test, y_test)

In [None]:
# Step 5: Predicting the Results
# NOTE(review): this recomputes the same prediction as the earlier "Step 5"
# cell; it looks redundant — confirm before removing.

y_pred = regressor.predict(X_test)

In [None]:
# Step 6: Comparing the Real Values with Predicted Values
# Side-by-side table of held-out targets and the forest's predictions
comparison_columns = {'Real Values': y_test, 'Predicted Values': y_pred}
df = pd.DataFrame(comparison_columns)
df

In [None]:
# Scatter of smoking prevalence against COPD prevalence for every row.
# X holds several feature columns, so plot only the one named on the x-axis:
# passing the whole frame makes scatter fail because x and y differ in size.
plt.title('Scatter Plots')
plt.xlabel('Levels_Smokers')
plt.ylabel('Levels_COPD')
plt.scatter(X['Levels_Smokers'], y)
plt.show()

In [None]:
# Summarise the feature frame: column dtypes and non-null counts
X.info()

In [None]:
# NOTE(review): X is the multi-column feature frame and np.arange expects
# scalar bounds, so this call presumably raises. It looks like leftover
# scratch work — confirm and remove.
np.arange(X)

In [None]:
# Visualising the Random Forest Regression Results
# Real (red) and predicted (green) COPD levels of the test split, plotted
# against Levels_Smokers.
# X and X_test are multi-column frames, so the single plotted column is
# selected explicitly: min()/max() over a DataFrame iterate its column *names*
# (so float() of the result fails), and scatter needs x and y of equal size.
smokers_all = X['Levels_Smokers']
smokers_test = X_test['Levels_Smokers']

# 0.01-step grid spanning the observed Levels_Smokers range, as a column vector
X_grid = np.arange(np.floor(smokers_all.min()), np.ceil(smokers_all.max()), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))

plt.scatter(smokers_test, y_test, color = 'red', label = 'Real')
plt.scatter(smokers_test, y_pred, color = 'green', label = 'Predicted')
plt.title('Random Forest Regression')
plt.xlabel('Levels_Smokers')
plt.ylabel('Levels_COPD')
plt.legend()
plt.show()

In [None]:
# Predicted Levels_COPD as Levels_Smokers sweeps its observed training range.
# The forest was fitted on every column of X_train, so predict() needs all of
# them: the other features are held at their training medians and only
# Levels_Smokers varies. A one-column grid is rejected for its feature count.
smokers_grid = np.arange(
    np.floor(X_train['Levels_Smokers'].min()),
    np.ceil(X_train['Levels_Smokers'].max()),
    0.01,
)
# Scalars are broadcast down the index; dict order keeps the training column order
grid_features = pd.DataFrame(
    {col: X_train[col].median() for col in X_train.columns},
    index=range(len(smokers_grid)),
)
grid_features['Levels_Smokers'] = smokers_grid

plt.plot(smokers_grid, regressor.predict(grid_features), color = 'black')
plt.title('Random Forest Regression')
# Axis labels name the plotted quantities (previously 'Temperature' / 'Revenue')
plt.xlabel('Levels_Smokers')
plt.ylabel('Levels_COPD')
plt.show()

In [None]:
# Generate dummy dataset
# make_blobs is not imported in any visible cell, so it is imported here
# (same in-cell import style the split and regressor cells use).
from sklearn.datasets import make_blobs

# NOTE: this rebinds X, y (and df below) from the COPD data to the synthetic
# blobs; the names are kept because the following cells read X and y.
X, y = make_blobs(n_samples=1000, centers=2, n_features=2, random_state=78)

# Creating a DataFrame with the dummy data
df = pd.DataFrame(X, columns=["Feature 1", "Feature 2"])
df["Target"] = y

# Plotting the dummy data, coloured by class label
df.plot.scatter(x="Feature 1", y="Feature 2", c="Target", colormap="winter")

In [None]:
# Use sklearn to split dataset
from sklearn.model_selection import train_test_split

# Default train/test proportions, seeded for a repeatable shuffle
blob_splits = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = blob_splits

In [None]:
# Create scaler instance.
# StandardScaler is imported by name in the first cell; no `skl` alias is
# defined in the visible notebook, so `skl.preprocessing...` raised NameError.
X_scaler = StandardScaler()

# Fit the scaler on the training split only, so the scaling statistics do not
# see the test rows
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Start an empty Keras Sequential model; layers are appended in later cells
nn_model = tf.keras.Sequential()

In [None]:
# First Dense layer: one ReLU unit; `input_dim=2` also declares the two input features
hidden_layer = tf.keras.layers.Dense(units=1, activation="relu", input_dim=2)
nn_model.add(hidden_layer)

In [None]:
# Output layer: a single sigmoid unit, giving a value between 0 and 1
output_layer = tf.keras.layers.Dense(units=1, activation="sigmoid")
nn_model.add(output_layer)

In [None]:
# Check the structure of the Sequential model (its layers and parameter counts)
nn_model.summary()

In [None]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy tracked as a metric
nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train on the scaled training split for 100 epochs; keep the returned History object
fit_model = nn_model.fit(x=X_train_scaled, y=y_train, epochs=100)

In [None]:
# Put the per-epoch training history in a DataFrame indexed by epoch number (1-based)
epoch_count = len(fit_model.history["loss"])
history_df = pd.DataFrame(fit_model.history, index=range(1, epoch_count + 1))

# Plot the loss recorded for each epoch
history_df.plot(y="loss")

In [None]:
# Plot the accuracy recorded for each training epoch
history_df.plot(y="accuracy")

In [None]:
# Evaluate the model on the scaled test split; evaluate() returns the loss
# followed by the compiled metric (accuracy)
evaluation = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")