In [1]:
#Import dependencies
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
import sqlalchemy as sa
from sqlalchemy import create_engine
import psycopg2 as pg
from sklearn.metrics import classification_report
import tensorflow as tf

# Connect to database

In [2]:
#Connect to the PostgreSQL database.
#The connection URL (which contains the password) is read from the DATABASE_URL
#environment variable so that credentials are not stored in the notebook.
#Expected format: postgresql://<user>:<password>@<host>:<port>/<database>
db_url=os.environ.get('DATABASE_URL')
if not db_url:
    #Fail early with an actionable message instead of an opaque driver error
    raise RuntimeError('Set the DATABASE_URL environment variable to the PostgreSQL connection URL before running this notebook.')
conn=sa.create_engine(db_url)

# Load data and perform data cleaning

In [3]:
#Pull the whole table into a DataFrame and show it
query="SELECT * FROM fragmingham"
df=pd.read_sql_query(sql=query, con=conn)
df

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [4]:
#Display the column names of the loaded DataFrame
df.columns

Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [5]:
#Show the data type of each column
df.dtypes

sex                  int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

In [6]:
#Count the null values in each column
df.isnull().sum()

sex                0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [7]:
#Count rows that are exact duplicates of an earlier row
duplicate_count=df.duplicated().sum()
print(f'Duplicate entries:{duplicate_count}')

Duplicate entries:0


In [8]:
#Drop unnecessary columns and display the DataFrame.
#Reassigning (instead of inplace=True) together with errors='ignore' keeps this
#cell idempotent: re-running it once the columns are gone no longer raises KeyError.
df=df.drop(columns=['currentSmoker','diabetes','BPMeds','prevalentStroke'], errors='ignore')
df

Unnamed: 0,sex,age,education,cigsPerDay,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,20.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,30.0,1,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,23.0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1.0,1,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,43.0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0.0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0.0,1,185.0,141.0,98.0,25.60,67.0,72.0,0


In [9]:
#Display summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,sex,age,education,cigsPerDay,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0
mean,0.443685,49.551941,1.980317,9.025424,0.311646,236.847731,132.370558,82.917031,25.782802,75.730727,81.852925,0.152269
std,0.496886,8.562029,1.022656,11.92159,0.463229,44.097681,22.086866,11.974258,4.065601,11.981525,23.904164,0.359331
min,0.0,32.0,1.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,20.0,1.0,263.0,143.875,90.0,28.0375,82.0,87.0,0.0
max,1.0,70.0,4.0,70.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


# Split the data into training and testing

In [10]:
#Target: the TenYearCHD label, kept as a single-column DataFrame
y=df[['TenYearCHD']]

#Features: every remaining column (drop returns a new frame, df itself is untouched)
X=df.drop(columns=['TenYearCHD'])

y.shape

(3658, 1)

In [11]:
#Check the class balance of the target: number of rows per TenYearCHD value
y['TenYearCHD'].value_counts()

0    3101
1     557
Name: TenYearCHD, dtype: int64

In [12]:
#Stratified train/test split, then report the number of rows in each part
X_train, X_test, y_train, y_test=train_test_split(X, y, stratify=y, random_state=1)
for label, part in (('Xtrain', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
    print(f'{label}:{len(part)}')

Xtrain:2743
y_train:2743
X_test:915
y_test:915


# Scale the data

In [13]:
#Fit the scaler on the training features only, so that no statistics
#from the test set are used when standardising
scaler=StandardScaler()
X_scaler=scaler.fit(X_train)

#Apply the same fitted transformation to both splits
X_train_scaled, X_test_scaled=(X_scaler.transform(part) for part in (X_train, X_test))

# SMOTEENN and Logistic Regression

In [14]:
#Resample the scaled training data with SMOTEENN
smote_enn=SMOTEENN(random_state=1)
resampled=smote_enn.fit_resample(X_train_scaled, y_train)
X_resampled, y_resampled=resampled


In [15]:
#Row counts of the resampled features and target (they must match)
for resampled_part in (X_resampled, y_resampled):
    print(len(resampled_part))

3474
3474


In [16]:
#Train the logistic regression model on the SMOTEENN-resampled data.
#scikit-learn expects a 1-D target; y_resampled is a single-column frame, so it is
#flattened explicitly rather than relying on the implicit conversion (whose
#DataConversionWarning is hidden by the global warnings filter).
model=LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, np.ravel(y_resampled))

LogisticRegression(random_state=1)

In [17]:
#Predict on the scaled test set and calculate the balanced accuracy score
y_pred=model.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.633441185196173

In [18]:
#Display the confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[369, 407],
       [ 29, 110]], dtype=int64)

In [19]:
#Display the confusion matrix as a labelled DataFrame
cm=confusion_matrix(y_test, y_pred)
row_labels=['Actual 0','Actual 1']
col_labels=['Predicted 0', 'Predicted 1']
cm_df=pd.DataFrame(data=cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,369,407
Actual 1,29,110


In [20]:
#Build the imbalanced classification report, then print it
imbalanced_report=classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.48      0.79      0.63      0.61      0.36       776
          1       0.21      0.79      0.48      0.34      0.61      0.39       139

avg / total       0.82      0.52      0.74      0.58      0.61      0.37       915



# Balanced Random Forest Classifier

In [21]:
#Train the BalancedRandomForestClassifier on the unscaled training data
from imblearn.ensemble import BalancedRandomForestClassifier

brf=BalancedRandomForestClassifier(n_estimators=100, random_state=1)
#y_train is a single-column frame; flatten it to the 1-D target the estimator
#expects instead of relying on an implicit conversion with a silenced warning
brf.fit(X_train, np.ravel(y_train))

BalancedRandomForestClassifier(random_state=1)

In [22]:
#Predict on the unscaled test set and calculate the balanced accuracy score
y_pred=brf.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6581992138248165

In [23]:
#Display the confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[480, 296],
       [ 42,  97]], dtype=int64)

In [24]:
#Print the standard scikit-learn classification report
#(precision, recall, f1-score and support per class)
brf_report=classification_report(y_test, y_pred)
print(brf_report)

              precision    recall  f1-score   support

           0       0.92      0.62      0.74       776
           1       0.25      0.70      0.36       139

    accuracy                           0.63       915
   macro avg       0.58      0.66      0.55       915
weighted avg       0.82      0.63      0.68       915



In [25]:
#Pair each feature name with its importance and list them from most to least important
importances=brf.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.16028440569225996, 'age'),
 (0.14133627758024617, 'sysBP'),
 (0.12220760269407348, 'BMI'),
 (0.11597676379497396, 'glucose'),
 (0.11411974940505758, 'totChol'),
 (0.11031162956352512, 'diaBP'),
 (0.09203469908252443, 'heartRate'),
 (0.05953857175147752, 'cigsPerDay'),
 (0.03736546068556977, 'education'),
 (0.025314681805582454, 'sex'),
 (0.021510157944709497, 'prevalentHyp')]

# Easy Ensemble Classifier

In [26]:
#Train the EasyEnsembleClassifier on the unscaled training data
from imblearn.ensemble import EasyEnsembleClassifier
eec=EasyEnsembleClassifier(n_estimators=100, random_state=1)
#y_train is a single-column frame; flatten it to the 1-D target the estimator
#expects instead of relying on an implicit conversion with a silenced warning
eec.fit(X_train, np.ravel(y_train))

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [27]:
# Predict on the unscaled test set and calculate the balanced accuracy score
y_pred=eec.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6599189720388637

In [28]:
# Display the confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[505, 271],
       [ 46,  93]], dtype=int64)

In [29]:
# Print the standard scikit-learn classification report
# (precision, recall, f1-score and support per class)
eec_report=classification_report(y_test, y_pred)
print(eec_report)

              precision    recall  f1-score   support

           0       0.92      0.65      0.76       776
           1       0.26      0.67      0.37       139

    accuracy                           0.65       915
   macro avg       0.59      0.66      0.57       915
weighted avg       0.82      0.65      0.70       915



# Random oversampling

In [30]:
# Randomly over-sample the minority class of the scaled training data
# (RandomOverSampler is already imported in the notebook's first cell)
ros=RandomOverSampler(random_state=1)
X_resampled, y_resampled=ros.fit_resample(X_train_scaled, y_train)


In [31]:
#Shapes of the over-sampled features and target
for resampled_part in (X_resampled, y_resampled):
    print(resampled_part.shape)

(4650, 11)
(4650, 1)


In [32]:
#Fit logistic regression on the randomly over-sampled data.
#scikit-learn expects a 1-D target; y_resampled is a single-column frame, so it is
#flattened explicitly rather than relying on the implicit conversion (whose
#DataConversionWarning is hidden by the global warnings filter).
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, np.ravel(y_resampled))

LogisticRegression(random_state=1)

In [33]:
#Predict on the scaled test set and show the confusion matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[504, 272],
       [ 47,  92]], dtype=int64)

In [34]:
#Display the confusion matrix as a labelled DataFrame
cm=confusion_matrix(y_test, y_pred)
row_labels=['Actual 0','Actual 1']
col_labels=['Predicted 0', 'Predicted 1']
cm_df=pd.DataFrame(data=cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,504,272
Actual 1,47,92


In [35]:
#Calculate the balanced accuracy score of the test-set predictions
balanced_accuracy_score(y_test, y_pred)

0.6556775198397983

In [36]:
#Build the classification report, then print it
ros_report=classification_report(y_test, y_pred)
print(ros_report)

              precision    recall  f1-score   support

           0       0.91      0.65      0.76       776
           1       0.25      0.66      0.37       139

    accuracy                           0.65       915
   macro avg       0.58      0.66      0.56       915
weighted avg       0.81      0.65      0.70       915



# SMOTE and Logistic Regression

In [37]:
#Over-sample the minority class of the scaled training data with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [38]:
#Shapes of the SMOTE-resampled features and target
for resampled_part in (X_resampled, y_resampled):
    print(resampled_part.shape)

(4650, 11)
(4650, 1)


In [39]:
#Fit logistic regression on the SMOTE-resampled data.
#scikit-learn expects a 1-D target; y_resampled is a single-column frame, so it is
#flattened explicitly rather than relying on the implicit conversion (whose
#DataConversionWarning is hidden by the global warnings filter).
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, np.ravel(y_resampled))

LogisticRegression(random_state=1)

In [40]:
#Predict on the scaled test set and calculate the balanced accuracy score
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6537445301490766

In [41]:

#Display the confusion matrix (rows: actual class, columns: predicted class)
confusion_matrix(y_test, y_pred)

array([[501, 275],
       [ 47,  92]], dtype=int64)

In [42]:
#Display the confusion matrix as a labelled DataFrame
cm=confusion_matrix(y_test, y_pred)
row_labels=['Actual 0','Actual 1']
col_labels=['Predicted 0', 'Predicted 1']
cm_df=pd.DataFrame(data=cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,501,275
Actual 1,47,92


In [43]:
# Build the imbalanced classification report, then print it
imbalanced_report=classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.65      0.66      0.76      0.65      0.43       776
          1       0.25      0.66      0.65      0.36      0.65      0.43       139

avg / total       0.81      0.65      0.66      0.70      0.65      0.43       915



# Neural Network

In [44]:
#Define the basic neural network model

#Seed TensorFlow before any layer is created so that the random weight
#initialisation is repeatable between runs, in line with the random_state=1
#used by the other models in this notebook
tf.random.set_seed(1)

#One input per scaled feature column
number_input_features=X_train_scaled.shape[1]

nn_model=tf.keras.models.Sequential()

#First hidden layer
nn_model.add(tf.keras.layers.Dense(units=80, input_dim=number_input_features, activation='relu'))

#Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=30, activation='relu'))

#Third hidden layer
nn_model.add(tf.keras.layers.Dense(units=30, activation='relu'))

#Output layer: a single sigmoid unit for the binary TenYearCHD target
nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

#Check the structure of the model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                960       
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 30)                930       
                                                                 
 dense_3 (Dense)             (None, 1)                 31        
                                                                 
Total params: 4,351
Trainable params: 4,351
Non-trainable params: 0
_________________________________________________________________


In [45]:
#Compile with binary cross-entropy loss, the Adam optimizer and accuracy tracking
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [46]:
#Fit the network on the scaled training data for 100 epochs
nn_model.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x249760b5288>

In [47]:
#Evaluate the model using the test data
model_loss, model_accuracy=nn_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f'Loss: {model_loss}, Accuracy:{model_accuracy}')

#Plain accuracy is misleading on this imbalanced target, so also report the
#balanced accuracy that is used for every other model in this notebook.
#The sigmoid output is thresholded at 0.5 to obtain class labels.
nn_pred=(nn_model.predict(X_test_scaled, verbose=0) > 0.5).astype(int).ravel()
print(f'Balanced accuracy: {balanced_accuracy_score(y_test, nn_pred)}')

29/29 - 0s - loss: 1.4517 - accuracy: 0.7541 - 194ms/epoch - 7ms/step
Loss: 1.451681137084961, Accuracy:0.7540983557701111
