In [1]:
#Import dependencies
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
import sqlalchemy as sa
from sqlalchemy import create_engine
import psycopg2 as pg
from sklearn.metrics import classification_report
import tensorflow as tf
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.svm import SVC

# Preliminary data preprocessing
For the data preprocessing the data was imported from the database in *AWS* and connected to *postgresq*l using *sqlalchemy*. The data was checked for duplicate and null values. The null valued were dropped and there were no duplicate entries. The column 'male' was changed to 'sex' for clarity. This part of the preprocessing was done when the data was imported to the database.

In [2]:
#Connect to Postgresql database 
conn=sa.create_engine('postgresql://root:postgres@dataanalyticsdb.cxnhjzyey4ka.us-east-2.rds.amazonaws.com:5432/coursefinalproject')

In [3]:
#Load and display data
df=pd.read_sql_query("SELECT * FROM fragmingham", con=conn)
df

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [4]:
#Diplay column names
df.columns

Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [5]:
#Explore data types
df.dtypes

sex                  int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

In [6]:
#Explore for null value
df.isnull().sum()

sex                0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [7]:
#Find duplicates
print(f'Duplicate entries:{df.duplicated().sum()}')

Duplicate entries:0


- The dataset contains two similar features, one of them is if the participant smokes *currentSmoker* and number of cigarettes per day *cigsPerDay* that the person smokes. *currenSmoker* is categorical (0=non-smoker or 1=smoker) and *cigsPerDay* is continuous (float64). If *currentSmoker* is 0, then *cigsPerDay* would be 0. These two features showed high correlation in the exploratory data analysis  so *currentSmoker* was dropped in favour of *cigsPerDay*.
- After running the *Balanced Random Forest Classifier* for the first time and checking the *importances* we decided to drop some additional columns as their value did not even reach 0.01. 

In [8]:
#Drop unncessary columns and display DataFrame
df.drop(columns=['currentSmoker', 'prevalentStroke','BPMeds','diabetes' ], inplace=True)
df

Unnamed: 0,sex,age,education,cigsPerDay,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,20.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,30.0,1,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,23.0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1.0,1,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,43.0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0.0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0.0,1,185.0,141.0,98.0,25.60,67.0,72.0,0


In [9]:
#Display basic statistical info
df.describe()

Unnamed: 0,sex,age,education,cigsPerDay,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0
mean,0.443685,49.551941,1.980317,9.025424,0.311646,236.847731,132.370558,82.917031,25.782802,75.730727,81.852925,0.152269
std,0.496886,8.562029,1.022656,11.92159,0.463229,44.097681,22.086866,11.974258,4.065601,11.981525,23.904164,0.359331
min,0.0,32.0,1.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,20.0,1.0,263.0,143.875,90.0,28.0375,82.0,87.0,0.0
max,1.0,70.0,4.0,70.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


In [10]:
#Locate outliers for totChol
df.loc[df.totChol>500]

Unnamed: 0,sex,age,education,cigsPerDay,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
966,0,52,2.0,0.0,1,600.0,159.5,94.0,28.27,78.0,140.0,1


- The extreme outlier of 600 for *totChol* was dropped.

In [11]:
#Drop extreme values for totChol
df=df.drop(df[df['totChol']>500].index)
df.reset_index(drop=True)

Unnamed: 0,sex,age,education,cigsPerDay,prevalentHyp,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,20.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,30.0,1,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,23.0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3652,1,50,1.0,1.0,1,313.0,179.0,92.0,25.97,66.0,86.0,1
3653,1,51,3.0,43.0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3654,0,52,2.0,0.0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3655,1,40,3.0,0.0,1,185.0,141.0,98.0,25.60,67.0,72.0,0


# Create features and split data into training and testing
- Our target is *TenYearCHD* which is the result of the Framingham risk score that determines whether a person is at risk of developing coronary heart disease in 10 years. The values of the target are categorical as int64 (0=is not at risk of developing CHD in ten years/1=at risk of developing CHD in ten years).
- The rest of the features are: *sex, age, education, cigsPerDay, prevalentHyp, totChol, sysBP, diaBP, BMI, heartRate and glucose*.

In [12]:
#Create features
X=df.copy()
X=X.drop(columns='TenYearCHD')

#Create target
y=df[['TenYearCHD']]

y.shape

(3657, 1)

In [13]:
#Check balance of target values
y['TenYearCHD'].value_counts()

0    3101
1     556
Name: TenYearCHD, dtype: int64

# How data was split into training and testing sets
The data was split into training and testing sets using the *scikitlearn* module `test_train_split()`. Given the imbalance in the dataset in the target class (*TenYearCHD*) the data was stratified (`stratify=y`) during the split.

In [14]:
#Split data into Train and Test Sets
X_train, X_test, y_train, y_test=train_test_split(X,y, random_state=1, stratify=y)
print(f'Xtrain:{X_train.shape}')
print(f'y_train:{y_train.shape}')
print(f'X_test:{X_test.shape}')
print(f'y_test:{y_test.shape}')

Xtrain:(2742, 11)
y_train:(2742, 1)
X_test:(915, 11)
y_test:(915, 1)


# Scale the  data
Given the difference in values and that some features are continuous and some are categorical the data was standardized using *scikitlearn* `StandardScaler`.

In [15]:
#Create a StandardScaler instance
scaler=StandardScaler()

#Fit StandardScaler()
X_scaler=scaler.fit(X_train)

#Scaling data
X_train_scaled=X_scaler.transform(X_train)

X_test_scaled=X_scaler.transform(X_test)

# First model choice
The first Machine Learning model we chose is *Logistic Regression* as we are trying to predict a discrete binary outcome. *Logistic regression* is easier to implement, interpret, and very efficient to train. It can have good accuracy for many simple data sets and it performs well when the dataset is linearly separable. Although we believe *Logistic regression* might work in this case, logistic regression inherently runs on a linear model and there are other models available which could also prove useful.

##  Resample with  Random oversampling
Given that the dataset is imbalanced we resampled the data using *RandomOverSampler* from the *imblearn* library.

In [16]:
# Implement random oversampling
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)


In [17]:
print(X_resampled.shape)
print(y_resampled.shape)

(4650, 11)
(4650, 1)


In [18]:
#Perform Logistic Regression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [19]:
#Make Predictions
y_pred = model.predict(X_test_scaled)

In [20]:
#Display confusion maxtrix as a DataFrame
cm=confusion_matrix(y_test, y_pred)
cm_df=pd.DataFrame(cm, index=['Actual 0','Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,499,277
Actual 1,42,97


In [21]:
#Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.64      0.76       776
           1       0.26      0.70      0.38       139

    accuracy                           0.65       915
   macro avg       0.59      0.67      0.57       915
weighted avg       0.82      0.65      0.70       915



# Additional Models for comparisson

## SMOTEENN and Logistic Regression

In [22]:
#Resample the training data SMOTEENN
smote_enn=SMOTEENN(random_state=1)
X_resampled, y_resampled=smote_enn.fit_resample(X_train_scaled,y_train)


In [23]:
print(X_resampled.shape)
print(y_resampled.shape)

(3489, 11)
(3489, 1)


In [24]:
#Train the model
model=LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [25]:
#Make Predictions
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[362, 414],
       [ 22, 117]], dtype=int64)

In [26]:
#Display confusion maxtrix as a DataFrame
cm=confusion_matrix(y_test, y_pred)
cm_df=pd.DataFrame(cm, index=['Actual 0','Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,362,414
Actual 1,22,117


In [27]:
#Print the classification report
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.47      0.62       776
           1       0.22      0.84      0.35       139

    accuracy                           0.52       915
   macro avg       0.58      0.65      0.49       915
weighted avg       0.83      0.52      0.58       915



##   Balanced Random Forest Classifier 

In [28]:
#Train the Balanced Random Forest Classifier
brf=BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [29]:
#Make predictions
y_pred=brf.predict(X_test_scaled)

In [30]:
#Display the confusion matrix
cm=confusion_matrix(y_test, y_pred)
cm_df=pd.DataFrame(cm, index=['Actual 0','Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,488,288
Actual 1,44,95


In [31]:
#Print the  classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.63      0.75       776
           1       0.25      0.68      0.36       139

    accuracy                           0.64       915
   macro avg       0.58      0.66      0.56       915
weighted avg       0.82      0.64      0.69       915



In [32]:
#List the features sorted in descending order by feature importance
importances=brf.feature_importances_
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.16351460455779293, 'age'),
 (0.1405446659512617, 'sysBP'),
 (0.11945616965365684, 'BMI'),
 (0.11898302288134119, 'totChol'),
 (0.11379731879715406, 'diaBP'),
 (0.11193782917345585, 'glucose'),
 (0.09090217881065187, 'heartRate'),
 (0.05585240785767976, 'cigsPerDay'),
 (0.03781316162465326, 'education'),
 (0.02605843137640234, 'sex'),
 (0.02114020931595029, 'prevalentHyp')]

##  Neural Network

In [33]:
#Define the basic neural network model
number_input_features=len(X_train_scaled[0])

nn_model=tf.keras.models.Sequential()

#First hidden layer
nn_model.add(tf.keras.layers.Dense(units=80, input_dim=number_input_features, activation='relu'))

#Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=30, activation='relu'))

#Third hidden layer
nn_model.add(tf.keras.layers.Dense(units=30, activation='relu'))

#Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

#Check the structure of the model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                960       
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 30)                930       
                                                                 
 dense_3 (Dense)             (None, 1)                 31        
                                                                 
Total params: 4,351
Trainable params: 4,351
Non-trainable params: 0
_________________________________________________________________


In [34]:
#Compile the model
nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [35]:
#Train the model
nn_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x15705c3eb48>

In [36]:
#Evaluate the model using th test data
model_loss, model_accuracy=nn_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f'Loss: {model_loss}, Accuracy:{model_accuracy}')

29/29 - 1s - loss: 1.1951 - accuracy: 0.8000 - 1s/epoch - 47ms/step
Loss: 1.1951147317886353, Accuracy:0.800000011920929


In [39]:
y_pred=nn_model.predict(X_test_scaled)
y_pred=np.argmax(y_pred, axis=1)



In [41]:
#Display the confusion matrix
cm=confusion_matrix(y_test, y_pred)
cm_df=pd.DataFrame(cm, index=['Actual 0','Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,776,0
Actual 1,139,0


In [42]:
#Print the  classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       776
           1       0.00      0.00      0.00       139

    accuracy                           0.85       915
   macro avg       0.42      0.50      0.46       915
weighted avg       0.72      0.85      0.78       915

