In [1]:
#Import dependencies
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
import sqlalchemy as sa
from sqlalchemy import create_engine
import psycopg2 as pg

# Connect to database

In [2]:
#Connect to Postgresql database 
conn=sa.create_engine('postgresql://root:postgres@dataanalyticsdb.cxnhjzyey4ka.us-east-2.rds.amazonaws.com:5432/coursefinalproject')

# Load data and perform data cleaning

In [3]:
#Load and display data
df=pd.read_sql_query("SELECT * FROM fragmingham", con=conn)
df

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [4]:
#Drop unncessary columns and display DataFrame
df.drop(columns=['education','cigsPerDay','prevalentStroke'], inplace=True)
df

Unnamed: 0,sex,age,currentSmoker,BPMeds,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,0.0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,0.0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,0.0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1,0.0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,1,0.0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,0,0.0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,0,0.0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


# Split the data into training and testing

In [5]:
#Create features
X=df.copy()
X=X.drop(columns='TenYearCHD')

#Create target
y=df[['TenYearCHD']]

y.shape

(3658, 1)

In [6]:
#Check balance of target values
y['TenYearCHD'].value_counts()

0    3101
1     557
Name: TenYearCHD, dtype: int64

In [7]:
#Split data into Train and Test Sets
X_train, X_test, y_train, y_test=train_test_split(X,y, random_state=1)
Counter(y_train)

Counter({'TenYearCHD': 1})

# Scale the  data

In [8]:
#Create a StandardScaler instance
scaler=StandardScaler()

In [9]:
#Fit Standard Scaler
X_scaler=scaler.fit(X_train)

In [10]:
#Scaling data
X_train_scaled=X_scaler.transform(X_train)
X_test_scaled=X_scaler.transform(X_test)

# Resample data (Oversampling/Undersampling/Combination) Perform Logistic Regression

In [11]:
#Resample the training data SMOTEENN
smote_enn=SMOTEENN(random_state=1)
X_resampled, y_resampled=smote_enn.fit_resample(X,y)

In [12]:
#Train the model
model=LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_train)

LogisticRegression(random_state=1)

In [13]:
#Calculate de balanced accuracy score
y_pred=model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.526336405529954

In [14]:
#Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[766,   9],
       [131,   9]], dtype=int64)

In [15]:
#Display confusion maxtrix as a DataFrame
cm=confusion_matrix(y_test, y_pred)
cm_df=pd.DataFrame(cm, index=['Actual 0','Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,766,9
Actual 1,131,9


In [16]:
#Print the imbalanced classification report
print (classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.85      0.99      0.06      0.92      0.25      0.07       775
          1       0.50      0.06      0.99      0.11      0.25      0.06       140

avg / total       0.80      0.85      0.21      0.79      0.25      0.07       915



In [17]:
#List the features sorted in descending order by feature importance


#  As an alternative to the previous step use Balanced Random Forest Classifier

In [18]:
#Train the EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf=BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_resampled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [19]:
#Calculate the balanced accuracy score
y_pred=brf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.6502534562211981

In [20]:
#Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[482, 293],
       [ 45,  95]], dtype=int64)

In [21]:
#Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.62      0.68      0.74      0.65      0.42       775
          1       0.24      0.68      0.62      0.36      0.65      0.42       140

avg / total       0.81      0.63      0.67      0.68      0.65      0.42       915



In [22]:
#List the features sorted in descending order by feature importance
importances=brf.feature_importances_
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.1681709331410142, 'age'),
 (0.14374705074282867, 'sysBP'),
 (0.13530959434657322, 'BMI'),
 (0.12265783107374886, 'diaBP'),
 (0.12251139387393628, 'totChol'),
 (0.11791672550690346, 'glucose'),
 (0.1040862076973327, 'heartRate'),
 (0.02586373046174877, 'sex'),
 (0.025220955496446013, 'currentSmoker'),
 (0.021022730431335887, 'prevalentHyp'),
 (0.007034580143786297, 'BPMeds'),
 (0.0064582670843457615, 'diabetes')]

# Easy Ensemble Classifier

In [23]:
from imblearn.ensemble import EasyEnsembleClassifier
eec=EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [24]:
# Calculated the balanced accuracy score
y_pred=eec.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.6633410138248848

In [25]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[458, 317],
       [ 37, 103]], dtype=int64)

In [26]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.59      0.74      0.72      0.66      0.43       775
          1       0.25      0.74      0.59      0.37      0.66      0.44       140

avg / total       0.82      0.61      0.71      0.67      0.66      0.43       915

