In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
import sqlalchemy as sa
from sqlalchemy import create_engine
import psycopg2 as pg
from imblearn.over_sampling import RandomOverSampler

In [2]:
#Connect to Postgresql database 
conn=sa.create_engine('postgresql://root:postgres@dataanalyticsdb.cxnhjzyey4ka.us-east-2.rds.amazonaws.com:5432/coursefinalproject')

In [3]:
#Load and display data
df=pd.read_sql_query("SELECT * FROM fragmingham", con=conn)
df

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [4]:
#Drop unncessary columns and display DataFrame
df.drop(columns=['education','cigsPerDay','prevalentStroke'], inplace=True)
df

Unnamed: 0,sex,age,currentSmoker,BPMeds,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,0.0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,0.0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,0.0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1,0.0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,1,0.0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,0,0.0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,0,0.0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [5]:
columns=['sex','age','currentSmoker','BPMeds','prevalentHyp',
         'diabetes','totChol','sysBP','diaBP','BMI','heartRate','glucose',
        'TenYearCHD']

# Create features and split data into training and testing

In [6]:
#Create features
X=df.copy()
X=X.drop(columns='TenYearCHD')

#Create target
y=df[['TenYearCHD']]

y.shape

(3658, 1)

In [7]:
#Check balance of target values
y['TenYearCHD'].value_counts()

0    3101
1     557
Name: TenYearCHD, dtype: int64

In [8]:
#Split data into Train and Test Sets
X_train, X_test, y_train, y_test=train_test_split(X,y, random_state=1, stratify=y)
Counter(y_train)

Counter({'TenYearCHD': 1})

# Scale the data

In [14]:
#Create a StandardScaler instance
scaler=StandardScaler()

#Fit StandardScaler()
X_scaler=scaler.fit(X_train)

#Scale the data
X_train_scaled=X_scaler.transform(X_train)

X_test_scaled=X_scaler.transform(X_test)

# Resample with Random oversampler

In [15]:
#Implement ramdom oversampling
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

In [16]:
#Train the logistic regression model
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [17]:
#Make predictions and display confusion matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[522, 254],
       [ 45,  94]], dtype=int64)

In [18]:
#Display confusion maxtrix as a DataFrame
cm=confusion_matrix(y_test, y_pred)
cm_df=pd.DataFrame(cm, index=['Actual 0','Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,522,254
Actual 1,45,94


In [19]:
#Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6744697025884447

In [20]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.67      0.68      0.78      0.67      0.45       776
          1       0.27      0.68      0.67      0.39      0.67      0.46       139

avg / total       0.82      0.67      0.68      0.72      0.67      0.45       915

