In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import classification_report
from imblearn.combine import SMOTEENN
import sqlalchemy as sa
from sqlalchemy import create_engine
import psycopg2 as pg
from imblearn.over_sampling import RandomOverSampler

# Preliminary data preprocessing
For the data preprocessing, the data was loaded from a PostgreSQL database hosted on AWS, accessed through SQLAlchemy. The data was checked for duplicate and null values: rows with null values were dropped, and there were no duplicate entries. The column 'male' was renamed to 'sex' for clarity. This part of the preprocessing was done when the data was imported into the database.

In [2]:
#Connect to Postgresql database
#The connection string (user, password, host, database) is read from the
#DATABASE_URL environment variable so that credentials are not stored in the notebook.
#Expected format: postgresql://<user>:<password>@<host>:5432/<database>
db_url = os.environ.get('DATABASE_URL')
if not db_url:
    raise RuntimeError('DATABASE_URL is not set; export the PostgreSQL connection string before running this notebook.')
conn = sa.create_engine(db_url)

In [3]:
#Read the whole table into a DataFrame and display it
query = "SELECT * FROM fragmingham"
df = pd.read_sql_query(query, con=conn)
df

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [4]:
#Count the missing values in each column
df.isna().sum()

sex                0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [5]:
#Compute and display descriptive statistics for the dataset
summary_stats = df.describe()
summary_stats

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0
mean,0.443685,49.551941,1.980317,0.489065,9.025424,0.030344,0.005741,0.311646,0.027064,236.847731,132.370558,82.917031,25.782802,75.730727,81.852925,0.152269
std,0.496886,8.562029,1.022656,0.499949,11.92159,0.171557,0.075561,0.463229,0.162292,44.097681,22.086866,11.974258,4.065601,11.981525,23.904164,0.359331
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,143.875,90.0,28.0375,82.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


The dataset contains two similar features: whether the participant smokes (*currentSmoker*) and the number of cigarettes the participant smokes per day (*cigsPerDay*). *currentSmoker* is categorical (0=non-smoker, 1=smoker) and *cigsPerDay* is continuous (float64). If *currentSmoker* is 0, then *cigsPerDay* would be 0. These two features seemed to be redundant in the dataset, so *currentSmoker* was dropped in favour of *cigsPerDay*, as it was considered that the number of cigarettes smoked per day could have more weight on the outcome.

In [6]:
#Drop unnecessary columns and display DataFrame
#Reassign instead of mutating in place, and ignore an already-missing column,
#so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['currentSmoker'], errors='ignore')
df

Unnamed: 0,sex,age,education,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


# Create features and split data into training and testing
- Our target is *TenYearCHD* which is the result of the Framingham risk score that determines whether a person is at risk of developing coronary heart disease in 10 years. The values of the target are categorical as int64 (0=is not at risk of developing CHD in ten years/1=at risk of developing CHD in ten years).
- The remaining features are: *sex, age, education, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate and glucose* (*currentSmoker* was dropped above).

In [7]:
#Target: the TenYearCHD column, kept as a one-column DataFrame
y = df[['TenYearCHD']]

#Features: every column except the target (drop returns a new DataFrame)
X = df.drop(columns='TenYearCHD')

y.shape

(3658, 1)

In [8]:
#Count how many samples fall into each target class
y.loc[:, 'TenYearCHD'].value_counts()

0    3101
1     557
Name: TenYearCHD, dtype: int64

# How data was split into training and testing sets
The data was split into training and testing sets using the scikit-learn function `train_test_split`. Given the imbalance of the target class *(TenYearCHD)* in the dataset, the split was stratified on the target `(stratify=y)` so that both sets keep the same class proportions.



In [9]:
#Stratified split into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)


In [10]:
#Report the shape of each split, one per line
print(
    f'X_Train: {X_train.shape}',
    f'X_test: {X_test.shape}',
    f'y_train: {y_train.shape}',
    f'y_test: {y_test.shape}',
    sep='\n',
)

X_Train: (2743, 14)
X_test: (915, 14)
y_train: (2743, 1)
y_test: (915, 1)


# Scale the data
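Given the difference in value ranges, and that some features are continuous while others are categorical, the data was standardized using the *scikit-learn* `StandardScaler`. The scaler was fit on the training set only and then applied to both the training and testing sets.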

In [11]:
#Create the scaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

#Apply the fitted scaler to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# Resample with Random oversampler
Given that the dataset is imbalanced, we resampled the training data using `RandomOverSampler` from the *imblearn* library.

In [12]:
#Implement random oversampling on the scaled training data
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(
    X=X_train_scaled,
    y=y_train,
)

In [13]:
#Report the number of resampled rows, one line per array
print(
    f'X_resampled: {len(X_resampled)}',
    f'y_resampled: {len(y_resampled)}',
    sep='\n',
)

X_resampled: 4650
y_resampled: 4650


In [14]:
#Class counts of the target after oversampling
resampled_counts = y_resampled.value_counts()
resampled_counts

TenYearCHD
0             2325
1             2325
dtype: int64

# Model choice
The first Machine Learning model we chose is *Logistic Regression* as we are trying to predict a discrete binary outcome. *Logistic regression* is easier to implement, interpret, and very efficient to train. It can have good accuracy for many simple data sets and it performs well when the dataset is linearly separable. Although we believe Logistic regression might work in this case, logistic regression inherently runs on a linear model and there are other models available like Naive Bayes and SVM which could also prove useful.

In [15]:
#Train the logistic regression model
#y_resampled is a single-column DataFrame; flatten it to 1-D because scikit-learn
#expects y of shape (n_samples,) and otherwise emits a DataConversionWarning.
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, np.ravel(y_resampled))

LogisticRegression(random_state=1)

In [16]:
#Predict on the scaled test features, then show the confusion matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[510, 266],
       [ 44,  95]], dtype=int64)

In [17]:
#Display the confusion matrix as a labelled DataFrame
row_labels = ['Actual 0', 'Actual 1']
col_labels = ['Predicted 0', 'Predicted 1']
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,510,266
Actual 1,44,95


In [18]:
# Print the classification report (per-class precision, recall and f1-score)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.66      0.77       776
           1       0.26      0.68      0.38       139

    accuracy                           0.66       915
   macro avg       0.59      0.67      0.57       915
weighted avg       0.82      0.66      0.71       915

