In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# loading dataset:
data_file = r'C:\Users\AnitaM\Downloads\framingham.csv'
hd = pd.read_csv(data_file)

In [3]:
hd.shape

(4238, 16)

In [4]:
hd.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [5]:
# Split the data into train and test:
from sklearn.model_selection import train_test_split
hd_train, hd_test = train_test_split(hd, test_size=0.2, random_state=2, stratify=hd['TenYearCHD'])

In [6]:
hd_train.shape

(3390, 16)

In [7]:
hd_test.shape

(848, 16)

In [8]:
# Filling NAs in train and test:

hd_train['education'].fillna(hd_train['education'].median(), inplace=True)
hd_train['cigsPerDay'].fillna(hd_train['cigsPerDay'].median(), inplace=True)
hd_train['totChol'].fillna(hd_train['totChol'].median(), inplace=True)
hd_train['BMI'].fillna(hd_train['BMI'].median(), inplace=True)
hd_train['heartRate'].fillna(hd_train['heartRate'].median(), inplace=True)
hd_train['glucose'].fillna(hd_train['glucose'].median(), inplace=True)

hd_test['education'].fillna(hd_test['education'].median(), inplace=True)
hd_test['cigsPerDay'].fillna(hd_test['cigsPerDay'].median(), inplace=True)
hd_test['totChol'].fillna(hd_test['totChol'].median(), inplace=True)
hd_test['BMI'].fillna(hd_test['BMI'].median(), inplace=True)
hd_test['heartRate'].fillna(hd_test['heartRate'].median(), inplace=True)
hd_test['glucose'].fillna(hd_test['glucose'].median(), inplace=True)

hd_train.dropna(inplace=True)
hd_test.dropna(inplace=True)

In [9]:
# Standardize the data:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

num_features = ['age', 'education', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']

scaler.fit(hd_train[num_features])
scaled_train = scaler.transform(hd_train[num_features])

for i,col in enumerate(num_features):
    hd_train[col] = scaled_train[:,i]
    
scaled_test = scaler.fit_transform(hd_test[num_features])
for i,col in enumerate(num_features):
    hd_test[col] = scaled_test[:,i]

In [10]:
hd_train.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
2259,1,-1.106479,1.008149,1,2.624769,0.0,0,0,0,-0.657985,-0.563928,-0.243709,0.053617,-0.087331,-0.669347,0
3035,1,0.402902,2.001454,0,-0.760594,0.0,0,0,0,-0.276325,-1.087157,-0.835471,0.1952,-1.840087,-0.367038,0
1241,1,0.170689,-0.978462,1,0.932088,0.0,0,1,0,0.195138,-0.199942,-1.173621,0.44856,0.329992,-0.410225,1
2584,0,-0.75816,-0.978462,1,-0.67596,0.0,0,0,0,0.419644,-0.154444,0.263517,0.031261,1.665425,-0.410225,0
3528,1,1.331752,2.001454,1,0.508917,0.0,0,0,0,-0.45593,-0.063447,0.009904,0.172845,-1.172371,-0.021542,1


In [11]:
hd_test.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
2007,1,1.736748,-0.960855,1,0.506582,1.0,0,1,0,0.748666,2.687087,-0.15762,-0.148502,1.608052,-0.184775,1
42,0,0.293686,-0.960855,0,-0.72086,1.0,0,1,0,-0.045815,0.732985,0.753831,-0.205324,-0.422219,-0.139352,1
1298,1,-1.149375,-0.960855,0,-0.72086,0.0,0,1,0,-1.377739,1.869091,-1.980524,0.481282,-0.422219,-0.321041,1
3372,1,0.654452,1.032117,1,0.506582,0.0,0,0,0,-0.443056,-0.471288,-0.116191,0.590192,-0.341008,0.723669,0
2492,0,1.135472,-0.960855,1,0.097435,1.0,0,1,0,1.823552,1.959979,-0.074761,3.28216,3.638323,0.133181,1


In [12]:
hd_train.reset_index(drop=True, inplace=True)
hd_test.reset_index(drop=True, inplace=True)

x_train = hd_train.drop('TenYearCHD', 1)
y_train = hd_train['TenYearCHD']

x_test = hd_test.drop('TenYearCHD', 1)
y_test = hd_test['TenYearCHD']

In [15]:
x_train.shape

(3346, 15)

In [16]:
y_train.shape

(3346,)

In [17]:
x_test.shape

(839, 15)

In [18]:
y_test.shape

(839,)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

### Without balancing the imbalaced class

In [19]:
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)
print(classification_report(y_test, lr_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       712
           1       0.82      0.07      0.13       127

    accuracy                           0.86       839
   macro avg       0.84      0.53      0.53       839
weighted avg       0.85      0.86      0.80       839



In [21]:
dtree_model = DecisionTreeClassifier()
dtree_model.fit(x_train, y_train)
print(classification_report(y_test, dtree_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.87      0.84      0.86       712
           1       0.24      0.28      0.26       127

    accuracy                           0.76       839
   macro avg       0.55      0.56      0.56       839
weighted avg       0.77      0.76      0.76       839



In [22]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
print(classification_report(y_test, rf_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.85      0.99      0.92       712
           1       0.47      0.06      0.10       127

    accuracy                           0.85       839
   macro avg       0.66      0.52      0.51       839
weighted avg       0.80      0.85      0.79       839



In [23]:
xgb_model = XGBClassifier()
xgb_model.fit(x_train, y_train)
print(classification_report(y_test, xgb_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       712
           1       0.38      0.14      0.21       127

    accuracy                           0.83       839
   macro avg       0.62      0.55      0.56       839
weighted avg       0.79      0.83      0.80       839



In [24]:
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)
print(classification_report(y_test, knn_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       712
           1       0.32      0.09      0.15       127

    accuracy                           0.83       839
   macro avg       0.59      0.53      0.53       839
weighted avg       0.77      0.83      0.79       839



In [25]:
svm_model = SVC()
svm_model.fit(x_train, y_train)
print(classification_report(y_test, svm_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       712
           1       1.00      0.02      0.03       127

    accuracy                           0.85       839
   macro avg       0.93      0.51      0.48       839
weighted avg       0.87      0.85      0.78       839



In [26]:
# Recall values without balancing:
# lr_model : 0.07
# dtree_model : 0.28
# rf_model : 0.06
# xgb_model : 0.14
# knn_model : 0.09
# svm_model : 0.02

### Balancing the imbalanced class

In [27]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

Before OverSampling, counts of label '1': 506
Before OverSampling, counts of label '0': 2840 



In [29]:
# Balancing using SMOTE:
sm = SMOTE(random_state=2)
x_train_smote, y_train_smote = sm.fit_resample(x_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(x_train_smote.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_smote.shape))
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_smote == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_smote == 0)))

After OverSampling, the shape of train_X: (5680, 15)
After OverSampling, the shape of train_y: (5680,) 

After OverSampling, counts of label '1': 2840
After OverSampling, counts of label '0': 2840


In [30]:
lr_model = LogisticRegression()
lr_model.fit(x_train_smote, y_train_smote)
print(classification_report(y_test, lr_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.92      0.65      0.76       712
           1       0.26      0.68      0.37       127

    accuracy                           0.66       839
   macro avg       0.59      0.67      0.57       839
weighted avg       0.82      0.66      0.70       839



In [31]:
dtree_model = DecisionTreeClassifier()
dtree_model.fit(x_train_smote, y_train_smote)
print(classification_report(y_test, dtree_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.88      0.38      0.53       712
           1       0.17      0.70      0.27       127

    accuracy                           0.43       839
   macro avg       0.52      0.54      0.40       839
weighted avg       0.77      0.43      0.49       839



In [32]:
xgb_model = XGBClassifier()
xgb_model.fit(x_train_smote, y_train_smote)
print(classification_report(y_test, xgb_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.87      0.12      0.20       712
           1       0.15      0.91      0.26       127

    accuracy                           0.23       839
   macro avg       0.51      0.51      0.23       839
weighted avg       0.76      0.23      0.21       839



In [33]:
knn_model = KNeighborsClassifier()
knn_model.fit(x_train_smote, y_train_smote)
print(classification_report(y_test, knn_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.88      0.64      0.74       712
           1       0.20      0.52      0.29       127

    accuracy                           0.62       839
   macro avg       0.54      0.58      0.52       839
weighted avg       0.78      0.62      0.67       839



In [34]:
svm_model = SVC()
svm_model.fit(x_train_smote, y_train_smote)
print(classification_report(y_test, svm_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.88      0.69      0.77       712
           1       0.21      0.46      0.29       127

    accuracy                           0.65       839
   macro avg       0.54      0.57      0.53       839
weighted avg       0.78      0.65      0.70       839



In [35]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train_smote, y_train_smote)
print(classification_report(y_test, rf_model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.89      0.57      0.70       712
           1       0.20      0.62      0.31       127

    accuracy                           0.58       839
   macro avg       0.55      0.60      0.50       839
weighted avg       0.79      0.58      0.64       839



In [36]:
# Recall values with balancing:
# lr_model : 0.68
# dtree_model : 0.70
# rf_model : 0.62
# xgb_model : 0.91
# knn_model : 0.52
# svm_model : 0.46

# the accuracy of the models have also come down.

# Use sampling to balance the class