In [3]:
# import packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

# import dataset

data = pd.read_csv('framingham.csv')
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# view dataset
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [5]:
# check dimensions
data.shape


(4240, 16)

In [6]:
# removing duplicates
data.drop_duplicates()
data.shape


(4240, 16)

In [7]:
# checking for blank cells
data.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [8]:
data.fillna(0, inplace = True)

In [9]:
# checking for blank cells
data.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [10]:
# get the mean, standard deviation, count, mix and max of the dataset
data.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0
mean,0.429245,49.580189,1.930425,0.494104,8.94434,0.029245,0.005896,0.310613,0.025708,233.908255,132.354599,82.897759,25.685184,75.861085,74.463208,0.151887
std,0.495027,8.572942,1.053026,0.500024,11.904777,0.168513,0.076569,0.462799,0.15828,51.166237,22.0333,11.910394,4.420501,12.080265,32.862256,0.358953
min,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.5,48.0,0.0,0.0,0.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,205.0,117.0,75.0,23.05,68.0,68.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,233.0,128.0,82.0,25.38,75.0,77.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,262.0,144.0,90.0,28.0325,83.0,85.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [11]:
# show the datatypes 
data.dtypes

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

In [12]:
# using creating a dictonary to convert specific objects 
convert_dict = {"male" : str,
               "education" : str,
               "currentSmoker" : str,
               "BPMeds" : str,
               "prevalentStroke" : str,
               "prevalentHyp" : str,
               "diabetes" : str,
               "TenYearCHD" : str}


In [13]:
# number of columns per data type 
list(set(data.dtypes.tolist()))

[dtype('int64'), dtype('float64')]

In [14]:
data.shape


(4240, 16)

In [15]:
# Feature Selection

# call packages
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [16]:
# separate the independent columns
x = data.iloc[:, 0:14]

# separate the dependent column
y = data.iloc[:, 15]

In [17]:
x.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0


In [18]:
y.head()

0    0
1    0
2    0
3    1
4    0
Name: TenYearCHD, dtype: int64

In [19]:
# Using SelectKBest to extract top 10 features
from sklearn.feature_selection import SelectKBest
topfeatures = SelectKBest(score_func = chi2, k = 10)
fit = topfeatures.fit(x, y)
xyscore = pd.DataFrame(fit.scores_)
xycolumn = pd.DataFrame(x.columns)

In [20]:
# Concatenate both data frames to get features and scores
featurescore = pd.concat([xycolumn, xyscore], axis = 1)

# rename columns featurescore dataframe
featurescore.columns = ["Feature", "Score"]

In [21]:
# Sort the featurescore by higest-to-lowest scores
featurescore = featurescore.sort_values(by = "Score", ascending= False)
featurescore

Unnamed: 0,Feature,Score
10,sysBP,727.935535
1,age,319.266019
4,cigsPerDay,231.669558
9,totChol,210.439056
11,diaBP,152.748563
7,prevalentHyp,92.048736
8,diabetes,39.144944
5,BPMeds,30.759595
0,male,18.89993
6,prevalentStroke,16.109887


In [22]:
# Selecting thr 10 most important features
features_list = featurescore["Feature"].tolist()[:10]
features_list

['sysBP',
 'age',
 'cigsPerDay',
 'totChol',
 'diaBP',
 'prevalentHyp',
 'diabetes',
 'BPMeds',
 'male',
 'prevalentStroke']

In [23]:
# New Dataset with Selected Features¶

data = data[["male", "age", "cigsPerDay", "prevalentStroke", "prevalentHyp", "diabetes", "totChol", "sysBP", "TenYearCHD"]]
data.head()

Unnamed: 0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,TenYearCHD
0,1,39,0.0,0,0,0,195.0,106.0,0
1,0,46,0.0,0,0,0,250.0,121.0,0
2,1,48,20.0,0,0,0,245.0,127.5,0
3,0,61,30.0,0,1,0,225.0,150.0,1
4,0,46,23.0,0,0,0,285.0,130.0,0


In [24]:
# Calling packages

from sklearn.preprocessing import MinMaxScaler 

In [25]:
# Create scaler
scaler = MinMaxScaler()

# create scaled data
data_scaled = pd.DataFrame(scaler.fit_transform (data), columns = data.columns)

# view scaled data
data_scaled.describe()

Unnamed: 0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,TenYearCHD
count,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0
mean,0.429245,0.462637,0.127776,0.005896,0.310613,0.025708,0.336075,0.230991,0.151887
std,0.495027,0.225604,0.170068,0.076569,0.462799,0.15828,0.073515,0.104176,0.358953
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.263158,0.0,0.0,0.0,0.0,0.29454,0.158392,0.0
50%,0.0,0.447368,0.0,0.0,0.0,0.0,0.33477,0.210402,0.0
75%,1.0,0.631579,0.285714,0.0,1.0,0.0,0.376437,0.286052,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Divide Train & Test Data
# Divide data into train and test data with a 75-25 split first.

# Create X and Y data

x = data_scaled.drop(["TenYearCHD"], axis = 1)
y = data_scaled["TenYearCHD"]

# Data Split library 
from sklearn.model_selection import train_test_split

# 80-20 Split 

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

In [56]:
# Decision Tree classifier 

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score #works

dtree_model = DecisionTreeClassifier().fit(X_train, y_train)
print("train accuracy:",dtree_model.score(X_train,y_train))
print("test accuracy:",dtree_model.score(X_test,y_test))
dtree_predictions = dtree_model.predict(X_test)
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, dtree_predictions)))

train accuracy: 0.9997051886792453
test accuracy: 0.7441037735849056
Model accuracy score: 0.7441


In [33]:
# Naive Bayes 

from sklearn.naive_bayes import GaussianNB
# instantiate the model
gnb = GaussianNB()
# fit the model
gnb.fit(X_train, y_train)
print("train accuracy:",gnb.score(X_train,y_train))
print("test accuracy:",gnb.score(X_test,y_test))
gnb_pred = gnb.predict(X_test)
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, gnb_pred)))

train accuracy: 0.8369693396226415
test accuracy: 0.8290094339622641
Model accuracy score: 0.8290


In [35]:
#  Support Vector Machine

from sklearn.svm import SVC
svm=SVC(random_state=1)
svm.fit(X_train,y_train)
print("train accuracy:",svm.score(X_train,y_train))
print("test accuracy:",svm.score(X_test,y_test))
svm_pred = svm.predict(X_test)
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, svm_pred)))

train accuracy: 0.8537735849056604
test accuracy: 0.8349056603773585
Model accuracy score: 0.8349


In [39]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense


#initialising the Aritificial Neural Network
classifier = Sequential()

#adding the input layer and first hidden layer
classifier.add(Dense(64, input_dim=8, activation='relu'))

#adding the second hidden layer
classifier.add(Dense(12, activation='relu'))
#classifier.add(Dense(output_dim = 6,init = 'uniform',activation = 'relu'))

#adding the output layer
classifier.add(Dense(1, activation='relu'))
#classifier.add(Dense(output_dim = 1,init = 'uniform',activation = 'sigmoid'))

#compling the Artificial neural network
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

#fitting the training set
history=classifier.fit(X_train, y_train, batch_size = 10, epochs = 50, verbose = 1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [40]:
y_pred = classifier.predict(X_test)

score = classifier.evaluate(X_test, y_test, verbose=1)

print(score)

[0.4350600838661194, 0.8408018946647644]
