# In-Class Assignment 17 - Evan Callaghan

In [1]:
## 1. Using the pandas library to read the csv data file and create a data-frame called heart

import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, accuracy_score

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-445-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

heart = pd.read_csv(file_content_stream)

heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
## 2. Removing observations with missing values

heart = heart.dropna()

In [3]:
## 3. Using age, totChol, sysBP,BMI, heartRate, and glucose as the predictor variables, and TenYearCHD as 
## the target variable 

X = heart[['age', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']

## Splitting the data into two data-frames (taking into account the proportion of 0s and 1s)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Standardizing the input data to 0-1 scale
scaler = MinMaxScaler(feature_range = (0,1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [4]:
## 4. Using the train data-frame to build an AdaBoost model (with 500 trees, max tree depth equal 3, and learning 
## rate equal to 0.01) in which: age, totChol, sysBP, BMI, heartRate, and glucose are the predictor variables, 
## and TenYearCHD is the target variable

## Building the model
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, 
                           learning_rate = 0.01).fit(X_train, Y_train)

## Predicting on the test set
ada_preds = ada_md.predict_proba(X_test)[:, 1]

## Using 15% as the cut-off value
ada_preds_label = np.where(ada_preds < 0.15, 0, 1)

## Reporting the accuracy and recall of the model
print('Accuracy Score of AdaBoost Classifier:', accuracy_score(Y_test, ada_preds_label))
print('Recall Score of AdaBoost Classifier:', recall_score(Y_test, ada_preds_label))

Accuracy Score of AdaBoost Classifier: 0.16939890710382513
Recall Score of AdaBoost Classifier: 0.9821428571428571


In [5]:
## 5. Using the train data-frame to build a gradient boosting model using GradientBoostingClassifier (with 500 
## trees, max tree depth equal 3, and learning rate equal to 0.01) in which: age, totChol, sysBP, BMI, heartRate, 
## and glucose are the predictor variables, and TenYearCHD is the target variable

## Building the model
gb_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)

## Predicting on the test set
gb_preds = gb_md.predict_proba(X_test)[:, 1]

## Using 15% as the cut-off value
gb_preds_label = np.where(gb_preds < 0.15, 0, 1)

## Reporting the accuracy and recall of the model
print('Accuracy Score of Gradient Boosting Classifier:', accuracy_score(Y_test, gb_preds_label))
print('Recall Score of Gradient Boosting Classifier:', recall_score(Y_test, gb_preds_label))

Accuracy Score of Gradient Boosting Classifier: 0.5819672131147541
Recall Score of Gradient Boosting Classifier: 0.7142857142857143


In [6]:
## 6. Using the train data-frame to build a support vector machine model (with kernel = ’rbf’) in which: age, 
## totChol, sysBP, BMI, heartRate, and glucose are the predictor variables, and TenYearCHD is the target variable

## Building the model
sv_md = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)

## Predicting on the test set
sv_preds = sv_md.predict_proba(X_test)[:, 1]

## Using 15% as the cut-off value
sv_preds_label = np.where(sv_preds < 0.15, 0, 1)

## Reporting the accuracy and recall of the model
print('Accuracy Score of Support Vector Classifier:', accuracy_score(Y_test, sv_preds_label))
print('Recall Score of Support Vector Classifier:', recall_score(Y_test, sv_preds_label))

Accuracy Score of Support Vector Classifier: 0.6147540983606558
Recall Score of Support Vector Classifier: 0.7321428571428571


In [7]:
## 7. Using the estimated likelihoods from part (4), (5) and (6) to create a random forest model with 500 trees 
## and maximum depth equal to 3

X_rf = pd.concat([pd.DataFrame(ada_preds), pd.DataFrame(gb_preds), pd.DataFrame(sv_preds), Y_test.reset_index(drop = True)], axis = 1)

## Defining the input and target variables
X = X_rf.drop(columns = 'TenYearCHD')
Y = Y_test

## Building the model
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X, Y)

## Extracting the ensemble likelihoods
rf_preds = rf_md.predict_proba(X)[:, 1]

## Using 15% as the cut-off value
rf_preds_label = np.where(rf_preds < 0.15, 0, 1)

## Reporting the accuracy and recall scores of the model
print('Accuracy Score of Random Forest Classifier:', accuracy_score(Y_test, rf_preds_label))
print('Recall Score of Random Forest Classifier:', recall_score(Y_test, rf_preds_label))

Accuracy Score of Random Forest Classifier: 0.6844262295081968
Recall Score of Random Forest Classifier: 0.7767857142857143


In [9]:
## Visualizing the final results
final_results = pd.concat([pd.DataFrame(ada_preds), pd.DataFrame(gb_preds), pd.DataFrame(sv_preds), 
                           pd.DataFrame(rf_preds), Y_test.reset_index(drop = True)], axis = 1)

final_results.columns = ['Ada Preds', 'GB Preds', 'SV Preds', 'RF Preds', 'TenYearCHD']

final_results.head()

Unnamed: 0,Ada Preds,GB Preds,SV Preds,RF Preds,TenYearCHD
0,0.328935,0.091609,0.152328,0.113867,0
1,0.347719,0.077219,0.147353,0.057153,0
2,0.247907,0.034742,0.148897,0.130468,1
3,0.378284,0.091891,0.151951,0.119164,0
4,0.189972,0.081743,0.147848,0.077076,0
