In [10]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'drug200.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## 1. Using the pandas library to read the csv data file and create a data-frame called drug

drug = pd.read_csv(file_content_stream)

drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [11]:
## 2. Creating the frequency table of the target variable Drug

drug['Drug'].value_counts()

DrugY    91
drugX    54
drugA    23
drugB    16
drugC    16
Name: Drug, dtype: int64

In [12]:
## 3. Using the where function from numpy to create a new variable called Drug_numb

drug['Drug_numb'] = np.where(drug['Drug'] == 'drugA', 1, 
                            np.where(drug['Drug'] == 'drugB', 2, 
                                    np.where(drug['Drug'] == 'drugC', 3, 
                                            np.where(drug['Drug'] == 'drugX', 4, 5))))

In [13]:
## 4. Changing Sex, BP and Cholesterol from labels to dummy variables

drug['Sex_dummy'] = np.where(drug['Sex'] == 'M', 1, 0)

drug = pd.concat([drug, pd.get_dummies(drug['BP'])], axis = 1)
drug = drug.rename(columns = {'HIGH': 'BP_HIGH', 'LOW': 'BP_LOW', 'NORMAL': 'BP_NORMAL'})

drug = pd.concat([drug, pd.get_dummies(drug['Cholesterol'])], axis = 1)
drug = drug.rename(columns = {'HIGH': 'Chol_HIGH', 'NORMAL': 'Chol_NORMAL'})

In [14]:
## 5. Using Age, Sex (dummy variable), BP (dummy variables), Cholesterol (dummy variable), and Na_to_K as 
## the input variables, and Drug numb as the target variable to split the data into two data-frames 

## Defining the input and target variables
X = drug[['Age', 'Sex_dummy', 'BP_HIGH', 'BP_LOW', 'Chol_HIGH', 'Na_to_K']]
Y = drug['Drug_numb']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [15]:
## 6. Using train data-frame and the one-vs-all multi-class classification strategy with the random forest 
## model (with 500 trees and the maximum depth of each tree equal to 3) to build a multi-class classification model

## Building the model
one_vs_rest_rf = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

## Predicting on the test set
one_vs_rest_rf_preds = one_vs_rest_rf.predict(X_test)

## Computing the classification report
print(classification_report(Y_test, one_vs_rest_rf_preds))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      0.67      0.80         3
           4       0.92      1.00      0.96        11
           5       1.00      1.00      1.00        18

    accuracy                           0.97        40
   macro avg       0.98      0.93      0.95        40
weighted avg       0.98      0.97      0.97        40



In [16]:
## 7. Using train data-frame and the one-vs-all multi-class classification strategy with the AdaBoost model 
## (with 500 trees, the maximum depth of each tree equal to 3, and learning rate equal to 0.01) to build a multi-class 
## classification model

## Building the model
one_vs_rest_ada = OneVsRestClassifier(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), 
                                                                     n_estimators = 500, learning_rate = 0.01)).fit(X_train, Y_train)

## Predicting on the test set
one_vs_rest_ada_preds = one_vs_rest_ada.predict(X_test)

## Computing the classification report
print(classification_report(Y_test, one_vs_rest_ada_preds))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00        11
           5       1.00      1.00      1.00        18

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [None]:
## 8. Using the results from part 5 and 6, we would use the AdaBoost Classifier model to predict Drug.