In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Defining the csv file
file_key = 'Iris.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## 1. Using the pandas library to read the csv data file and create a data-frame called iris

iris = pd.read_csv(file_content_stream)

iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [2]:
## 2. Creating the frequency table of the variable Species

iris['Species'].value_counts()

Iris-virginica     50
Iris-setosa        50
Iris-versicolor    50
Name: Species, dtype: int64

In [3]:
## 3. Using the where function from numpy to create a new variable called Species_numb

iris['Species_numb'] = np.where(iris['Species'] == 'Iris-virginica', 1, 
                               np.where(iris['Species'] == 'Iris-versicolor', 2, 3))

iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_numb
0,1,5.1,3.5,1.4,0.2,Iris-setosa,3
1,2,4.9,3.0,1.4,0.2,Iris-setosa,3
2,3,4.7,3.2,1.3,0.2,Iris-setosa,3
3,4,4.6,3.1,1.5,0.2,Iris-setosa,3
4,5,5.0,3.6,1.4,0.2,Iris-setosa,3


In [4]:
## 4. Using SepalLengthCm, SepalWidthCm, PetalLengthCm, and PetalWidthCm as the input variables, and Species_numb 
## as the target variable to split the data into two data-frames

## Defining the input and target variable
X = iris[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
Y = iris['Species_numb']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [5]:
## 5. Standardizing the input variables in the train and test datasets (0-1 scale)

scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [6]:
## 6. Using train dataset and the one-vs-all multi-class classification strategy with the random
## forest model (with 500 trees and the maximum depth of each tree equal to 3) to build a multi-class
## classification model

## Building the model
one_vs_all_rf = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

## Predicitng on the test set
one_vs_all_rf_preds = one_vs_all_rf.predict_proba(X_test)

## Assigning the label with the highest likelihood
one_vs_all_rf_preds = np.argmax(one_vs_all_rf_preds, axis = 1) + 1

## Computing the confusion matrix
print(confusion_matrix(Y_test, one_vs_all_rf_preds))

[[10  0  0]
 [ 1  9  0]
 [ 0  0 10]]


In [7]:
## 7. Using train dataset and and the one-vs-all multi-class classification strategy with the support
## vector machine model (with kernel equal to rbf and C = 0.1) to build a multi-class classification model

## Building the model
one_vs_all_svm = OneVsRestClassifier(estimator = SVC(C = 0.1, kernel = 'rbf', probability = True)).fit(X_train, Y_train)

## Predicitng on the test set
one_vs_all_svm_preds = one_vs_all_svm.predict_proba(X_test)

## Assigning the label with the highest likelihood
one_vs_all_svm_preds = np.argmax(one_vs_all_svm_preds, axis = 1) + 1

## Computing the confusion matrix
print(confusion_matrix(Y_test, one_vs_all_svm_preds))

[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]


In [None]:
## 8. Using the results from part 6 and 7, we would use the Support Vector Classifier model to predict iris species.