In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# Part 1: Load the dataset


In [2]:
# Load the dataset 
iris = load_iris()
iris_df = pd.DataFrame(iris['data'], columns = iris.feature_names) 
iris_df["target"] = iris.target 
iris_df.head(15)# Output the first 15 rows of the data


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [3]:
# Display a summary of the table information (number of datapoints, etc.)
iris_df.info() 
iris_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


# About the dataset


  <div> The dataset consists of 150 rows with no null values and 5 columns represent sepal length (cm), sepal width (cm), petal length (cm), petal width (cm), and target.
   The 5 columns consist of 4 features and 1 label.
   
Features
    
   - sepal length (cm)
   - sepal width (cm)
   - petal length (cm)
   - petal width (cm)
   
Label
    
   - target <div>

# Part 2: Split the dataset into train and test



In [4]:
# Take the dataset and split it into our features (X) and label (y)
X = iris.data
y = iris.target
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.90, random_state=42)

# Part 3: Logistic Regression

In [5]:
logReg = LogisticRegression()
logReg.fit(X_train, y_train)
y_prediction = logReg.predict(X_test)
y_prediction_prob = logReg.predict_proba(X_test)
classes = load_iris().target_names
for class_name, proba in zip(classes, y_prediction_prob):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')

score = logReg.score(X_test, y_test)
print("Accuracy of Logistic Regression Classifier on test set:",score)

confusion = confusion_matrix(y_test, y_prediction)
print(f'Confusion Matrix:\n{confusion}')
intercept = logReg.intercept_
coeff = logReg.coef_.T
print(f'intercept: {intercept} \ncoeff: {coeff}')

predicted class = setosa and confidence = 83.72%
predicted class = versicolor and confidence = 94.95%
predicted class = virginica and confidence = 99.89%
Accuracy of Logistic Regression Classifier on test set: 1.0
Confusion Matrix:
[[6 0 0]
 [0 6 0]
 [0 0 3]]
intercept: [  9.50133366   1.91164253 -11.4129762 ] 
coeff: [[-0.42685097  0.51314447 -0.0862935 ]
 [ 0.97283315 -0.22362072 -0.74921243]
 [-2.44463796 -0.21514806  2.65978602]
 [-1.0318789  -0.85146024  1.88333914]]


   Each coefficient relatives to one feature, we have four features input, therefore we get four coefficent numbers in each subset of coeff. The score is 1.0 which means that on the given 10% of the dataset,our logistic regression classifier model predicts the outcome of 100, meaning the number of correct predictions made divided by the total number of datapoints in the test set.  

# Part 4: Support Vector Machine

In [6]:
svc = svm.SVC(probability=True)
svc.fit(X_train, y_train)
y_prediction_svc = svc.predict(X_test)
y_prediction_prob_svc = svc.predict_proba(X_test)
for class_name, proba in zip(classes, y_prediction_prob_svc):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')
score = svc.score(X_test, y_test)
print("Accuracy of  Support Vector Classifier on test set :",score)
confusion = confusion_matrix(y_test, y_prediction_svc)
print(f'Confusion Matrix:\n{confusion}')

predicted class = setosa and confidence = 91.83%
predicted class = versicolor and confidence = 95.01%
predicted class = virginica and confidence = 98.26%
Accuracy of  Support Vector Classifier on test set : 1.0
Confusion Matrix:
[[6 0 0]
 [0 6 0]
 [0 0 3]]


Support vector machine model gives us score of 1.0 as same as logistic regression model. In addition, the confusion matrix shows the same outcome as of the logistic regression.

# Part 5: Neural Network

In [34]:
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_prediction_mlp = mlp.predict(X_test)
y_prediction_prob_mlp = mlp.predict_proba(X_test)
for class_name, proba in zip(classes, y_prediction_prob_mlp):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')
score = mlp.score(X_test, y_test)
print("Accuracy of Neural Network on test set :",score)
new_mlp = MLPClassifier(solver='sgd', learning_rate_init=0.01, hidden_layer_sizes=10, max_iter=1000, tol=0.001,random_state=0)
new_mlp.fit(X_train, y_train)
new_score = new_mlp.score(X_test, y_test)
print(f'Modified neural network score: {new_score}')

predicted class = setosa and confidence = 72.36%
predicted class = versicolor and confidence = 96.40%
predicted class = virginica and confidence = 96.35%
Accuracy of Neural Network on test set : 1.0
Modified neural network score: 0.9333333333333333




Neural network model gives us score of 1.0 as same as logistic regression model and support vector machine model.

# Part 6: K-Nearest Neighbors

In [31]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
# ii. For a sample datapoint, predict the probabilities for each possible class
y_prediction_knn = knn.predict(X_test)
y_prediction_prob_knn = knn.predict_proba(X_test)
for class_name, proba in zip(classes, y_prediction_prob_knn):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')
# iii. Report on the score for kNN, what does the score measure?
score = knn.score(X_test, y_test)
print('Accuracy of k-Neighbors on test set: {}'.format(score))

predicted class = setosa and confidence = 100.00%
predicted class = versicolor and confidence = 100.00%
predicted class = virginica and confidence = 100.00%
Accuracy of k-Neighbors on test set: 1.0


K-Nearest Neighbors model gives us score of 1.0 as same as logistic regression model and support vector machine model.The K-Nearest Neighbors model will estimate the likelihood that a data point will become a member of one group or another based on what group the data points nearest to it belong to.

# Part 7: Conclusions and takeaways


 According to the results for each model, they all get the socore of 1.0, which means all models are able to define the point correctly. That's really amazing and i did not expect it.In addition, K-Nearest Neighbors model predicts 100% correctly to each class which means it will be the best performed model on iris dataset in my experiment.I do feel surprised that these four models all perform well on iris dataset. However, i think need to more iterations for each model to decide what is best performed model.