This notebook uses the iris dataset to build supervised models (a linear regression and a ridge classifier) and an unsupervised model (k-means), and compares how their predictions differ.

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets

In [2]:
# Load the iris dataset that ships with scikit-learn; the cells below read its
# "data", "feature_names" and "target" entries
iris = datasets.load_iris()

In [None]:
# Define feature and target values
X = iris["data"]  # feature values, one row per sample
y = iris["target"]  # class label for each sample

In [3]:
# Build a single dataframe from the iris `data` array, labelling the columns with
# `feature_names`, and attach the class labels as a `target` column in the same step
df = pd.DataFrame(
    data=iris["data"],
    columns=iris["feature_names"],
).assign(target=iris["target"])

In [4]:
# Sanity-check the dataframe: `.head()` displays the first rows
# (the four feature columns plus `target`)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
# True target labels as a NumPy array, shown here so they can be compared by eye
# against the supervised predictions and the unsupervised cluster labels below
df["target"].to_numpy()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Supervised ML
- A model is trained on examples that pair a set of features (input x) with a known target (output y), so that it can predict the target value
- Consists of regression and classification
- Metrics measure how close the predictions are to the true target values, i.e. how successful the model is (training minimizes a cost function) <br>
<strong>cost function</strong> measures the difference between the predicted value (yhat) and the target value (y)

# Regression ML
Regression: a type of supervised ML algorithm that learns to predict a number from infinitely many possible numbers (continuous output; ex: house price).<br>
Linear Regression: a model that fits a straight line to your data.<br>
Simple (Univariate) Linear Regression: linear regression with one variable (a single input x; ex: house size).<br>
Multiple Linear Regression: a linear regression model with multiple input features.

In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
# Instantiate the linear regression estimator, then fit it on the feature columns
# using the class label as the regression target

reg = LinearRegression()
reg = reg.fit(X=df[iris["feature_names"]], y=df["target"])

In [8]:
# Score the model with its built-in metric (R^2 for LinearRegression).
# The number is slightly deceiving, since iris is a classification problem
# rather than a regression one
reg.score(X=df[iris["feature_names"]], y=df["target"])

0.9303939218549564

In [9]:
# Predicted target values: continuous numbers rather than class labels

reg.predict(X=df[iris["feature_names"]])

array([-8.25493616e-02, -4.01284476e-02, -4.86276768e-02,  1.22998627e-02,
       -7.53667248e-02,  5.82910066e-02,  3.83367194e-02, -4.44863248e-02,
        1.98324281e-02, -8.21970989e-02, -1.01272512e-01,  7.59348686e-04,
       -8.98630676e-02, -1.02503649e-01, -2.26652208e-01, -4.10494982e-02,
       -3.31670043e-02, -2.16241562e-02, -3.21980063e-02, -1.07834994e-02,
       -4.35196609e-02,  5.41496547e-02, -1.22062394e-01,  1.76835660e-01,
        6.93528569e-02, -5.59002750e-03,  1.00228589e-01, -7.08754443e-02,
       -8.97319983e-02,  1.99658314e-02,  1.27831946e-02,  3.26017444e-02,
       -1.55848342e-01, -1.55367344e-01, -2.12718935e-02, -1.05063936e-01,
       -1.50176206e-01, -1.25101345e-01, -7.04002332e-03, -5.56769102e-02,
       -3.32980735e-02,  7.07502372e-02, -1.50559206e-02,  2.18071051e-01,
        1.41599717e-01,  3.19873432e-02, -4.88442021e-02, -1.45725887e-02,
       -9.00819270e-02, -6.33428789e-02,  1.20248442e+00,  1.28482413e+00,
        1.32433716e+00,  

In [20]:
# If we really wanted to, we could round each regression value to the nearest int
# and have it "act" like a classification model.
# This is not required, but something to keep in mind for future reference.
#
# The rounded values are clipped to the range of real labels rather than passed
# through np.abs: abs would fold an out-of-range prediction such as -1.2 into
# class 1, and it does nothing about values above the largest label.
# The int cast also gets rid of the "-0." that rounding small negatives produces.
reg_cls = np.clip(
    np.rint(reg.predict(df[iris["feature_names"]])),
    df["target"].min(),
    df["target"].max(),
).astype(int)
reg_cls

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [21]:
# Evaluate accuracy: the mean of the element-wise match flags is the fraction
# of samples whose rounded prediction equals the true label
(reg_cls == df["target"]).mean()

0.9733333333333334

# Classification: 
Predicts categories (discrete), e.g. whether a picture is a cat or a dog, or whether a tumor is benign or malignant; in other words, one of a small set of possible outputs (such as 0, 1, 2).<br>

In [22]:
# import classification model

from sklearn.linear_model import RidgeClassifier

In [23]:
# Build the ridge classifier (alpha=3.0), then fit it to the features and targets
clf = RidgeClassifier(alpha=3.0)
clf = clf.fit(X=df[iris["feature_names"]], y=df["target"])

In [24]:
# Score the model.
# For RidgeClassifier the score metric is accuracy
# (number of correctly predicted samples / total number of samples)
clf.score(X=df[iris["feature_names"]], y=df["target"])

0.86

In [25]:
# Predict target values; unlike the raw regression output, these are discrete class labels
clf.predict(X=df[iris["feature_names"]])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2,
       2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Unsupervised ML
The data comes only with inputs x, without output labels y, and the algorithm has to find some pattern in the data.<br>
Input x, without output y.

In [10]:
#import kmeans algorithm

from sklearn.cluster import KMeans

In [11]:
# Build the k-means model with the number of clusters we want the data grouped into,
# then fit it on the feature values only (the target column is never passed in).

kmeans = KMeans(n_clusters=3, random_state=0)
kmeans = kmeans.fit(X=df[iris["feature_names"]])



In [16]:
# visualize labels assigned to each row
# 3 cluster --> 0, 1, 2
# (a label of 3 in the output can only come from a fit with more than 3 clusters,
#  i.e. the cell was run out of order -- restart the kernel and run all cells top to bottom)

# Print the labels to see what value is in what cluster
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 3, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 3, 0, 3,
       0, 0, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, 0, 3, 3, 3,
       0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 2, 3, 2, 2, 2, 2, 0, 2, 2, 2,
       3, 3, 2, 3, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 2, 3, 3, 2, 2, 2, 2,
       2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 3])

In [19]:
# `inertia_` is an attribute of the fitted model (not a method call): the sum of squared
# distances from each data point to its closest cluster centre. Lower means tighter clusters.
# NOTE(review): the value depends on which fit `kmeans` currently holds -- run cells in order.
kmeans.inertia_

57.22847321428571

In [18]:
# What happens if we cluster more than the actual classes?
kmeans = KMeans(n_clusters=4, random_state=0).fit(df[iris["feature_names"]])



In [14]:
# Print the labels to see what value is in what cluster
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 3, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 3, 0, 3,
       0, 0, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, 0, 3, 3, 3,
       0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 2, 3, 2, 2, 2, 2, 0, 2, 2, 2,
       3, 3, 2, 3, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 2, 3, 3, 2, 2, 2, 2,
       2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 3])

In [None]:
# The model finds some additional structure (pattern) that doesn't exactly follow the target values.
# Clustering just structures the data; it makes no decision about whether that structure relates to the target values or not.
# In this case it did when we set n_clusters=3.