# Data Science Lab 1 - Iris dataset

The goal is to determine whether or not the various classes of Iris are separated.

## Method 1 - distances

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the Iris measurements from the CSV file and display the resulting frame.
iris = pd.read_csv("../data/iris.csv")
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


### General distance

In [3]:
def distance(v1, v2, metric='minkowski', L=2):
    """Return the distance between two vectors.

    Parameters
    ----------
    v1, v2 : numpy.ndarray or pandas.Series
        Vectors to compare; ``v1 - v2`` must be a valid element-wise
        subtraction. For pandas Series the subtraction aligns on the index.
    metric : str, default 'minkowski'
        Name of the metric. Only ``'minkowski'`` is implemented.
    L : int or float, default 2
        Order of the Minkowski distance (1 = Manhattan, 2 = Euclidean).

    Returns
    -------
    float
        ``(sum(|v1 - v2| ** L)) ** (1 / L)``.

    Raises
    ------
    NotImplementedError
        If ``metric`` is anything other than ``'minkowski'``.
    """
    if metric == "minkowski":
        # The absolute value is part of the Minkowski definition: without it,
        # odd orders let negative differences cancel positive ones (L=1) or
        # produce a negative sum whose fractional power is NaN (L=3).
        return np.power(np.power(np.abs(v1 - v2), L).sum(), 1/L)
    else:
        raise NotImplementedError

In [4]:
# Class centres: mean of every measurement per species, transposed so that
# each species is a column and each feature a row.
iris_center = iris.groupby("species").mean().T
iris_center

species,setosa,versicolor,virginica
sepal_length,5.006,5.936,6.588
sepal_width,3.428,2.77,2.974
petal_length,1.462,4.26,5.552
petal_width,0.246,1.326,2.026


In [5]:
# First setosa sample, with the trailing species label sliced off.
v1 = iris.loc[iris['species'] == 'setosa'].iloc[0, :-1]
# Centre of the setosa class.
v2 = iris_center['setosa']

distance(v1, v2)

0.14135062787267663

### Intra-class distance

In [6]:
# Largest distance from any setosa sample to the setosa centre (v2).
(
    iris.loc[iris['species'] == 'setosa']
    .apply(lambda row: distance(row, v2), axis=1)
    .max()
)

1.2480304483465139

In [7]:
def intra_class(df, class_name, cats=None, metric='minkowski'):
    """Return the intra-class distance of one class.

    The intra-class distance is the largest distance between a sample of the
    class and the class centre (the per-feature mean of its samples).

    Parameters
    ----------
    df : pandas.DataFrame
        Samples, one per row, with one column holding the class label.
    class_name : object
        Label of the class to measure.
    cats : label, optional
        Name of the column holding the class labels. Defaults to the last
        column of ``df``.
    metric : str, default 'minkowski'
        Metric name forwarded to ``distance``.

    Returns
    -------
    float
        Maximum distance from a sample of ``class_name`` to the class centre.
    """
    cats = df.columns[-1] if cats is None else cats
    # Drop the label column once, so that only feature values reach
    # ``distance``; passing whole rows would mix the label into the arithmetic
    # and rely on index alignment turning it into NaN.
    features = df.loc[df[cats] == class_name].drop(cats, axis=1)
    center = features.mean()
    return features.apply(lambda row: distance(row, center, metric), axis=1).max()

In [8]:
# Same quantity as the manual setosa computation in the preceding cell, now via the helper.
intra_class(iris, 'setosa')

1.2480304483465139

### Inter-class distance

In [9]:
# Directional inter-class distance
def inter_class(df, source_class, target_class, cats=None, metric='minkowski'):
    """Return the directional inter-class distance from one class to another.

    This is the smallest distance between a sample of ``source_class`` and the
    centre (per-feature mean) of ``target_class``. It is directional: swapping
    the two classes generally gives a different value.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples, one per row, with one column holding the class label.
    source_class : object
        Label of the class whose samples are measured.
    target_class : object
        Label of the class whose centre is the reference point.
    cats : label, optional
        Name of the column holding the class labels. Defaults to the last
        column of ``df``.
    metric : str, default 'minkowski'
        Metric name forwarded to ``distance``.

    Returns
    -------
    float
        Minimum distance from a ``source_class`` sample to the
        ``target_class`` centre.
    """
    cats = df.columns[-1] if cats is None else cats
    labels = df[cats]
    # Work on the feature columns only, so that the label never reaches the
    # arithmetic inside ``distance``.
    features = df.drop(cats, axis=1)
    center = features.loc[labels == target_class].mean()
    source = features.loc[labels == source_class]
    return source.apply(lambda row: distance(row, center, metric), axis=1).min()

In [10]:
# Smallest distance from a versicolor sample to the setosa centre.
inter_class(iris, 'versicolor', 'setosa')

1.9911755321919762

### Pair distances

In [11]:
from itertools import combinations

# For every unordered pair of species, compare the intra-class distance of
# class1 with the smallest distance from a class2 sample to class1's centre.
# The pair is flagged as separated when intra < inter.
distances = []
for (class1, class2) in combinations(iris.loc[:, 'species'].unique(), 2):
    intra = intra_class(iris, class1)
    inter = inter_class(iris, class2, class1)
    distances.append([class1, class2, intra, inter, intra < inter])

pd.DataFrame(distances, columns=["class 1", "class 2", "intra", "inter", "separated"])

Unnamed: 0,class 1,class 2,intra,inter,separated
0,setosa,versicolor,1.24803,1.991176,True
1,setosa,virginica,1.24803,3.495137,True
2,versicolor,virginica,1.552569,0.757147,False


In [12]:
# Direction-sensitive separation tests
# Each pair of species is evaluated twice: once as (class1, class2) and once
# with the roles exchanged, because both intra and inter depend on which
# class provides the centre.
distances = []
for (class1, class2) in combinations(iris.loc[:, 'species'].unique(), 2):
    for i in range(2):  # i only counts the two passes
        intra = intra_class(iris, class1)
        inter = inter_class(iris, class2, class1)
        distances.append([class1, class2, intra, inter, intra < inter])
        class1, class2 = class2, class1  # swap roles for the second pass

pd.DataFrame(distances, columns=["class 1", "class 2", "intra", "inter", "separated"])

Unnamed: 0,class 1,class 2,intra,inter,separated
0,setosa,versicolor,1.24803,1.991176,True
1,versicolor,setosa,1.552569,2.861271,True
2,setosa,virginica,1.24803,3.495137,True
3,virginica,setosa,2.070507,4.344813,True
4,versicolor,virginica,1.552569,0.757147,False
5,virginica,versicolor,2.070507,0.651306,False


We can conclude that *setosa* and *versicolor* are separated, as well as *setosa* and *virginica*.

However, we cannot assert that *versicolor* and *virginica* are separated.

## Method 2 - visualisation