# Scale data or covariance matrix

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned iris table; the path is resolved relative to the notebook's working directory.
iris = pd.read_csv(filepath_or_buffer="../data/iris_cleaned.csv")

### Scale data

In [3]:
from sklearn.preprocessing import StandardScaler

# Split the table into the feature columns and the class label.
x = iris.drop('species', axis=1)
y = iris['species']

# Standardize every feature column (StandardScaler centers each column and
# divides by its standard deviation); the result is a plain NumPy array.
x_scaled = StandardScaler().fit_transform(x)

# Rebuild a DataFrame from the array. Labels and index are taken from `x`
# itself rather than from `iris.columns[:-1]`, so the column names stay correct
# even if 'species' is not the last column, and the `species` assignment below
# aligns row-for-row even if `iris` does not carry a default RangeIndex.
iris_scaled = pd.DataFrame(x_scaled, columns=x.columns, index=x.index)
iris_scaled["species"] = y
iris_scaled

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,-0.910515,1.159151,-1.373764,-1.348525,setosa
1,-1.151122,-0.101922,-1.373764,-1.348525,setosa
2,-1.391729,0.402507,-1.430841,-1.348525,setosa
3,-1.512032,0.150292,-1.316687,-1.348525,setosa
4,-1.030819,1.411365,-1.373764,-1.348525,setosa
...,...,...,...,...,...
141,1.014339,-0.101922,0.795172,1.426359,virginica
142,0.533125,-1.362995,0.681018,0.897810,virginica
143,0.773732,-0.101922,0.795172,1.029947,virginica
144,0.412822,0.906936,0.909327,1.426359,virginica


### Covariance

In [4]:
# Feature covariance matrices for the raw and the standardized tables.
# The label column is dropped first, and the frame is transposed because
# np.cov treats each row as one variable by default.
iris_cov, iris_cov_scaled = (
    np.cov(frame.drop(columns='species').T)
    for frame in (iris, iris_scaled)
)

In [5]:
# Covariance matrix computed from the unscaled feature columns of `iris`.
iris_cov

array([[ 0.69571138, -0.0408649 ,  1.28947   ,  0.5210307 ],
       [-0.0408649 ,  0.15828673, -0.2840718 , -0.10290505],
       [ 1.28947   , -0.2840718 ,  3.09071138,  1.28413415],
       [ 0.5210307 , -0.10290505,  1.28413415,  0.57667832]])

In [6]:
# Covariance matrix computed from the standardized feature columns of `iris_scaled`.
# NOTE(review): the diagonal comes out slightly above 1 (see output) — presumably
# because StandardScaler divides by N while np.cov divides by N - 1; confirm.
iris_cov_scaled

array([[ 1.00689655, -0.12399339,  0.88542609,  0.82826022],
       [-0.12399339,  1.00689655, -0.4089418 , -0.34295134],
       [ 0.88542609, -0.4089418 ,  1.00689655,  0.96849908],
       [ 0.82826022, -0.34295134,  0.96849908,  1.00689655]])

### Pair distances

#### X not scaled and C not scaled

In [7]:
from distances import pair_distances

# Baseline configuration: unscaled data together with the covariance of the unscaled data.
# NOTE(review): `pair_distances` is a project-local helper not visible in this notebook.
# Judging by the displayed output it yields one row per ordered class pair with
# intra/inter distances and a "separated" flag for the Euclidean and the
# Mahalanobis metric — confirm against the `distances` module.
pair_distances(iris, iris_cov)

Unnamed: 0,class 1,class 2,intra_euclid,inter_euclid,separated_euclid,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,1.189613,1.96925,True,3.446018,1.732869,False
1,versicolor,setosa,1.579204,2.876259,True,2.517938,1.569693,False
2,setosa,virginica,1.189613,3.480992,True,3.446018,2.162917,False
3,virginica,setosa,2.070507,4.344813,True,3.730914,2.256873,False
4,versicolor,virginica,1.579204,0.735824,False,2.517938,0.951543,False
5,virginica,versicolor,2.070507,0.651306,False,3.730914,0.856452,False


#### X not scaled and C scaled

In [8]:
# Deliberately mismatched configuration: unscaled data, but the covariance
# matrix computed from the standardized data.
pair_distances(iris, iris_cov_scaled)

Unnamed: 0,class 1,class 2,intra_euclid,inter_euclid,separated_euclid,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,1.189613,1.96925,True,2.950456,4.780275,True
1,versicolor,setosa,1.579204,2.876259,True,4.987895,6.606341,True
2,setosa,virginica,1.189613,3.480992,True,2.950456,9.332597,True
3,virginica,setosa,2.070507,4.344813,True,4.684351,10.261252,True
4,versicolor,virginica,1.579204,0.735824,False,4.987895,1.013695,False
5,virginica,versicolor,2.070507,0.651306,False,4.684351,0.695198,False


#### X scaled and C scaled

In [9]:
# Consistent configuration: standardized data with the covariance of the standardized data.
# In the displayed output the Mahalanobis columns match the unscaled/unscaled
# run exactly; only the Euclidean columns change.
pair_distances(iris_scaled, iris_cov_scaled)

Unnamed: 0,class 1,class 2,intra_euclid,inter_euclid,separated_euclid,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,2.777629,2.331023,False,3.446018,1.732869,False
1,versicolor,setosa,1.823241,2.4723,True,2.517938,1.569693,False
2,setosa,virginica,2.777629,3.203235,True,3.446018,2.162917,False
3,virginica,setosa,2.658353,3.574969,True,3.730914,2.256873,False
4,versicolor,virginica,1.823241,0.666197,False,2.517938,0.951543,False
5,virginica,versicolor,2.658353,0.554331,False,3.730914,0.856452,False


#### X scaled and C not scaled

In [10]:
# Deliberately mismatched configuration: standardized data, but the covariance
# matrix computed from the unscaled data.
pair_distances(iris_scaled, iris_cov)

Unnamed: 0,class 1,class 2,intra_euclid,inter_euclid,separated_euclid,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,2.777629,2.331023,False,8.812237,5.450628,False
1,versicolor,setosa,1.823241,2.4723,True,5.993904,4.911438,False
2,setosa,virginica,2.777629,3.203235,True,8.812237,8.028698,False
3,virginica,setosa,2.658353,3.574969,True,6.331167,8.133226,True
4,versicolor,virginica,1.823241,0.666197,False,5.993904,0.677832,False
5,virginica,versicolor,2.658353,0.554331,False,6.331167,1.525185,False


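Based on the Euclidean criterion on the unscaled data (`separated_euclid` is `True` in both directions), we can conclude that *setosa* and *versicolor* are separated, as well as *setosa* and *virginica*. Note that the Mahalanobis criterion confirms this only in the mismatched "X not scaled, C scaled" configuration; when the data and the covariance matrix are on the same scale, `separated_mahala` is `False` for every pair, and the Mahalanobis distances are identical whether both are scaled or both are unscaled.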

However, we cannot assert that *versicolor* and *virginica* are separated.