# Scale data or covariance matrix

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the full iris table, then keep a setosa-only copy re-indexed from 0.
iris = pd.read_csv("../data/iris.csv")
iris_filtered = iris.loc[iris["species"] == "setosa"].reset_index(drop=True)

### Scale data

In [3]:
from sklearn.preprocessing import StandardScaler


def standardize_features(features, labels):
    """Standardize `features` with a freshly fitted StandardScaler.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature columns to scale.
    labels : pandas.Series
        Label column re-attached to the scaled frame under its own name.

    Returns
    -------
    tuple
        ``(scaled_values, scaled_frame)``: the array returned by
        ``fit_transform`` and a DataFrame holding the same values plus the
        label column.
    """
    scaled_values = StandardScaler().fit_transform(features)
    # Take labels and index from the feature frame itself instead of slicing
    # the source frame's columns by position, so the result stays correctly
    # labelled wherever the label column sits.
    scaled_frame = pd.DataFrame(
        scaled_values, columns=features.columns, index=features.index
    )
    scaled_frame[labels.name] = labels
    return scaled_values, scaled_frame


x = iris.drop('species', axis=1)
y = iris['species']

x_filtered = iris_filtered.drop('species', axis=1)
y_filtered = iris_filtered['species']

# Each call fits its own scaler: the first uses the statistics of the whole
# dataset, the second only those of the setosa rows.
x_scaled, iris_scaled = standardize_features(x, y)
x_scaled_filtered, iris_scaled_filtered = standardize_features(x_filtered, y_filtered)

iris_scaled_filtered

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.269382,0.19187,-0.360636,-0.440924,setosa
1,-0.303771,-1.140559,-0.360636,-0.440924,setosa
2,-0.876924,-0.607588,-0.942306,-0.440924,setosa
3,-1.163501,-0.874073,0.221035,-0.440924,setosa
4,-0.017195,0.458355,-0.360636,-0.440924,setosa
5,1.129111,1.257813,1.384376,1.476136,setosa
6,-1.163501,-0.074616,-0.360636,0.517606,setosa
7,-0.017195,-0.074616,0.221035,-0.440924,setosa
8,-1.736653,-1.407045,-0.360636,-0.440924,setosa
9,-0.303771,-0.874073,0.221035,-1.399454,setosa


### Covariance

In [4]:
# Covariance matrices of the setosa features, before and after standardization.
# np.cov expects one variable per row, hence the transpose.
# np.cov normalizes by N - 1 by default while StandardScaler divides by the
# population standard deviation, so the diagonal of the scaled matrix is
# N / (N - 1) rather than exactly 1.
setosa_features = iris_filtered.drop(columns='species')
setosa_features_scaled = iris_scaled_filtered.drop(columns='species')

iris_cov = np.cov(setosa_features.T)
iris_cov_scaled = np.cov(setosa_features_scaled.T)

In [5]:
# Covariance matrix computed from the unscaled setosa-only features.
iris_cov

array([[0.12424898, 0.09921633, 0.0163551 , 0.01033061],
       [0.09921633, 0.1436898 , 0.01169796, 0.00929796],
       [0.0163551 , 0.01169796, 0.03015918, 0.00606939],
       [0.01033061, 0.00929796, 0.00606939, 0.01110612]])

In [6]:
# Covariance matrix computed from the setosa-only features after standardization.
iris_cov_scaled

array([[1.02040816, 0.7577007 , 0.27262833, 0.28377383],
       [0.7577007 , 1.02040816, 0.1813265 , 0.23750205],
       [0.27262833, 0.1813265 , 1.02040816, 0.338398  ],
       [0.28377383, 0.23750205, 0.338398  , 1.02040816]])

### Pair distances

#### X not scaled and C not scaled

In [7]:
from distances import pair_distances

# Raw features for all species together with the covariance of the raw setosa
# rows; only the class pair and the Mahalanobis columns are shown here.
mahalanobis_columns = [
    "class 1",
    "class 2",
    "intra_mahala",
    "inter_mahala",
    "separated_mahala",
]
pair_distances(iris, iris_cov)[mahalanobis_columns]

Unnamed: 0,class 1,class 2,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,3.511074,11.652742,True
1,versicolor,setosa,7.399422,15.880855,True
2,setosa,virginica,3.511074,20.942646,True
3,virginica,setosa,9.165297,24.353891,True
4,versicolor,virginica,7.399422,4.925287,False
5,virginica,versicolor,9.165297,4.208967,False


#### X not scaled and C scaled

In [8]:
# Raw (unscaled) features for all species, passed together with the covariance
# of the standardized setosa rows.
pair_distances(iris, iris_cov_scaled)

Unnamed: 0,class 1,class 2,intra_euclid,inter_euclid,separated_euclid,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,1.24803,1.991176,True,1.251675,2.172485,True
1,versicolor,setosa,1.552569,2.861271,True,1.462245,2.946964,True
2,setosa,virginica,1.24803,3.495137,True,1.251675,3.437444,True
3,virginica,setosa,2.070507,4.344813,True,2.338991,4.277111,True
4,versicolor,virginica,1.552569,0.757147,False,1.462245,0.667338,False
5,virginica,versicolor,2.070507,0.651306,False,2.338991,0.649982,False


#### X scaled and C scaled

In [9]:
# Standardized features for all species, passed together with the covariance of
# the standardized setosa rows.
# NOTE(review): iris_scaled was standardized with the statistics of the full
# dataset, whereas iris_cov_scaled comes from setosa rows standardized on their
# own statistics — confirm this mix is intended.
pair_distances(iris_scaled, iris_cov_scaled)

Unnamed: 0,class 1,class 2,intra_euclid,inter_euclid,separated_euclid,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,2.670556,2.323117,False,3.247705,2.616195,False
1,versicolor,setosa,2.190617,2.411207,True,2.326167,2.223506,False
2,setosa,virginica,2.670556,3.194547,True,3.247705,3.510694,True
3,virginica,setosa,2.525097,3.555583,True,3.075154,3.326545,True
4,versicolor,virginica,2.190617,0.692602,False,2.326167,0.681864,False
5,virginica,versicolor,2.525097,0.551886,False,3.075154,0.554494,False


#### X scaled and C not scaled

In [10]:
# Standardized features for all species, passed together with the covariance of
# the raw (unscaled) setosa rows.
pair_distances(iris_scaled, iris_cov)

Unnamed: 0,class 1,class 2,intra_euclid,inter_euclid,separated_euclid,intra_mahala,inter_mahala,separated_mahala
0,setosa,versicolor,2.670556,2.323117,False,8.655777,14.013235,True
1,versicolor,setosa,2.190617,2.411207,True,7.27233,14.129801,True
2,setosa,virginica,2.670556,3.194547,True,8.655777,20.424137,True
3,virginica,setosa,2.525097,3.555583,True,9.341515,22.192119,True
4,versicolor,virginica,2.190617,0.692602,False,7.27233,3.260296,False
5,virginica,versicolor,2.525097,0.551886,False,9.341515,4.441214,False


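We can conclude that *setosa* and *versicolor* are separated, as well as *setosa* and *virginica*, in most configurations. The exceptions concern the *setosa*/*versicolor* pair once X is scaled: the Euclidean criterion fails in one direction, and the Mahalanobis criterion fails in both directions when C is scaled as well.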

However, we cannot assert that *versicolor* and *virginica* are separated.