# Statistical analysis
For machine learning. _María Camila Vásquez Correa_

### Necessary packages

In [15]:
import numpy as np
import itertools
import pandas as pd
from sklearn.datasets import load_iris
from scipy.stats import shapiro,normaltest,kurtosis,pearsonr,kendalltau,zscore,chi2_contingency, ttest_ind, spearmanr, f_oneway, ttest_rel
from statsmodels.tsa.stattools import adfuller, kpss
import warnings
warnings.simplefilter('ignore')

## Preprocessing

In [8]:
# Data loading
iris = load_iris().data
iris = pd.DataFrame(iris)
N = 4
n_samples = 150
# Missing values
iris.isna().any().any()

False

So, there is not missing values, so we can proceed to normalization

In [6]:
for i in range(N):
    iris[i] = iris[i]/np.max(iris[i])

The normalization technique used was $\frac{x_i}{x_{max}}$

## Descriptive analysis

In [9]:
iris.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [10]:
iris.median()

0    5.80
1    3.00
2    4.35
3    1.30
dtype: float64

In [13]:
kurtosis(iris)

array([-0.57356795,  0.18097632, -1.39553589, -1.33606741])

## Distribution tests and stationarity

In [18]:
descriptive = pd.DataFrame()
sw = []
normal = []
dfuller = []
kp = []
for i in range(N):
    stat, p1 = shapiro(iris[i])
    sw.append(1 if p1 > 0.05 else 0)
    stat, p2 = normaltest(iris[i])
    normal.append(1 if p2 > 0.05 else 0)
    stat, p3, lags, obs, crit, t = adfuller(iris[i])#, nlags = 'auto')
    dfuller.append(1 if p3 > 0.05 else 0)
    stat, p4, lags, crit = kpss(iris[1], nlags = 'auto')
    kp.append(1 if p3 > 0.05 else 0)
descriptive['Shapiro'] = sw
descriptive['D\'Angostino'] = normal
descriptive['Dickie Fuller'] = dfuller
descriptive['Kpss'] = kp
descriptive

Unnamed: 0,Shapiro,D'Angostino,Dickie Fuller,Kpss
0,0,1,1,1
1,1,1,1,1
2,0,0,1,1
3,0,0,1,1


## Independence tests

In [20]:
per = itertools.combinations(range(N),2)
distribution = pd.DataFrame()
pearson = []
spearman = []
kendall = []
chi = []
tstu = []
for i in per:
    stat, p1 = pearsonr(iris[i[0]],iris[i[1]])
    pearson.append(1 if p1 > 0.05 else 0)
    stat, p2 = spearmanr(iris[i[0]],iris[i[1]])
    spearman.append(1 if p2 > 0.05 else 0)
    stat, p4 = kendalltau(iris[i[0]],iris[i[1]])
    kendall.append(1 if p4 > 0.05 else 0)
    stat, p3, dof, expected = chi2_contingency(iris[i[0]],iris[i[1]])
    chi.append(1 if p3 > 0.05 else 0)
    stat, p5 = ttest_ind(iris[i[0]],iris[i[1]])
    tstu.append(1 if p5 > 0.05 else 0)
distribution['Pearson'] = pearson
distribution['Spearman'] = spearman
distribution['Kendall'] = kendall
distribution['Chi2'] = chi
distribution['Tstudent'] = tstu
distribution

Unnamed: 0,Pearson,Spearman,Kendall,Chi2,Tstudent
0,1,0,1,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
5,0,0,0,1,0


In [21]:
# Analysis of variance
stat, p = f_oneway(iris[1], iris[2], iris[3], iris[0])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
	print('Probably the same distribution')
else:
	print('Probably different distributions')

stat=482.915, p=0.000
Probably different distributions


### Outliers

In [22]:
np.where(zscore(iris) >= 2.5)

(array([15, 33]), array([1, 1]))