# 산포통계

In [16]:
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## 분산 계산

In [2]:
# Variance of one small sample computed three ways. The divisor is n - ddof:
# ddof=1 gives the sample variance, ddof=0 the population variance.
x = [1, 2, 3, 4, 5]

variances = (
    np.var(x, ddof=1),         # NumPy function, sample variance
    np.array(x).var(),         # ndarray method, ddof left at NumPy's default
    pd.Series(x).var(ddof=0),  # pandas Series, population variance
)
for value in variances:
    print(value)

2.5
2.0
2.0


## 표준편차 계산

In [3]:
# Standard deviation of the same sample, again switching the divisor via ddof
# (ddof=1 -> sample, ddof=0 -> population).
x = [1, 2, 3, 4, 5]

deviations = (
    np.std(x, ddof=1),         # NumPy function, sample standard deviation
    np.array(x).std(ddof=0),   # ndarray method, population standard deviation
    pd.Series(x).std(ddof=1),  # pandas Series, sample standard deviation
)
for value in deviations:
    print(value)

1.5811388300841898
1.4142135623730951
1.5811388300841898


## 변동계수의 필요성

In [4]:
# Two samples with the same shape but different scale: x2 is x1 times 10.
x1 = np.array([1, 2, 3, 4, 5])
x2 = 10 * x1

# The sample standard deviation (ddof=1) grows with the scale of the data.
for sample in (x1, x2):
    print(np.std(sample, ddof=1))

1.5811388300841898
15.811388300841896


In [5]:
# Coefficient of variation (std / mean) via SciPy. No ddof is passed here, so
# SciPy's default is used for the standard deviation.
for sample in (x1, x2):
    print(stats.variation(sample))

0.47140452079103173
0.4714045207910317


In [6]:
# Coefficient of variation computed by hand from the sample standard
# deviation (ddof=1) divided by the mean.
for sample in (x1, x2):
    print(np.std(sample, ddof=1) / np.mean(sample))

0.5270462766947299
0.5270462766947299


## 스케일링

In [7]:
# Inputs for the scaling examples: the integers 1..5 and the same values times 10.
x1 = np.arange(1, 6)
x2 = 10 * x1

In [8]:
# Standard (z-score) scaling: subtract the mean, then divide by the standard
# deviation as returned by ndarray.std() with its default ddof.
z1, z2 = ((arr - arr.mean()) / arr.std() for arr in (x1, x2))

print(z1)
print(z2)

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


In [9]:
# Min-max scaling: shift by the minimum and divide by the range (max - min),
# which maps the smallest value to 0 and the largest to 1.
z1, z2 = ((arr - arr.min()) / (arr.max() - arr.min()) for arr in (x1, x2))

print(z1)
print(z2)

[0.   0.25 0.5  0.75 1.  ]
[0.   0.25 0.5  0.75 1.  ]


In [14]:
# Two-column frame for the scikit-learn scalers; x2 is x1 multiplied by 10.
X = pd.DataFrame({"x1": [1, 2, 3, 4, 5]}).assign(x2=lambda df: df["x1"] * 10)

X

Unnamed: 0,x1,x2
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


In [17]:
# Min-max scaling with scikit-learn: fit the scaler on X, then transform X.
scaler = MinMaxScaler().fit(X)
z = scaler.transform(X)

# Wrap the transformed values in a frame with the original column labels.
pd.DataFrame(z, columns=["x1", "x2"])

Unnamed: 0,x1,x2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


In [19]:
# Standard scaling with scikit-learn: fit the scaler on X, then transform X.
ss_scaler = StandardScaler().fit(X)
s = ss_scaler.transform(X)

# Wrap the transformed values in a frame with the original column labels.
pd.DataFrame(s, columns=["x1", "x2"])

Unnamed: 0,x1,x2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


## 범위와 사분위 범위 계산

In [20]:
# Seed the generator so this sample, and every statistic computed from `x`
# in later cells, is identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# rng.normal(mean, standard deviation, size=number of samples)
x = rng.normal(100, 20, size=1000)
x

array([ 52.73904721,  92.46581377,  93.68802953,  61.30774379,
        99.57352737, 107.48026843, 105.30859962, 121.78567927,
       117.57519533,  76.12831197, 134.26491489,  51.46455215,
        97.82399436, 110.85134777,  96.30900214, 117.15918762,
       105.10435777,  89.85651527,  71.17960667,  81.3006357 ,
        97.6182541 ,  88.47815204, 119.04891128, 127.02144833,
        76.82590788,  87.76350792,  96.75922665,  92.417873  ,
        93.67964516, 111.43822695,  78.84323281,  85.16526722,
       103.6608268 ,  89.36552551, 103.69360037,  96.99247556,
       132.28618847,  99.18571793,  89.70834327, 103.69580862,
        97.76771188, 115.57052455, 127.09557892,  99.91051843,
        93.92680155,  98.29789086,  72.96852826,  98.63564913,
       118.05298004,  65.21447555, 124.25687342, 114.78375484,
       114.52500287,  70.38658575,  81.84594111,  95.85858916,
        79.55779188,  68.57514091, 112.1836658 , 133.64085168,
       126.7846973 ,  86.17027575, 109.46502043, 113.33

In [21]:
# Range of the sample: NumPy's peak-to-peak helper next to max - min by hand.
peak_to_peak = np.ptp(x)
max_minus_min = x.max() - x.min()

print(peak_to_peak)
print(max_minus_min)

114.0278689548434
114.0278689548434


In [22]:
# Interquartile range: the 0.75 quantile minus the 0.25 quantile, computed by
# hand and then with SciPy's helper.
lower_quartile = np.quantile(x, 0.25)
upper_quartile = np.quantile(x, 0.75)

print(upper_quartile - lower_quartile)
print(stats.iqr(x))

27.333407026072337
27.333407026072337
