In [79]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib as pyplot

Obliczanie statystyk opisowych

In [80]:
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]
x

[8.0, 1, 2.5, 4, 28.0]

In [81]:
x_with_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [82]:
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [83]:
y_with_nan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

In [84]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [85]:
z_with_nan

0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64

### Miary tendencji centralnej

Średnia arytmetyczna

In [86]:
mean_ = sum(x) / len(x)
mean_

8.7

In [87]:
mean_ = statistics.mean(x)
mean_

8.7

In [88]:
mean_ = statistics.fmean(x)
mean_

8.7

Średnia ważona

In [89]:
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]
wmean = sum(w[i] * x[i] for i in range(len(x))) / sum(w)
wmean

6.95

In [90]:
wmean = sum(x_ * w_ for (x_, w_) in zip(x, w)) / sum(w)
wmean

6.95

Dla większych zbiorów danych liczenie średniej przez Numpy

In [91]:
y, z, w = np.array(x), pd.Series(x), np.array(w)
wmean = np.average(y, weights=w)
wmean

np.float64(6.95)

In [92]:
wmean = np.average(z, weights=w)
wmean

np.float64(6.95)

Iloczyn elementarny

In [93]:
(w * y).sum() / w.sum()

np.float64(6.95)

Średnia harmoniczna

In [94]:
hmean = len(x) / sum(1 / item for item in x)
hmean

2.7613412228796843

In [95]:
hmen = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [96]:
statistics.harmonic_mean(x_with_nan)

nan

In [97]:
statistics.harmonic_mean([1, 0, 2])

0

In [98]:
statistics.harmonic_mean([1, 2, -2])  # StatisticsError: harmonic mean does not support negative values

StatisticsError: harmonic mean does not support negative values

In [99]:
scipy.stats.hmean(y)

np.float64(2.7613412228796843)

In [100]:
scipy.stats.hmean(z)

np.float64(2.7613412228796843)

Średnia geometryczna

In [101]:
gmean = 1
for item in x:
    gmean *= item
gmean

2240.0

In [102]:
gmean **= 1 / len(x)
gmean

4.677885674856041

Mediana

In [103]:
n = len(x)
if n % 2:
    median_ = sorted(x)[round(0.5 * (n - 1))]
else:
    x_ord, index = sorted(x), round(0.5 * n)
    median_ = 0.5 * (x_ord[index - 1] + x_ord[index])

median_

4

Mediana z modułu statistics

In [104]:
median_ = statistics.median(x)
median_

4

In [105]:
median_ = statistics.median(x[:-1])
median_

3.25

Dla parzystej liczby elementów median high i low

In [106]:
statistics.median_low(x[:-1])

2.5

In [107]:
statistics.median_high(x[:-1])

4

x[:-1] to [1, 2.5, 4, 8.0]
Dwa elementy w środku to 2.5 i 4.

Metody zwracają wartości jeśli znajdują się w zbiorze wartości nan.

In [108]:
print(x_with_nan)

[8.0, 1, 2.5, nan, 4, 28.0]


In [109]:
statistics.median(x_with_nan)

6.0

In [110]:
statistics.median_low(x_with_nan)

4

In [111]:
statistics.median_high(x_with_nan)

8.0

Mediana z modułu NumPy

In [112]:
median_ = np.median(y)
median_

np.float64(4.0)

In [113]:
median_ = np.median(y[:-1])
median_

np.float64(3.25)

Dla ignorowania wartości nan jest funkcja np.nanmedian()

In [114]:
np.nanmedian(y_with_nan)

np.float64(4.0)

In [115]:
np.nanmedian(y_with_nan[:-1])

np.float64(3.25)

W Pandas metoda .median() domyślnie ignoruje wartości nan

In [116]:
z.median()

np.float64(4.0)

In [117]:
z_with_nan.median()

np.float64(4.0)

Tryb (moda)

In [118]:
u = [2, 3, 2, 8, 12]
mode_ = max((u.count(item), item) for item in set(u))[1]
mode_

2

Tryb w module statistics (funkcja multimode jeśli występuje więcej niż jedna wartość modalna)

In [119]:
mode_ = statistics.mode(u)
mode_

2

In [120]:
mode_ = statistics.multimode(u)
mode_

[2]

In [121]:
v = [12, 15, 12, 15, 21, 15, 12]

In [122]:
statistics.mode(v)

12

In [123]:
statistics.multimode(v)

[12, 15]

Wartości nan w module statistics

In [124]:
statistics.mode([2, math.nan, 2])

2

In [125]:
statistics.multimode([2, math.nan, 2])

[2]

In [126]:
statistics.mode([2, math.nan, 0, math.nan, 5])

nan

In [127]:
statistics.multimode([2, math.nan, 0, math.nan, 5])

[nan]

Tryb w module SciPy

In [128]:
u, v = np.array(u), np.array(v)
mode_ = scipy.stats.mode(u)
mode_

ModeResult(mode=np.int64(2), count=np.int64(2))

In [129]:
mode_ = scipy.stats.mode(v)
mode_

ModeResult(mode=np.int64(12), count=np.int64(3))

Notacja kropkowa w NumPy dla trybu i liczby jego wystąpień

In [130]:
mode_.mode

np.int64(12)

In [131]:
mode_.count

np.int64(3)

Metoda .mode() w obiektach Pandas Series obsługuje wartości multimodalne i ignoruje wartości nan

In [132]:
u, v, w = pd.Series(u), pd.Series(v), pd.Series([2, 2, math.nan])
u.mode()

0    2
dtype: int64

In [133]:
v.mode()

0    12
1    15
dtype: int64

In [134]:
w.mode()

0    2.0
dtype: float64

### Miary zmienności

Wariancja próby

In [135]:
n = len(x)
mean_ = sum(x) / n
var_ = sum((item - mean_) ** 2 for item in x) / (n - 1)
var_

123.2

In [136]:
var_ = statistics.variance(x)
var_

123.2

Wartości nan w module statistics

In [137]:
statistics.variance(x_with_nan)

nan

Wartości nan w module NumPy

In [138]:
var_ = np.var(y, ddof=1)
var_

np.float64(123.19999999999999)

In [139]:
var_ = v.var(ddof=1)
var_

np.float64(10.285714285714286)

In [140]:
np.var(y_with_nan, ddof=1)

np.float64(nan)

In [141]:
y_with_nan.var(ddof=1)

np.float64(nan)

In [142]:
np.nanvar(y_with_nan, ddof=1)

np.float64(123.19999999999999)

Wartości nan w module Pandas

In [143]:
z.var(ddof=1)

np.float64(123.19999999999999)

In [144]:
z_with_nan.var(ddof=1)

np.float64(123.19999999999999)

Wariancja populacji

In [145]:
statistics.pvariance(x)

98.56

In [146]:
np.var(y, ddof=0)

np.float64(98.55999999999999)

In [147]:
z.var(ddof=0)

np.float64(98.55999999999999)

Odchylenie standardowe

In [148]:
std_ = var_ ** 0.5
std_

np.float64(3.2071349029490928)

Odchylenie standardowe w module statistics

In [149]:
std_ = statistics.stdev(x)
std_

11.099549540409287

Odchylenie standardowe w module NumPy

In [150]:
np.std(y, ddof=1)

np.float64(11.099549540409285)

In [151]:
y.std(ddof=1)

np.float64(11.099549540409285)

In [152]:
np.std(y_with_nan, ddof=1)

np.float64(nan)

In [153]:
y_with_nan.std(ddof=1)

np.float64(nan)

In [154]:
np.nanstd(y_with_nan, ddof=1)

np.float64(11.099549540409285)

Odchylenie standardowe w module Pandas

In [155]:
z.std(ddof=1)

np.float64(11.099549540409285)

In [156]:
z_with_nan.std(ddof=1)

np.float64(11.099549540409285)

Skośność

In [157]:
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)
mean_ = sum(x) / n
var_ = sum((item - mean_) ** 2 for item in x) / (n - 1)
std_ = var_ ** 0.5
skew_ = sum((item - mean_) ** 3 for item in x) * n / ((n - 1) * (n - 2) * std_ ** 3)
skew_

1.947043227390592

W tym przypadku skośność jest dodatnia, co symbolizuje ogon po prawej stronie.

Skośność w module Scipy

In [158]:
y, y_with_nan = np.array(x), np.array(x_with_nan)
scipy.stats.skew(y, bias=False)

np.float64(1.9470432273905927)

In [159]:
scipy.stats.skew(y_with_nan, bias=False)

np.float64(nan)

Skośność w module Pandas

In [160]:
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
z.skew()

np.float64(1.9470432273905924)

In [161]:
z_with_nan.skew()

np.float64(1.9470432273905924)

Kurtoza

In [162]:
from scipy.stats import kurtosis

In [163]:
print(kurtosis(x, axis=0, bias=True))

-0.030495095281138695


Kurtoza zbioru x jest mniejsza niż 3 co czyni ją rozkładem playkurtycznym.

Percentyle

In [164]:
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]
statistics.quantiles(x, n=2)

[8.0]

In [165]:
statistics.quantiles(x, n=4, method='inclusive')

[0.1, 8.0, 21.0]

In [166]:
y = np.array(x)
np.percentile(y, 5)

np.float64(-3.44)

In [167]:
np.percentile(y, 95)

np.float64(34.919999999999995)

In [168]:
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [169]:
np.median(y)

np.float64(8.0)

Ignorowanie wartości nan

In [170]:
y_with_nan = np.insert(y, 2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [171]:
np.nanpercentile(y_with_nan, [25, 50, 75])

array([ 0.1,  8. , 21. ])

Kwantyle w module NumPy

In [172]:
np.quantile(y, 0.05)

np.float64(-3.44)

In [173]:
np.quantile(y, 0.95)

np.float64(34.919999999999995)

In [174]:
np.quantile(y, [0.25, 0.50, 0.75])

array([ 0.1,  8. , 21. ])

In [175]:
np.nanquantile(y_with_nan, [0.25, 0.50, 0.75])

array([ 0.1,  8. , 21. ])

Kwantyle w module Pandas

In [176]:
z, z_with_nan = pd.Series(y), pd.Series(y_with_nan)
z.quantile(0.05)

np.float64(-3.44)

In [177]:
z.quantile(0.95)

np.float64(34.919999999999995)

In [178]:
z.quantile([0.25, 0.50, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [179]:
z_with_nan.quantile([0.25, 0.50, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

Zakresy

In [180]:
np.ptp(y)

np.float64(46.0)

In [181]:
np.ptp(z)

np.float64(46.0)

In [182]:
np.ptp(y_with_nan)

np.float64(nan)

In [183]:
np.ptp(z_with_nan)

np.float64(nan)

Zakresy za pomocą maksimów i minimów sekwencji

In [184]:
np.amax(y) - np.amin(y)

np.float64(46.0)

In [185]:
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

np.float64(46.0)

In [186]:
y.max() - y.min()

np.float64(46.0)

In [187]:
z.max() - z.min()

np.float64(46.0)

In [188]:
z_with_nan.max() - z_with_nan.min()

np.float64(46.0)

Zakres interkwartylowy

In [189]:
quartiles = np.quantile(y, [0.25, 0.75])
quartiles[1] - quartiles[0]

np.float64(20.9)

In [190]:
quartiles = z.quantile([0.25, 0.75])
quartiles[0.75] - quartiles[0.25]

np.float64(20.9)

### Podsumowanie statystyk opisowych

Describe w module SciPy

In [191]:
result = scipy.stats.describe(y, ddof=1, bias=False)
result

DescribeResult(nobs=9, minmax=(np.float64(-5.0), np.float64(41.0)), mean=np.float64(11.622222222222222), variance=np.float64(228.75194444444446), skewness=np.float64(0.9249043136685094), kurtosis=np.float64(0.14770623629658886))

In [192]:
result.nobs

9

In [193]:
result.minmax[0]  # Min

np.float64(-5.0)

In [194]:
result.minmax[1]  # Max

np.float64(41.0)

In [195]:
result.mean

np.float64(11.622222222222222)

In [196]:
result.variance

np.float64(228.75194444444446)

In [197]:
result.skewness

np.float64(0.9249043136685094)

In [198]:
result.kurtosis

np.float64(0.14770623629658886)

Describe w module Pandas

In [199]:
result = z.describe()
result

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

In [200]:
result['mean']

np.float64(11.622222222222222)

In [201]:
result['std']

np.float64(15.12454774346805)

In [202]:
result['min']

np.float64(-5.0)

In [203]:
result['max']

np.float64(41.0)

In [204]:
result['25%']

np.float64(0.1)

In [205]:
result['50%']

np.float64(8.0)

In [206]:
result['75%']

np.float64(21.0)