In [1]:
import numpy as np
import statistics
import math
import scipy.stats
import pandas as pd

%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


### Preparing Data

In [2]:
# Base sample, plus a copy of it with one missing value (NaN) at index 3.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, np.nan, 4, 28.0]
print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [3]:
# Build np.ndarray and pd.Series versions of both samples.
y = np.array(x)
y_with_nan = np.array(x_with_nan)
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

for sample in (y, y_with_nan, z_with_nan):
    print(sample)

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


In [4]:
# Draw 10 samples from a standard normal distribution. A seeded Generator is
# used so that Restart & Run All reproduces the same values every time.
rng = np.random.default_rng(42)
X = rng.normal(loc=0, scale=1, size=10)

### Pengukuran Tendensi Sentral

1. Mean
2. Weighted Mean
3. Geometric Mean
4. Harmonic Mean
5. Median
6. Mode

### Mean Standard

In [5]:
# Arithmetic mean by hand: total divided by the number of observations.
total, count = sum(x), len(x)
mean_ = total / count
mean_

8.7

In [6]:
# Same arithmetic mean, computed with the standard-library statistics module.
mean_ = statistics.mean(x)
print(mean_)

8.7


In [7]:
# With a NaN in the data, statistics.mean returns nan (see the output below).
mean_ = statistics.mean(x_with_nan)
print(mean_)

nan


In [8]:
# Mean of the ndarray via the NumPy function form.
mean_ = np.mean(y)
mean_

8.7

In [9]:
# Same result via the ndarray method form.
mean_ = y.mean()
mean_

8.7

In [10]:
# Both the function form and the method form return nan when the array holds a NaN.
print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [11]:
# np.nanmean leaves the NaN entry out, giving the same mean as the NaN-free data.
np.nanmean(y_with_nan)

8.7

In [12]:
# Mean of the pandas Series.
mean_ = z.mean()
mean_

8.7

In [13]:
# The Series mean is still 8.7 with a NaN present, i.e. the NaN was skipped.
z_with_nan.mean()

8.7

### Weighted Mean

In [14]:
# Weighted mean by hand: values 2, 4, 8 with weights 0.2, 0.5, 0.3.
0.2 * 2 + 0.5 * 4 + 0.3 * 8

4.8

In [15]:
# Weighted mean in pure Python, computed two equivalent ways.
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]
total_weight = sum(w)

# Variant 1: walk both lists by position.
wmean = sum(weight * x[pos] for pos, weight in enumerate(w)) / total_weight
print(wmean)

# Variant 2: pair each value with its weight via zip.
wmean = sum(value * weight for value, weight in zip(x, w)) / total_weight
print(wmean)

6.95
6.95


In [16]:
# Weighted mean with np.average, once on the ndarray and once on the Series.
y = np.array(x)
z = pd.Series(x)
w = np.array(w)

for data in (y, z):
    wmean = np.average(data, weights=w)
    print(wmean)

6.95
6.95


In [17]:
# Weighted mean written out with array arithmetic: sum(w * y) / sum(w).
(w*y).sum() / w.sum()

6.95

### Harmonic Mean

In [18]:
# Harmonic mean by hand: number of values divided by the sum of reciprocals.
reciprocal_sum = sum(1 / value for value in x)
hmean = len(x) / reciprocal_sum
hmean

2.7613412228796843

In [19]:
# Harmonic mean via the standard-library statistics module.
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [20]:
# Harmonic mean of the ndarray via SciPy.
scipy.stats.hmean(y)

2.7613412228796843

In [21]:
# Same SciPy call on the pandas Series; the result is identical.
scipy.stats.hmean(z)

2.7613412228796843

### Geometric Mean

In [22]:
# Manual implementation: n-th root of the product of all values.
product = math.prod(x)
gmean = product ** (1 / len(x))
gmean

4.677885674856041

In [23]:
# Geometric mean of the plain list via SciPy.
scipy.stats.gmean(x)

4.67788567485604

In [24]:
# Geometric mean via SciPy again, this time passing the pandas Series.
scipy.stats.gmean(z)

4.67788567485604

### Median

In [25]:
# Manual median of the sample x.
# Fix: the length must come from x (the list being indexed), not from the
# unrelated random array X, whose 10 elements pushed the index past the end
# of the 5-element list and raised IndexError.
n = len(x)
x_ord = sorted(x)
mid = n // 2
if n % 2:
    # odd count: the single middle element
    median_ = x_ord[mid]
else:
    # even count: average of the two middle elements
    median_ = 0.5 * (x_ord[mid - 1] + x_ord[mid])

median_

4

In [None]:
# Show the sample again for reference.
x

[8.0, 1, 2.5, 4, 28.0]

In [None]:
# x[:-1] drops the last value to get an even-sized sample;
# median_low then returns the lower of the two middle values.
statistics.median_low(x[:-1])

2.5

In [None]:
# Same even-sized sample; median_high returns the higher of the two middle values.
statistics.median_high(x[:-1])

4

In [None]:
# NOTE(review): x_with_nan contains a NaN. The printed results (6.0, 4, 8.0)
# disagree with the median of the five real values (4), so these functions
# should not be trusted on data with NaN -- drop the NaN entries first.
print(statistics.median(x_with_nan))
print(statistics.median_low(x_with_nan))
print(statistics.median_high(x_with_nan))

6.0
4
8.0


In [None]:
# Median of the full ndarray (odd number of elements).
median_ = np.median(y)
print(median_)

4.0


In [None]:
# Even-sized slice: the result 3.25 is the average of the two middle values (2.5 and 4).
median_ = np.median(y[:-1])
print(median_)

3.25


### Mode

In [None]:
# Compute the mode manually.
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

# Rank each distinct value by (occurrences, value); the highest rank wins,
# so a tie in frequency resolves to the larger value.
mode_ = max(set(u), key=lambda candidate: (u.count(candidate), candidate))
mode_

2

In [None]:
# Extra sample; not used in this cell (it is passed to scipy.stats.mode further down).
X_mode = [1,2,2,3,3,3,4,4,4,4]
# Mode of u via the standard-library statistics module.
mode_ = statistics.mode(u)
mode_


2

In [None]:
# Mode via SciPy. u and v are first rebound from lists to ndarrays.
u, v = np.array(u), np.array(v)

mode_ = scipy.stats.mode(u)
mode_

  mode_ = scipy.stats.mode(u)


ModeResult(mode=array([2]), count=array([2]))

In [None]:
# v has two values tied at three occurrences (12 and 15); the result shown reports 12.
mode_ = scipy.stats.mode(v)
mode_

  mode_ = scipy.stats.mode(v)


ModeResult(mode=array([12]), count=array([3]))

In [None]:
# The result object exposes the modal value and its count as attributes.
print(mode_.mode)
print(mode_.count)

[12]
[3]


In [None]:
# Mode of X_mode via SciPy; the result is stored but not displayed.
mode_scipy = scipy.stats.mode(X_mode)

  mode_scipy = scipy.stats.mode(X_mode)


In [None]:
# Mode of pandas Series; NaN values are ignored by default.
u = pd.Series(u)
v = pd.Series(v)
w = pd.Series([2, 2, math.nan])

for series in (u, v, w):
    print(series.mode())

0    2
dtype: int32
0    12
1    15
dtype: int32
0    2.0
dtype: float64


### Pengukuran Variabilitas
1. Variance
2. Standar Deviasi (Simpangan Baku)
3. Skewness
4. Percentiles
5. Ranges

In [None]:
# Sample variance by hand: squared deviations from the mean, divided by n - 1.
n = len(x)
mean_ = sum(x) / n

squared_deviations = [(value - mean_) ** 2 for value in x]
var_ = sum(squared_deviations) / (n - 1)
var_

123.19999999999999

In [None]:
# Sample variance via the standard-library statistics module.
var_ = statistics.variance(x)
var_

123.2

In [None]:
# ddof=1 selects the n - 1 denominator, matching the manual sample variance.
var_ = np.var(y, ddof=1)
var_

123.19999999999999

In [None]:
# Same sample variance via the ndarray method form.
var_ = y.var(ddof=1)
var_

123.19999999999999

In [None]:
# Sample variance of the pandas Series.
z.var(ddof=1)

123.19999999999999

### Standard Deviation

In [None]:
# Standard deviation as the square root of the sample variance.
std_ = var_ ** 0.5
std_

11.099549540409285

In [None]:
# Sample standard deviation via the standard-library statistics module.
std_ = statistics.stdev(x)
std_

11.099549540409287

In [None]:
# Sample standard deviation via NumPy (ddof=1 for the n - 1 denominator).
np.std(y, ddof=1)

11.099549540409285

In [None]:
# Same value via the ndarray method form.
y.std(ddof=1)

11.099549540409285

In [None]:
# Sample standard deviation of the pandas Series.
z.std(ddof=1)

11.099549540409285

### Variance

In [None]:
# Compute the sample variance manually.
n = len(x)

mean_ = sum(x) / n

# n - 1 in the denominator gives the sample (not population) variance
var_ = sum((item - mean_)**2 for item in x) / (n - 1)
var_

123.19999999999999

In [None]:
# Sample variance via the standard-library statistics module.
var = statistics.variance(x)
var

123.2

In [None]:
# Sample variance via NumPy on an ndarray built from x (ddof=1 for n - 1).
var_np = np.var(np.array(x), ddof=1)
var_np

123.19999999999999

### Skewness

In [None]:
# Sample skewness using the simplified closed-form expression.
x = [8.0, 1, 2.5, 4, 28.0]

n = len(x)
mean_ = sum(x) / n

# Deviations from the mean are reused for the second and third moments.
deviations = [value - mean_ for value in x]
var_ = sum(dev ** 2 for dev in deviations) / (n - 1)
std_ = var_ ** 0.5

cubed_sum = sum(dev ** 3 for dev in deviations)
skew_ = cubed_sum * n / ((n - 1) * (n - 2) * std_ ** 3)

In [None]:
# Positive skewness: the longer tail of the distribution is on the right.
skew_

1.9470432273905929

In [None]:
# Rebuild the ndarrays, then compute the skewness with SciPy;
# bias=False reproduces the manually computed sample skewness.
y, y_with_nan = np.array(x), np.array(x_with_nan)
scipy.stats.skew(y,bias=False)

1.9470432273905927

In [None]:
# With a NaN in the array the SciPy result is nan.
scipy.stats.skew(y_with_nan,bias=False)

nan

In [None]:
# Rebuild the Series; Series.skew() gives the same sample skewness.
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
z.skew()

1.9470432273905924

In [None]:
# Same value with a NaN present, i.e. the NaN was skipped.
z_with_nan.skew()

1.9470432273905924

### Percentiles

In [None]:
# New nine-value sample for the percentile examples (rebinds x).
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

In [None]:
# n=2 yields a single cut point, which is the median.
statistics.quantiles(x, n=2)

[8.0]

In [None]:
# Quartiles (n=4) computed with the 'inclusive' method.
statistics.quantiles(x, n=4, method='inclusive')

[0.1, 8.0, 21.0]

In [None]:
# y is rebound to the new sample; 5th percentile via NumPy.
y = np.array(x)
np.percentile(y,5)

-3.44

In [None]:
# 95th percentile of the same array.
np.percentile(y,95)

34.919999999999995

In [None]:
# y_with_nan is still the earlier NaN-containing array at this point,
# so every requested percentile comes back as nan.
np.percentile(y_with_nan,[25,50,75])

array([nan, nan, nan])

In [None]:
# Median of the percentile sample.
np.median(y)

8.0

In [None]:
# Build a NaN-containing copy of the new sample by inserting NaN at index 2.
y_with_nan = np.insert(y,2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [None]:
# nanpercentile leaves the NaN out and returns the three quartiles.
np.nanpercentile(y_with_nan,[25,50,75])

array([ 0.1,  8. , 21. ])

In [None]:
# np.quantile takes a fraction instead of a percentage; 0.05 gives the same value as percentile 5.
np.quantile(y, 0.05)

-3.44

In [None]:
# 0.95 gives the same value as percentile 95.
np.quantile(y, 0.95)

34.919999999999995

In [None]:
# Several quantiles at once: the three quartiles.
np.quantile(y,[0.25,0.5,0.75])

array([ 0.1,  8. , 21. ])

In [None]:
# NaN-aware variant; the quartiles match those of the NaN-free array.
np.nanquantile(y_with_nan,[0.25,0.5,0.75])

array([ 0.1,  8. , 21. ])

### Ranges

In [None]:
# The range examples use the original five-value sample. x and y were rebound
# to the percentile data earlier in the notebook, which left y (range 46.0)
# out of step with z (range 27.0). Rebuild every container here so the whole
# section runs consistently from a fresh kernel.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, np.nan, 4, 28.0]
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

# np.ptp ("peak to peak") returns max - min.
np.ptp(y)


27.0

In [None]:
# np.ptp applied to the pandas Series z.
np.ptp(z)

27.0

In [None]:
# np.ptp returns nan when the array contains a NaN.
np.ptp(y_with_nan)

nan

In [27]:
# Same for the NaN-containing Series: the result is nan.
np.ptp(z_with_nan)

nan

In [26]:
# Range as maximum minus minimum, using NumPy functions.
np.amax(y) - np.amin(y)

27.0

In [28]:
# NaN-aware variant: nanmax and nanmin leave the NaN out.
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

27.0

In [29]:
# Range via the ndarray max/min methods.
y.max() - y.min()

27.0

In [30]:
# Range via the Series max/min methods.
z.max()- z.min()

27.0

In [31]:
# The Series max/min still give 27.0 with a NaN present, i.e. the NaN was skipped.
z_with_nan.max()- z_with_nan.min()

27.0

In [33]:
# Interquartile range: third quartile minus first quartile.
quartiles = np.quantile(y, [0.25, 0.75])
q1, q3 = quartiles
q3 - q1

5.5