# Desctiptive Statistics by Python

In [1]:
import math
import statistics
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from IPython.display import Math, Latex
from IPython.core.display import Image
from scipy.stats import uniform, norm, gamma, expon, poisson, binom, bernoulli

# seaborn configuration
sns.set(color_codes=True)
sns.set(rc={"figure.figsize": (5,5)})

In [2]:
scipy.__version__
np.__version__
pd.__version__
np.__version__

'1.19.2'

In [3]:
list_x = [8., 1, 2.5, 4, 28.]
list_x_nan = [8., 1, 2.5, math.nan, 4, 28.]

print(list_x)
print(list_x_nan)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [4]:
arr_x, arr_x_nan = np.array(list_x), np.array(list_x_nan)
series_x, series_x_nan = pd.Series(list_x), pd.Series(list_x_nan)

print(arr_x, arr_x_nan)
print(series_x, series_x_nan, sep="\n")

[ 8.   1.   2.5  4.  28. ] [ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## Mean

In [5]:
native_mean = sum(list_x) / len(list_x)
stats_mean = statistics.mean(list_x)

print(native_mean)
print(stats_mean)

8.7
8.7


In [6]:
print(list_x_nan)
native_mean_nan = sum(list_x_nan) / len(list_x_nan)
stats_mean_nan = statistics.mean(list_x_nan)
arr_mean_nan = arr_x_nan.mean()
np_mean_nan = np.mean(arr_x_nan)

print(native_mean_nan)
print(stats_mean_nan)
print(arr_mean_nan)
print(np_mean_nan)

[8.0, 1, 2.5, nan, 4, 28.0]
nan
nan
nan
nan


In [7]:
np_mean_ignoring_nan = np.nanmean(arr_x_nan)

print(np_mean_ignoring_nan)

8.7


In [8]:
series_x_nan.mean(), series_x_nan.mean(), series_x_nan.mean(skipna= False)

(8.7, 8.7, nan)

## Weighted Mean

In [9]:
list_weight = [.1, .2, .3, .25, .15]
print(list_x)
print(list_weight)

[8.0, 1, 2.5, 4, 28.0]
[0.1, 0.2, 0.3, 0.25, 0.15]


In [10]:
jumlah = 0
for idx in range(len(list_x)):
    jumlah += list_weight[idx] * list_x[idx]
print(jumlah)

6.95


In [11]:
weighted_mean = sum(list_weight[idx] * list_x[idx] for idx in range(len(list_x))) / sum(list_weight)
weighted_mean

6.95

In [12]:
weighted_mean2 = sum(w*x for w, x in zip (list_weight, list_x)) / sum(list_weight)
weighted_mean2

6.95

In [13]:
weighted_mean3 = np.average(list_x, weights=list_weight)
weighted_mean3

6.95

In [14]:
harmonic_mean = len(list_x) / sum(1 / x for x in list_x)
scipy_harmonic_mean = scipy.stats.hmean(list_x)

print(harmonic_mean)
print(scipy_harmonic_mean)
print("Native Arithmatic Mean:", sum(list_x) / len(list_x))

2.7613412228796843
2.7613412228796843
Native Arithmatic Mean: 8.7


In [15]:
geometric_mean = 1
for value in list_x:
    geometric_mean *= value
geometric_mean **= (1/len(list_x))
print(geometric_mean)

scipy_geometric_mean = scipy.stats.gmean(list_x)
print(scipy_geometric_mean)
print(scipy.stats.gmean(list_x_nan))

4.677885674856041
4.67788567485604
nan


## Median

In [16]:
num_data = len(list_x)
print(num_data)

if num_data % 2:    #odd
    native_median = sorted(list_x)[round((num_data-1) / 2)]
else:       #even
    median_idx = num_data / 2
    native_median = (sorted(x)[median_idx-1] + sorted(x)[median_idx-1]) / 2

print(native_median)


5
4


In [17]:
num_data = len(list_x[:-1])
print(num_data)

if num_data % 2:    #odd
    native_median = sorted(list_x)[round((num_data-1) / 2)]
else:       #even
    median_idx = round(num_data / 2)
    native_median = (sorted(list_x)[median_idx-1] + sorted(list_x)[median_idx-1]) / 2

print(native_median)


4
2.5


In [18]:
print(statistics.median(list_x))
print(statistics.median(list_x[:-1]))

print(statistics.median_low(list_x[:-1]))
print(statistics.median_high(list_x[:-1]))

print(sorted(list_x_nan))
print(statistics.median(list_x_nan))
print(statistics.median_low(list_x_nan))
print(statistics.median_high(list_x_nan))

print(sorted(list_x_nan[1:]))
print(statistics.median(list_x_nan[1:]))
print(statistics.median_low(list_x_nan[1:]))
print(statistics.median_high(list_x_nan[1:]))

np_median = np.median(list_x)
print(np_median)
print(np.median(list_x[:-1]))
print(np.median(list_x_nan))
print(np.median(list_x_nan[1:]))

print(np.nanmedian(list_x_nan))
print(np.nanmedian(list_x_nan[1:]))

4
3.25
2.5
4
[1, 2.5, 4, 8.0, nan, 28.0]
6.0
4
8.0
[1, 2.5, nan, 4, 28.0]
nan
nan
nan
4.0
3.25
nan
nan
4.0
3.25


## Mode

In [19]:
list_u = [2, 8, 3, 2, 12]
list_v = [12, 15, 21, 15, 12, 15, 12]

print(list_u)
print(list_v)

[2, 8, 3, 2, 12]
[12, 15, 21, 15, 12, 15, 12]


In [20]:
native_mod_u = max([(list_u.count(value), value) for value in set (list_u)])[1]
native_mod_v = max([(list_v.count(value), value) for value in set (list_v)])[1]

print(native_mod_u)
print(native_mod_v)

stats_mod_u = statistics.mode(list_u)
print(stats_mod_u)

scipy_mode_u = scipy.stats.mode(list_u)
scipy_mode_v = scipy.stats.mode(list_v)
print(scipy_mode_u)
print(scipy_mode_v)

series_u, series_v = pd.Series(list_u), pd.Series(list_v)
print(series_u.mode())
print(series_v.mode())

series_w = pd.Series([2, 2, math.nan, math.nan])
print(series_w.mode())
print(series_w.mode(dropna= False))

2
15
2
ModeResult(mode=array([2]), count=array([2]))
ModeResult(mode=array([12]), count=array([3]))
0    2
dtype: int64
0    12
1    15
dtype: int64
0    2.0
dtype: float64
0    2.0
1    NaN
dtype: float64


## Variance

In [21]:
n = len(list_x)
avg = sum(list_x) / n

native_variance = sum((value-avg)**2 for value in list_x) / (n-1)
print(native_variance)

stats_variance = statistics.variance(list_x)
print(stats_variance)
print(statistics.variance(list_x_nan))

np_variance = np.var(list_x, ddof=1)
print(np_variance)
print(np.var(list_x_nan, ddof=1))

arr_variance = arr_x.var(ddof=1)
print(arr_variance)
print(arr_x_nan.var(ddof=1))

series_variance= series_x.var()
print(series_variance)
series_variance_nan = series_x_nan.var()
print(series_variance_nan)

123.19999999999999
123.2
nan
123.19999999999999
nan
123.19999999999999
nan
123.19999999999999
123.19999999999999


In [22]:
native_stdev = native_variance ** .5
print(native_stdev)

stats_stdev = statistics.stdev(list_x)
print(stats_stdev)

np_stdev = np.std(list_x, ddof=1)
print(np_stdev)

arr_stdev = arr_x.std(ddof=1)
print(arr_stdev)

series_stdev= series_x.std()
print(series_stdev)
series_stdev_nan = series_x_nan.std()
print(series_stdev_nan)

11.099549540409285
11.099549540409287
11.099549540409285
11.099549540409285
11.099549540409285
11.099549540409285


## Skewness

In [23]:
n = len(list_x)
avg = sum(list_x) / n
var_ = sum((value-avg)**2 for value in list_x) / (n-1)
stdev_ = var_ ** .5

skewness_ = (sum((value - avg)**3 for value in list_x) * n / ((n-1) * (n-2) * stdev_**3))
print(skewness_)

1.9470432273905929


In [24]:
scipy.stats.skew(list_x, bias=False)

1.9470432273905927

In [25]:
scipy.stats.skew(list_x_nan, nan_policy="omit")

masked_array(data=1.3061163,
             mask=False,
       fill_value=1e+20)

In [26]:
series_x.skew()

1.9470432273905924

In [27]:
series_x_nan.skew()

1.9470432273905924

In [28]:
print(arr_x)
percentile_25 = np.percentile(arr_x, 25)
percentile_50 = np.percentile(arr_x, 50)
percentile_75 = np.percentile(arr_x, 75)

print(percentile_25)
print(percentile_50)
print(percentile_75)

print(np.percentile(arr_x, [25, 50, 75]))

[ 8.   1.   2.5  4.  28. ]
2.5
4.0
8.0
[2.5 4.  8. ]


In [29]:
series_x.quantile([.25, .5, .75])

0.25    2.5
0.50    4.0
0.75    8.0
dtype: float64

In [30]:
series_x_nan.quantile([.25, .5, .75])

0.25    2.5
0.50    4.0
0.75    8.0
dtype: float64

## Range

In [31]:
print(np.ptp(list_x))
print(np.ptp(list_x_nan))

27.0
nan


In [32]:
np.nanmax(list_x_nan) - np.nanmin(list_x_nan)

27.0

In [33]:
print(series_x.max() - series_x.min())
print(series_x_nan.max() - series_x_nan.min())

27.0
27.0


### Interquartile Range

In [34]:
np_quartiles = np.percentile(arr_x, [25, 50, 75])
print(np_quartiles)

np_iqr = np_quartiles[-1] - np_quartiles[0]
print(np_iqr)

[2.5 4.  8. ]
5.5


## Covariance

In [35]:
list_x = list(range(-10, 11))
list_y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
print(len(list_x), len(list_y))

21 21


In [36]:
mean_x, mean_y = sum(list_x)/len(list_x), sum(list_y)/len(list_y)
print(mean_x, mean_y)

sigma_x, sigma_y = np.array([x-mean_x for x in list_x]), np.array([y-mean_y for y in list_y])
sigma_x, sigma_y

cov_xy = (sum((list_x[idx] - mean_x) * (list_y[idx] - mean_y) for idx in range (n)) / (n-1))
print(cov_xy)

0.0 5.714285714285714
42.142857142857146


In [37]:
np.cov(list_x, list_y)

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

## Correlation

In [38]:
print(list_x)
print(list_y)
r, p = scipy.stats.pearsonr(list_x, list_y)

[-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]


In [39]:
np.corrcoef(list_x, list_y)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [40]:
series_x.corr(series_x)

1.0

In [41]:
pd.Series(list_x).corr(pd.Series(list_y))

0.8619500056316061

## Correlation in 2 D

In [42]:
data = np.random.randint()

TypeError: randint() takes at least 1 positional argument (0 given)