# Descriptive Statistics with Python

## 1. Measures of Central Tendency
- Mean
- Median
- Mode

In [1]:
import numpy as np
import scipy as sp
import math

In [2]:
# random data points
data = list(np.random.randint(low = 10, high = 100, size = 50))
print(data)

[21, 54, 40, 28, 75, 75, 36, 99, 24, 47, 62, 99, 10, 23, 90, 22, 29, 33, 25, 84, 89, 50, 66, 41, 73, 81, 13, 80, 40, 61, 45, 52, 41, 78, 21, 83, 26, 32, 68, 89, 91, 54, 46, 56, 45, 42, 30, 25, 82, 78]


### Mean

In [3]:
# mean = (sum of all data points)/total number of datapoints

# calculating mean manually
total_sum = sum(data)
n = len(data)
mean = total_sum/n
print("Mean(manually): ", mean)

# calculating mean using numpy 
np_mean = np.mean(data)
print("Mean using numpy: ", np_mean)

# calculating mean using scipy
sp_mean = sp.stats.tmean(data)
print("Mean using scipy: ", sp_mean)

Mean(manually):  53.08
Mean using numpy:  53.08
Mean using scipy:  53.08


### Median

In [4]:
# median = (n/2)th term, if n is odd
# median = [(n/2)th term + ((n/2)+1)th term]/2

# median(manually)
n = len(data)
asc_ord_data = sorted(data)
if n%2 == 0:
    first_term = asc_ord_data[(n//2)-1]
    second_term = asc_ord_data[(n//2)]
    median = (first_term+second_term)/2
    print("Median(manually): ", median)
else:
    median = data[n//2]
    print("Median(manually): ", median)

# median using numpy 
np_median = np.median(data)
print("Median using numpy: ", np_median)

# median using scipy
sp_median = sp.ndimage.median(data)
print("Median using scipy: ", sp_median)

Median(manually):  48.5
Median using numpy:  48.5
Median using scipy:  48.5


### Mode

In [5]:
# mode = most frequent data point or element in dataset

# mode(manually)
frequency_count = {}
for data_point in data:
    if data_point in frequency_count:
        frequency_count[data_point] += 1
    else:
        frequency_count[data_point] = 1
mode = max(frequency_count, key = frequency_count.get)
print("Mode(manually): ", mode)

# mode using scipy
mode = sp.stats.mode(data)
print("Mode using scipy: ", mode.mode)

Mode(manually):  21
Mode using scipy:  21


### Emperical Relation between mean, median and mode
<center><strong><h3>2Mean + Mode = 3Median</h3></strong></center>

## 2. Measures of Dispersion
- Range
- Variance
- Standard Deviation
- Mean Deviation
- Quartiles
- Percentiles

### Range

In [6]:
# range of data
R = max(data) - min(data)
print("Range of data is: ", R)

Range of data is:  89


### Variance

In [7]:
# varaince = (sum of square of difference of data points from center point(mean))/n

# Variance(manually)
n = len(data)
total_sum = sum(data)
mean = total_sum/n

sqr_sum = sum(list((xi-mean)**2 for xi in data))
variance = sqr_sum/n
print("Variance(manually): ", variance)                       # without bessel's correction

# variance using numpy
variance = np.var(data)
print("Variance using numpy: ", variance)                     # without bessel's correction

# variance using scipy
variance = sp.ndimage.variance(np.array(data))
print("Variance using scipy: ", variance)

Variance(manually):  626.9135999999999
Variance using numpy:  626.9135999999999
Variance using scipy:  626.9135999999999


### Standard Deviation

In [8]:
# standard devaition = square root of variance

# manually
SD = math.sqrt(variance)
print("Standard Deviation(manually): ", SD)

# using numpy
SD = np.std(data)
print("Standard Deviation using numpy: ", SD)

# using scipy
SD = sp.stats.tstd(data, ddof = 0)
print("Standard Deviation using scipy: ", SD)

Standard Deviation(manually):  25.03824274984169
Standard Deviation using numpy:  25.03824274984169
Standard Deviation using scipy:  25.03824274984169


### Mean Deviation

In [9]:
# mean deviation = (sum of absolute difference of center point and data points)/n

# manually
n = len(data)
mean = sum(data)/n
abs_diff = list(map(lambda xi: (xi-mean)*(-1) if (xi-mean)<0 else (xi-mean), data))
MD = sum(abs_diff)/n
print("Mean Deviation(manually): ", MD)

Mean Deviation(manually):  21.846400000000003


### Percentiles

In [10]:
# Percentiles = value of [i*(n+1)/100]th term

# manually
n = len(data)
i = 25
sorted_data = sorted(data)
term = i*(n+1)//100
p25 = sorted_data[term]
print("25th percentile(manually): ", p25)

# using numpy
p25 = np.percentile(data, 25)
print("25th percentile using numpy: ", p25)

# using scipy
p60 = sp.stats.scoreatpercentile(data, 60)
print("60th percentilw using scipy: ", p60)

25th percentile(manually):  30
25th percentile using numpy:  30.5
60th percentilw using scipy:  57.99999999999999


### Quantiles

In [11]:
# Quartiles = value of [i*(n+1)/4]th term

# manually
n = len(data)
i = 1
sorted_data = sorted(data)
term = i*(n+1)//4
Q1 = sorted_data[term]
print("First quartile Q1 (manually): ", Q1)

# using numpy
Q1 = np.percentile(data, 25)
print("First quartile Q1 using numpy: ", Q1)

# using scipy
Q3 = sp.stats.scoreatpercentile(data, 75)
print("Third quartile Q3 using scipy: ", Q3)

First quartile Q1 (manually):  30
First quartile Q1 using numpy:  30.5
Third quartile Q3 using scipy:  77.25


In [12]:
# IQR - Inter Quartile Range = Q3 - Q1

IQR = Q3-Q1
print("IQR: ", IQR)

IQR:  46.75


### 5 Number Summary
- Minimum value
- First quartile Q1
- Second quartile Q2
- Third quartile Q3
- Maximum value

In [13]:
# min value
min_value = min(data)
print("Minimum value: ", min_value)

# first quartile
Q1 = np.percentile(data, 25)
print("First Quartile: ", Q1)

# second quartile
Q2 = np.percentile(data, 50)
print("Second Quartile: ", Q2)

# third quartile
Q3 = np.percentile(data, 75)
print("Third Quartile: ", Q3)

# max value
max_value = max(data)
print("Maximum value of in data points: ", max_value)

Minimum value:  10
First Quartile:  30.5
Second Quartile:  48.5
Third Quartile:  77.25
Maximum value of in data points:  99


## 3. Measures of Shape
- Skewness
- Kurtosis

### Skewness

In [14]:
# Skewness = sum of ((datapointd-mean)**3)/n*SD**3

# using numpy
mean = np.mean(data)
SD = np.std(data)
skewness = np.mean((data-mean)**3)/SD**3
print("Skewness using numpy: ", skewness)

# using scipy
skewness = sp.stats.skew(data)
print("Skewness using scipy: ", skewness)

Skewness using numpy:  0.1883146512738904
Skewness using scipy:  0.18831465127389038


### Kurtosis

In [15]:
# kurtosis = [(sum of (datapoint-mean)**4/n*SD**4]

# using numpy
kurtosis = (np.mean((data-mean)**4)/SD**4) - 3
print("Kurtosis using numpy:", kurtosis)

# using scipy
kurtosis = sp.stats.kurtosis(data)
print("Kurtosis using scipy:", kurtosis)

Kurtosis using numpy: -1.2110616213691623
Kurtosis using scipy: -1.2110616213691618
