# Measures of Location
A measure of location is a single number that describes the central or typical value of a set of data. <br>
We will look at the Mean, Weighted mean, Median, Mode.

In [1]:
import math
import statistics
import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt

In [2]:
# Sample data as plain Python lists; the second list carries one missing
# value (nan) at index 3.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]

# The same observations as NumPy arrays ...
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# ... and as pandas Series.
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

## Sample Arithmetic mean 
$\bar{X}=\frac{1}{N} \sum_{i=0}^{N-1}x_{i}$ where $N$ is the number of observations

In [10]:
#Pure Python:
mean_ = sum(x)/len(x)
mean_nan = sum(x_with_nan)/ len(x_with_nan)   #len(x_with_nan) returns 6
mean_, mean_nan

(8.7, nan)

In [11]:
#Statistics library
mean_ = statistics.mean(x)
mean_nan = statistics.mean(x_with_nan)
mean_, mean_nan

(8.7, nan)

In [12]:
#NumPy - It offers both a function and a method
mean_ = np.mean(y)   #or y.mean()
mean_nan = np.mean(y_with_nan)   #or y_with_nan.mean()
mean_, mean_nan

(8.7, nan)

In [14]:
#NumPy - To ignore nan values
mean_ = np.nanmean(y)
mean_nan = np.nanmean(y_with_nan)
mean_, mean_nan

(8.7, 8.7)

In [15]:
#Pandas - ignores nan values by default
mean_ = z.mean()
mean_nan = z_with_nan.mean()
mean_, mean_nan

(8.7, 8.7)

## Weighted Mean 
$\bar{X}=\frac{\sum_{i}w_{i}x_{i}}{\sum_{i}w_{i}}$

In [10]:
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, math.nan, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

In [12]:
# Pure Python: weighted sum of the observations divided by the total weight.
# zip() pairs every observation with its own weight, so each list is walked
# directly instead of being indexed by the length of a different list.
total_weight = sum(w)
wmean = sum(x_i * w_i for x_i, w_i in zip(x, w)) / total_weight
# A single nan observation propagates through the sum, so this result is nan.
wmean_nan = sum(x_i * w_i for x_i, w_i in zip(x_with_nan, w)) / total_weight
wmean, wmean_nan

(6.95, nan)

In [14]:
# NumPy: convert the data and the weights to arrays, then let np.average
# apply the weights.
y = np.array(x)
y_with_nan = np.array(x_with_nan)
w = np.array(w)
wmean, wmean_nan = (np.average(data, weights=w) for data in (y, y_with_nan))
# Equivalent by hand: (y * w).sum() / w.sum()
wmean, wmean_nan

(6.95, nan)

In [9]:
# Pandas: wrap the observations and the weights in Series objects.
z, w = pd.Series(x), pd.Series(w)
# np.average accepts a Series, so the weighted mean is computed from the
# pandas data z itself rather than from a separately built NumPy array.
wmean = np.average(z, weights=w)
# Pure-pandas equivalent: (z * w).sum() / w.sum()
wmean

6.95

## Sample Median

$ M = \begin{cases} 
      x_{(\frac{N-1}{2})} & if\,N\,is\,odd \\
      \frac{x_{(\frac{N-2}{2})}\,+\, x_{(\frac{N}{2})}}{2} & if\,N\,is\,even \\
   \end{cases}
$

In [23]:
def sample_median(x):
    """Return the sample median of the observations in ``x``.

    The data are sorted once. With an odd number of observations the middle
    order statistic is returned as is; with an even number the two middle
    order statistics are averaged.

    Args:
        x: Iterable of mutually comparable numbers. Any iterable is accepted
            (list, tuple, generator, ...) because it is materialised by
            ``sorted`` before its length is taken.

    Returns:
        The middle value when the number of observations is odd, otherwise
        the arithmetic mean of the two middle values.

    Raises:
        IndexError: If ``x`` contains no observations.
    """
    x_ord = sorted(x)
    n = len(x_ord)
    if n == 0:
        raise IndexError("sample_median() requires at least one observation")
    # Integer division yields the index directly, with no float round-trip.
    mid = n // 2
    if n % 2 == 1:
        return x_ord[mid]
    return (x_ord[mid - 1] + x_ord[mid]) / 2

In [24]:
sample_median([8.0, 1, 2.5, 4, 28.0])

4

In [26]:
x

[8.0, 1, 2.5, 4, 28.0]

In [27]:
#using the statistics package:
statistics.median(x)

4

In [30]:
statistics.median(x[:-1])

3.25

In [40]:
# statistics.median() does not return nan when the data contain nan values:
# the nan is counted as a sixth item, so here the two middle values 4 and 8.0
# are averaged, giving 6.0.
# NOTE(review): nan compares False against every number, so where it lands
# after sorting (and hence the result) may depend on its position in the
# input -- presumably safer to drop nan values first; confirm against the
# statistics module documentation.
statistics.median([8.0, 1, 2.5, 4, 28.0, math.nan])

6.0

In [31]:
statistics.median_low([8.0, 1, 2.5, 4, 28.0, math.nan])

4

In [32]:
statistics.median_high([8.0, 1, 2.5, 4, 28.0, math.nan])

8.0

In [36]:
y, y_with_nan

(array([ 8. ,  1. ,  2.5,  4. , 28. ]),
 array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ]))

In [33]:
# Using the NumPy package:
np.median(y)

4.0

In [35]:
np.median(y_with_nan)

nan

In [39]:
#nan is not considered as an item
np.nanmedian(y_with_nan)

4.0

In [41]:
z, z_with_nan

(0     8.0
 1     1.0
 2     2.5
 3     4.0
 4    28.0
 dtype: float64, 0     8.0
 1     1.0
 2     2.5
 3     NaN
 4     4.0
 5    28.0
 dtype: float64)

In [42]:
# Using the Pandas package: Ignores nan values by default
# Change this behavior with the optional parameter skipna
z.median()

4.0

In [43]:
z_with_nan.median()

4.0

## Sample Mode:
The sample mode is the value in the dataset that occurs most frequently.

In [35]:
u=[2,3,2,8,12]
v=[12, 15, 12, 15, 21, 15, 12]

In [54]:
tuple((v.count(item), item) for item in set(v))

((3, 12), (1, 21), (3, 15))

In [52]:
# Rank every distinct value by (frequency, value) and keep the top-ranked one;
# when frequencies tie, the larger value wins.
# Iterating over the list itself instead of its set gives the same answer.
max(set(v), key=lambda item: (v.count(item), item))

15

In [33]:
#Using the statistics library
statistics.mode(v) #Returns the first encountered one

12

In [32]:
# New feature in python 3.8, which gives a list always.
statistics.multimode(v)

[12, 15]

statistics.mode() and statistics.multimode() handle `nan` values as regular values and can return `nan` as the modal value.

In [16]:
# Using SciPy
# When we have numpy array, use mode from the scipy package
# We can specify the axis as well as nan_policy
scipy.stats.mode(u)

ModeResult(mode=array([2]), count=array([2]))

In [17]:
# If there are multiple modal values in the dataset, then only the smallest value is returned.
v=[12, 15, 12, 15, 21, 15, 12]
scipy.stats.mode(v)

ModeResult(mode=array([12]), count=array([3]))

In [31]:
# np.atleast_1d keeps the [0] lookup valid whether scipy.stats.mode returns a
# length-1 array (older SciPy releases) or a scalar (newer releases, 1-D input).
np.atleast_1d(scipy.stats.mode(v).mode)[0]

12

In [29]:
# np.atleast_1d keeps the [0] lookup valid whether scipy.stats.mode returns a
# length-1 array (older SciPy releases) or a scalar (newer releases, 1-D input).
np.atleast_1d(scipy.stats.mode(v).count)[0]

3

In [21]:
# Using Pandas: turn u and v into Series, and build w with two missing values
u = pd.Series(u)
v = pd.Series(v)
w = pd.Series([2, math.nan, math.nan])

In [22]:
u

0     2
1     3
2     2
3     8
4    12
dtype: int64

In [23]:
# returns a new pd.Series that holds all modal values.
u.mode()

0    2
dtype: int64

In [24]:
v.mode()

0    12
1    15
dtype: int64

In [25]:
#The resulting Series will be in descending order so that the first element is the most frequently-occurring element.
v.value_counts()

12    3
15    3
21    1
dtype: int64

In [26]:
w.mode()

0    2.0
dtype: float64

In [27]:
w.mode(dropna=False)

0   NaN
dtype: float64

## References:
1. Python Statistics Fundamentals: How to Describe Your Data - https://realpython.com/python-statistics/#measures-of-correlation-between-pairs-of-data
2. NPTEL Course on Data Analytics with Python by Prof. Ramesh Anbanandam - https://nptel.ac.in/courses/106/107/106107220/