In [None]:
import math # This isn't commonly used
import statistics
import numpy as np
import scipy.stats
import pandas as pd

Above are some of the common packages used to perform calculations and analysis. Below we are just creating some sample data to perform calculations on.

In [3]:
# sample data: a clean list, plus the same list with one NaN inserted in the middle
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [math.nan] + x[3:]

for sample in (x, x_with_nan):
    print(sample)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


NaN values are common in data science and data analysis. In Python you can use any of the following interchangeably to generate NaN values:
1. float('nan')
2. math.nan
3. np.nan

Note that comparing two NaN values for equality returns `False`.

In [6]:
# NaN never compares equal to anything, not even another NaN, so the result is False
float('nan') == math.nan and math.nan == np.nan

False

In [7]:
# wrap each list in a NumPy ndarray and in a pandas Series
y = np.array(x)
y_with_nan = np.array(x_with_nan)
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

## Measures of central tendency

In this tutorial we'll cover:
1. Mean
2. Weighted Mean
3. Geometric Mean
4. Harmonic Mean
5. Median
6. Mode

### Mean

In [12]:
# arithmetic mean by hand: total of the values divided by how many there are
total, count = sum(x), len(x)
mean_ = total / count
mean_

8.7

In [13]:
# same calculation using mean() from Python's built-in statistics module
mean_ = statistics.mean(x)
mean_

8.7

In [14]:
mean_ = statistics.fmean(x) # faster than mean(); always returns a floating point number
mean_

8.7

In [15]:
# with a NaN in the data, both statistics.mean() and statistics.fmean() return nan
for mean_fn in (statistics.mean, statistics.fmean):
    print(mean_fn(x_with_nan))

nan
nan


In [18]:
# NumPy offers the mean both as a module-level function and as an ndarray method
function_result = np.mean(y)
method_result = y.mean()
print(function_result)
print(method_result)

8.7
8.7


In [19]:
# in numpy you can ignore the nan values by using the np.nanmean() function
np.nanmean(y_with_nan)

8.7

In [23]:
# pandas Series also have a mean() method; it skips nan values by default
# because its skipna parameter defaults to True
for series in (z, z_with_nan):
    print(series.mean())

8.7
8.7


### Weighted Means

You can implement the weighted mean in pure Python by combining sum() with either range() or zip()

In [29]:
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]
total_weight = sum(w)

# variant 1: walk both lists by position with range()
wmean = sum(w[idx] * x[idx] for idx in range(len(x))) / total_weight
print(wmean)

# variant 2: pair each value with its weight via zip()
wmean = sum(value * weight for value, weight in zip(x, w)) / total_weight
print(wmean)

6.95
6.95


In [32]:
# numpy may be better for larger datasets
y = np.array(x)
z = pd.Series(x)
w = np.array(w)

# np.average() is called once with the ndarray and once with the Series
for data in (y, z):
    wmean = np.average(data, weights=w)
    print(wmean)

6.95
6.95


### Harmonic mean

The harmonic mean is the reciprocal of the mean of the reciprocals of all items in the dataset

In [34]:
# pure Python: number of items divided by the sum of their reciprocals
reciprocal_total = sum(1 / value for value in x)
hmean = len(x) / reciprocal_total
hmean

2.7613412228796843

In [36]:
# same result using harmonic_mean() from the built-in statistics module
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

Note that if the data contains at least one zero, `harmonic_mean()` returns 0. If it contains at least one `nan`, it returns `nan`. If it contains at least one negative number, it raises a `StatisticsError`.

In [37]:
# a NaN anywhere in the data propagates to the result
print(statistics.harmonic_mean(x_with_nan))

# a single zero forces the harmonic mean to 0
print(statistics.harmonic_mean([1, 0, 2]))

# negative values are rejected with StatisticsError; catch and print it so the
# notebook still runs from top to bottom (Restart Kernel -> Run All)
try:
    print(statistics.harmonic_mean([1, 2, -2]))
except statistics.StatisticsError as err:
    print(f"{type(err).__name__}: {err}")

nan
0


StatisticsError: harmonic mean does not support negative values

In [38]:
# using scipy.stats.hmean() on the NumPy array
scipy.stats.hmean(y)

2.7613412228796843

In [39]:
# the same scipy function called with the pandas Series
scipy.stats.hmean(z)

2.7613412228796843

### Geometric Mean

The Geometric mean is the n-th root of the product of all n elements in a list x

In [43]:
# pure Python: multiply every item together, then take the n-th root
n = len(x)
product = 1
for value in x:
    product *= value

gmean2 = product            # raw product, kept so the root can be shown a second way
gmean = product ** (1 / n)
print(gmean)
print(gmean2 ** (1 / n))

4.677885674856041
4.677885674856041


In [44]:
# using geometric_mean() from the statistics package
gmean = statistics.geometric_mean(x)
gmean

4.67788567485604

In [47]:
# returns nan if nan values are included in the data
gmean = statistics.geometric_mean(x_with_nan)
gmean

nan

In [48]:
# scipy can also calculate the geometric mean, here on the NumPy array
scipy.stats.gmean(y)

4.67788567485604

In [50]:
# the same scipy function called with the pandas Series
scipy.stats.gmean(z)

4.67788567485604

### Median