# Z-Score

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

## Generate Sample Data

In [3]:
# Sample data as a plain Python list: ten evenly spaced integers, 12 through 39 in steps of 3
data = list(range(12, 40, 3))

In [4]:
# Wrap the list in a pandas Series so mean()/std() can be called on it directly
series = pd.Series(data=data)

## Generate with scipy

In [20]:
# Reference result: scipy.stats.zscore is the function the rest of this notebook
# recreates by hand. `stats` is imported in the import cell at the top of the notebook.
sci_data = np.array(data)
# ddof=0 is scipy's default (population standard deviation). It is spelled out here
# because pandas' Series.std() defaults to ddof=1, which is why the two disagree later.
zscores = stats.zscore(sci_data, ddof=0)
print("Z-scores:", zscores)

Z-scores: [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
  0.52223297  0.87038828  1.21854359  1.5666989 ]


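## The Formula

$$z = \frac{x - \mu}{\sigma}$$

where $x$ is a data point, $\mu$ is the mean of the data and $\sigma$ is its standard deviation.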

### Calculate the Mean and Standard Deviation

In [29]:
# Centre of the data
mean = series.mean()

# Series.std() applies Bessel's correction by default (ddof=1), so it returns the
# *sample* standard deviation, whereas scipy.stats.zscore uses the *population*
# standard deviation (ddof=0).
sample_std_dev = series.std(ddof=1)  # ddof written out explicitly
std_dev = series.std()  # default call; same value as sample_std_dev

In [30]:
# Show the mean next to both spellings of the sample standard deviation
print(
    f"mean: {mean}, std_dev: {std_dev}, sample_std_dev: {sample_std_dev}"
)

mean: 25.5, std_dev: 9.082951062292475, sample_std_dev: 9.082951062292475


In [21]:
# Population standard deviation: ddof=0 divides by N instead of N - 1
pop_std_dev = series.std(ddof=0)
print(
    f"Population standard deviation: {pop_std_dev}"
)

Population standard deviation: 8.616843969807043


### Calculate the Z-Score

In [None]:
# z-score relative to the *sample* standard deviation: when the notebook is run
# top to bottom, std_dev at this point is still the ddof=1 value from series.std()
zscore = series.sub(mean).div(std_dev)
print(f"zscore: {zscore}")

zscore: 0   -1.486301
1   -1.156012
2   -0.825723
3   -0.495434
4   -0.165145
5    0.165145
6    0.495434
7    0.825723
8    1.156012
9    1.486301
dtype: float64


In [31]:
# z-score relative to the *population* standard deviation (ddof=0);
# this is the variant that should agree with scipy.stats.zscore
zscore_pop = series.sub(mean).div(pop_std_dev)
print(f"zscore using population std dev: {zscore_pop}")

zscore using population std dev: 0   -1.566699
1   -1.218544
2   -0.870388
3   -0.522233
4   -0.174078
5    0.174078
6    0.522233
7    0.870388
8    1.218544
9    1.566699
dtype: float64


### Manually Calculate the Z-Score

In [10]:
# Mean from first principles: add the values up and divide by how many there are.
# Note: this rebinds `mean`, which was previously assigned from series.mean().
mean = sum(series) / series.size
print(f"mean: {mean}")

mean: 25.5


In [11]:
# Deviation of every observation from the mean
differences = [value - mean for value in series]
print(f"differences: {differences}")

differences: [-13.5, -10.5, -7.5, -4.5, -1.5, 1.5, 4.5, 7.5, 10.5, 13.5]


In [22]:
# Population standard deviation by hand: square root of the mean squared deviation
# (the sum is divided by N, not N - 1). The printed value agrees with the pandas
# result from series.std(ddof=0) shown earlier in the notebook.
# Note: this rebinds `std_dev`, which previously held the sample (ddof=1) value.
std_dev = (sum(diff ** 2 for diff in differences) / len(series)) ** 0.5
print(f"std_dev: {std_dev}")

std_dev: 8.616843969807043


In [18]:
# Manual z-scores: each observation's deviation from the mean, scaled by std_dev
# (the hand-computed population value when the notebook is run top to bottom).
# Note: this rebinds `zscores`, which held the scipy result earlier in the notebook.
zscores = [(value - mean) / std_dev for value in series]
print(f"zscores: {zscores}")

zscores: [-1.5666989036012806, -1.2185435916898848, -0.8703882797784892, -0.5222329678670935, -0.17407765595569785, 0.17407765595569785, 0.5222329678670935, 0.8703882797784892, 1.2185435916898848, 1.5666989036012806]
