# Importing libraries and initializing data

In [1]:
import matplotlib.pyplot as plt

# Three fixed samples of 20 integers each, with values between 20 and 50 (inclusive).
# They are hard-coded literals, so the results computed from them are the same on every run.
# NOTE(review): arr1 is not referenced by the cells shown in this section — confirm it is still needed.
arr1 = [29, 42, 30, 44, 48, 35, 27, 26, 49, 30, 38, 44, 42, 50, 35, 34, 25, 28, 31, 33]
arr2 = [20, 30, 41, 39, 38, 27, 22, 48, 44, 24, 45, 49, 33, 40, 46, 31, 29, 29, 50, 38]
arr3 = [49, 37, 23, 50, 44, 27, 43, 39, 32, 22, 25, 41, 48, 29, 44, 27, 41, 38, 42, 30]

# Provenance: the lists were generated with the snippet below
# (random.randint includes both endpoints, hence the 20-50 range).
# import random
# random_numbers = [random.randint(20, 50) for _ in range(20)]
# print(random_numbers)

# Standardisation (Z-score)
$$x'=\frac{x-\mu}{\sigma}$$
- $\mu=$ Mean of the dataset.
- $\sigma=$ Standard deviation of the dataset.

Standardisation is a variation of scaling in which each value is expressed as the number of standard deviations it lies away from the mean.<br/>
The standardised distribution always has mean$=0$ and standard-deviation$=1$. It’s useful when there are a few outliers, but not so extreme that you need clipping.
# Normalisation (Min-max scaling)
$$x' = \frac{x-x_{min}}{x_{max}-x_{min}}$$
Normalisation is a good choice when both of the following conditions are met:
- You know the approximate upper and lower bounds on your data with few or no outliers.
- Your data is approximately uniformly distributed across that range.

A good example is age. Most age values fall between 0 and 90, and every part of the range has a substantial number of people.
In contrast, you would not use min-max scaling on income, because only a few people have very high incomes. The upper bound of the linear scale for income would be very high, and most people would be squeezed into a small part of the scale.

In [2]:
def mean(arr):
    """Return the arithmetic mean of ``arr``.

    Args:
        arr: A non-empty sequence of numbers.

    Returns:
        ``sum(arr) / len(arr)``.

    Raises:
        ZeroDivisionError: If ``arr`` is empty.
    """
    return sum(arr) / len(arr)

def variance(dataset):
    """Return the sample variance of ``dataset``.

    The sum of squared deviations from the mean is divided by ``n - 1``
    (not ``n``), where ``n`` is the number of values.

    Args:
        dataset: A sequence of at least two numbers.

    Returns:
        The sample variance as a float.

    Raises:
        ZeroDivisionError: If ``dataset`` has fewer than two values.
    """
    average = mean(dataset)
    squared_deviations = sum((n - average) ** 2 for n in dataset)
    return squared_deviations / (len(dataset) - 1)

def standardize(dataset):
    """Return the z-scores of ``dataset``: ``(x - mean) / standard deviation``.

    The standard deviation is the square root of the sample variance
    computed by ``variance`` (``n - 1`` denominator).

    Args:
        dataset: A sequence of at least two numbers.

    Returns:
        A new list with one z-score per input value. If every value is
        identical the standard deviation is 0 and the z-score is undefined;
        a list of ``0.0`` is returned in that case instead of dividing by zero.

    Raises:
        ZeroDivisionError: If ``dataset`` has fewer than two values.
    """
    average = mean(dataset)
    # Reuse variance() rather than repeating its formula here.
    standard_deviation = variance(dataset) ** 0.5
    if standard_deviation == 0:
        # Constant data: every point sits exactly on the mean.
        return [0.0 for _ in dataset]
    return [(n - average) / standard_deviation for n in dataset]

def normalize(arr):
    """Min-max scale ``arr`` linearly so its minimum maps to 0 and its maximum to 1.

    Each value ``x`` becomes ``(x - min) / (max - min)``.

    Args:
        arr: A non-empty sequence of numbers.

    Returns:
        A new list of floats, one per input value. If every value is
        identical (``max == min``) the formula would divide by zero, so a
        list of ``0.0`` is returned instead.

    Raises:
        ValueError: If ``arr`` is empty (raised by ``min``).
    """
    val_min = min(arr)
    val_max = max(arr)
    # Deliberately not called ``range``: that name would shadow the builtin.
    span = val_max - val_min
    if span == 0:
        # Constant data: map every value to the lower bound of the scale.
        return [0.0 for _ in arr]
    return [(i - val_min) / span for i in arr]

In [3]:
# Min-max scale both series; each result lies within [0, 1].
normal_arr2, normal_arr3 = (normalize(series) for series in (arr2, arr3))

# Z-score both series.
standard_arr2, standard_arr3 = (standardize(series) for series in (arr2, arr3))

# Sanity check: after z-scoring, each series should report mean 0 and
# variance 1 once rounded to three decimals.
print(f"Mean of standardised array-2: {round(mean(standard_arr2),3)}")
print(f"Variance of standardised array-2: {round(variance(standard_arr2),3)}\n")

print(f"Mean of standardised array-3: {round(mean(standard_arr3),3)}")
print(f"Variance of standardised array-3: {round(variance(standard_arr3),3)}")

Mean of standardised array-2: 0.0
Variance of standardised array-2: 1.0

Mean of standardised array-3: 0.0
Variance of standardised array-3: 1.0


In [4]:
def _save_scatter(x, y, path, color, xlabel, ylabel, size=20):
    """Draw a scatter plot of ``y`` against ``x`` and save it to ``path``.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis (same length as ``x``).
        path: Output image file name.
        color: Marker colour.
        xlabel: Label for the horizontal axis.
        ylabel: Label for the vertical axis.
        size: Marker size passed to ``scatter`` as ``s``.
    """
    fig, ax = plt.subplots()
    ax.scatter(x, y, s=size, color=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(path)
    # Close this specific figure so it is neither shown inline nor kept in memory.
    plt.close(fig)

# figure 1: the raw data
_save_scatter(arr2, arr3, 'fig1.png', 'red', 'arr2', 'arr3')

# figure 2: min-max normalised data
_save_scatter(normal_arr2, normal_arr3, 'fig2.png', 'blue',
              'arr2 (normalised)', 'arr3 (normalised)')

# figure 3: z-score standardised data
_save_scatter(standard_arr2, standard_arr3, 'fig3.png', 'blue',
              'arr2 (standardised)', 'arr3 (standardised)')

# Normalisation output
|Original Graph|Normalised Graph|
|-|-|
|![](fig1.png)|![](fig2.png)|
# Standardisation output
|Original Graph|Standardised Graph|
|-|-|
|![](fig1.png)|![](fig3.png)|
***