# Import modules

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Generating data with lower and upper bound

In [30]:
# Target parameters of the normal distribution: mean and standard deviation.
mu = 3
sigma = 1
def normal(mu, sigma, size, lower = -np.inf, upper = np.inf):
    """Draw normally distributed samples, optionally truncated to [lower, upper].

    Parameters
    ----------
    mu : float
        Mean of the underlying (untruncated) normal distribution.
    sigma : float
        Standard deviation of the underlying normal distribution. Must be
        positive when at least one finite bound is given.
    size : int or tuple of ints
        Number (or shape) of samples to draw.
    lower, upper : float, optional
        Inclusive bounds of the sampling interval. The defaults leave the
        distribution unbounded.

    Returns
    -------
    numpy.ndarray
        Samples of the requested size, all lying within [lower, upper].

    Raises
    ------
    ValueError
        If ``lower`` is not strictly smaller than ``upper``, or if a bound is
        given together with a non-positive ``sigma``.
    """
    # `not lower < upper` also rejects NaN bounds.
    if not lower < upper:
        raise ValueError(f"lower ({lower}) must be smaller than upper ({upper})")

    # Unbounded case: keep the plain normal draw so that results for callers
    # that never pass bounds are unchanged (same draws for the same seed).
    if np.isneginf(lower) and np.isposinf(upper):
        return np.random.normal(mu, sigma, size)

    if not sigma > 0:
        raise ValueError(f"sigma ({sigma}) must be positive for bounded sampling")

    # truncnorm expects its bounds in units of standard deviations from the mean.
    a = (lower - mu) / sigma
    b = (upper - mu) / sigma
    return stats.truncnorm.rvs(a, b, loc=mu, scale=sigma, size=size)

# Three samples of increasing size (100, 1000, 10000 points) drawn with the
# same mu and sigma. No random seed is set, so the values differ on every run.
data_1, data_2, data_3 = (
    normal(mu, sigma, n_points) for n_points in (100, 1000, 10000)
)

# Statistical analysis

In [31]:
# Arithmetic mean of each data set, in order of increasing sample size.
mean_1, mean_2, mean_3 = (np.mean(sample) for sample in (data_1, data_2, data_3))
print(
    f"mean data_1: {mean_1}",
    f"mean data_2: {mean_2}",
    f"mean data_3: {mean_3}",
    sep="\n",
)

mean data_1: 3.190543309175293
mean data_2: 3.012075782737554
mean data_3: 3.0020685075426


### The mean is quite close to 3. The data sets with more data points are closer to the desired value, which is not surprising.

In [32]:
# Median of each data set, in order of increasing sample size.
median_1, median_2, median_3 = (
    np.median(sample) for sample in (data_1, data_2, data_3)
)
print(
    f"median data_1: {median_1}",
    f"median data_2: {median_2}",
    f"median data_3: {median_3}",
    sep="\n",
)

median data_1: 3.2029473016980132
median data_2: 3.0304477474055203
median data_3: 2.99643786219648


### The median is very similar for the data sets with more data points.

In [33]:
# One ModeResult per data set; the resulting tuple is the cell's displayed value.
tuple(stats.mode(sample) for sample in (data_1, data_2, data_3))

(ModeResult(mode=0.5898639229808773, count=1),
 ModeResult(mode=-0.08497356322807637, count=1),
 ModeResult(mode=-1.040131987059909, count=1))

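### The count for the mode is 1 in all cases, because the data comes from a continuous distribution and every value is therefore unique. With no repeated values, the reported "mode" is simply the smallest value of each data set (compare with q0 below). It is lower for the data sets with more data points, because more samples reach further into the tails of the distribution.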

In [34]:
# Minimum (q0), first quartile (q1), third quartile (q3) and maximum (q4)
# of each data set.
q0_data_1, q1_data_1, q3_data_1, q4_data_1 = (
    np.quantile(data_1, level) for level in (0.0, 0.25, 0.75, 1.0)
)
q0_data_2, q1_data_2, q3_data_2, q4_data_2 = (
    np.quantile(data_2, level) for level in (0.0, 0.25, 0.75, 1.0)
)
q0_data_3, q1_data_3, q3_data_3, q4_data_3 = (
    np.quantile(data_3, level) for level in (0.0, 0.25, 0.75, 1.0)
)
print(
    f" q0 data_1: {q0_data_1}",
    f" q1 data_1: {q1_data_1}",
    f" q3 data_1: {q3_data_1}",
    f" q4 data_1: {q4_data_1}",
    f" q0 data_2: {q0_data_2}",
    f" q1 data_2: {q1_data_2}",
    f" q3 data_2: {q3_data_2}",
    f" q4 data_2: {q4_data_2}",
    f" q0 data_3: {q0_data_3}",
    f" q1 data_3: {q1_data_3}",
    f" q3 data_3: {q3_data_3}",
    f" q4 data_3: {q4_data_3}",
    sep="\n",
)

 q0 data_1: 0.5898639229808773
 q1 data_1: 2.3893228410704492
 q3 data_1: 3.814873959474063
 q4 data_1: 5.830030129616279
 q0 data_2: -0.08497356322807637
 q1 data_2: 2.3407030951403343
 q3 data_2: 3.676836771716551
 q4 data_2: 6.373507728842229
 q0 data_3: -1.040131987059909
 q1 data_3: 2.335323986902663
 q3 data_3: 3.6617376266634554
 q4 data_3: 6.578889023467152


### The main differences are in the extremes, i.e. the minimum q0 and the maximum q4. The data sets with more data points reach both lower and higher values, which is also not surprising. The quartiles q1 and q3 are much more stable.

In [35]:
# Range = maximum - minimum of each data set.
# NumPy reductions are used instead of the Python builtins max()/min(),
# which would iterate over the arrays element by element in Python.
range_1 = np.max(data_1) - np.min(data_1)
range_2 = np.max(data_2) - np.min(data_2)
range_3 = np.max(data_3) - np.min(data_3)
print(f"range data_1: {range_1}")
print(f"range data_2: {range_2}")
print(f"range data_3: {range_3}")

range data_1: 5.2401662066354024
range data_2: 6.458481292070306
range data_3: 7.619021010527061


### This is also reflected in the range. The data sets with more data points show a larger range.

In [36]:
# Interquartile range: third quartile minus first quartile, per data set.
iqr_1, iqr_2, iqr_3 = (
    upper_quartile - lower_quartile
    for lower_quartile, upper_quartile in (
        (q1_data_1, q3_data_1),
        (q1_data_2, q3_data_2),
        (q1_data_3, q3_data_3),
    )
)
print(
    f"iqr data_1: {iqr_1}",
    f"iqr data_2: {iqr_2}",
    f"iqr data_3: {iqr_3}",
    sep="\n",
)

iqr data_1: 1.4255511184036136
iqr data_2: 1.3361336765762166
iqr data_3: 1.3264136397607924


### The difference between the first and third quartile is larger for the data set with the smallest number of data points.

In [37]:
# Sample variance of each data set (ddof=1 divides by N - 1 instead of N).
variance_1, variance_2, variance_3 = (
    np.var(sample, ddof=1) for sample in (data_1, data_2, data_3)
)

In [38]:
# Report the sample variances, one line per data set.
print(
    f"variance data_1: {variance_1}",
    f"variance data_2: {variance_2}",
    f"variance data_3: {variance_3}",
    sep="\n",
)

variance data_1: 1.1738166299802788
variance data_2: 0.976367765027452
variance data_3: 0.990748354533231


### The variance for the larger data sets is closer to the desired value of $\sigma^2 = 1$.

In [39]:
# Sample standard deviation of each data set (ddof=1, matching the variance cell).
std_1, std_2, std_3 = (
    np.std(sample, ddof=1) for sample in (data_1, data_2, data_3)
)

In [40]:
# Report the sample standard deviations, one line per data set.
print(
    f"std data_1: {std_1}",
    f"std data_2: {std_2}",
    f"std data_3: {std_3}",
    sep="\n",
)

std data_1: 1.0834281840437228
std data_2: 0.9881132349217129
std data_3: 0.9953634283683679


### The same is true for the standard deviations.

In [41]:
# Sample skewness of each data set, computed with scipy.stats.skew defaults.
skew_1, skew_2, skew_3 = (
    stats.skew(sample) for sample in (data_1, data_2, data_3)
)

In [42]:
# Report the skewness values, one line per data set.
print(
    f"skew data_1: {skew_1}",
    f"skew data_2: {skew_2}",
    f"skew data_3: {skew_3}",
    sep="\n",
)

skew data_1: 0.06073146315113005
skew data_2: 0.018805108816393863
skew data_3: 0.0382279901212806


### The skew is the largest for the smallest data set, reflecting the imperfections for a small number of points.