# Import modules

In [197]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Generating data

In [198]:
# Parameters of the target normal distribution shared by all data sets.
mu, sigma = 3, 1


def normal(mu, sigma, size, lower=-np.inf, upper=np.inf):
    """Draw samples from a normal distribution, optionally truncated.

    Parameters
    ----------
    mu : float
        Mean of the (untruncated) normal distribution.
    sigma : float
        Standard deviation of the (untruncated) normal distribution.
    size : int or tuple of ints
        Output shape, forwarded to the underlying sampler.
    lower, upper : float, optional
        Bounds of the interval the samples are restricted to. With the
        defaults (-inf, +inf) the distribution is not truncated.

    Returns
    -------
    numpy.ndarray
        The drawn samples.

    Raises
    ------
    ValueError
        If ``lower`` is not strictly smaller than ``upper``.
    """
    if not lower < upper:
        raise ValueError(
            f"lower ({lower}) must be strictly smaller than upper ({upper})"
        )

    # Unbounded case: sample exactly as before so existing calls consume the
    # global random stream in the same way.
    if np.isneginf(lower) and np.isposinf(upper):
        return np.random.normal(mu, sigma, size)

    # truncnorm expects its bounds in units of standard deviations from the mean.
    a = (lower - mu) / sigma
    b = (upper - mu) / sigma
    return stats.truncnorm.rvs(a, b, loc=mu, scale=sigma, size=size)


# NOTE(review): no random seed is set in this section, so the samples (and
# every statistic derived from them) differ between runs.
data_1 = normal(mu, sigma, 100)
data_2 = normal(mu, sigma, 1000)
data_3 = normal(mu, sigma, 10000)

# Statistical analysis

In [199]:
# Sample mean of each data set, in order of increasing sample size.
mean_1, mean_2, mean_3 = (
    np.mean(samples) for samples in (data_1, data_2, data_3)
)
print(
    f"mean data_1: {mean_1}",
    f"mean data_2: {mean_2}",
    f"mean data_3: {mean_3}",
    sep="\n",
)

mean data_1: 2.8718457735966982
mean data_2: 2.9625094661157307
mean data_3: 2.999897055674959


### The mean is quite close to 3. The data sets with more data points are closer to the desired value, which is not surprising.

In [200]:
# Sample median of each data set, in order of increasing sample size.
median_1, median_2, median_3 = map(np.median, (data_1, data_2, data_3))
print(
    f"median data_1: {median_1}",
    f"median data_2: {median_2}",
    f"median data_3: {median_3}",
    sep="\n",
)

median data_1: 2.838053640887441
median data_2: 2.9431884733225475
median data_3: 2.9888404299754363


### The medians of the two larger data sets are very similar to each other and closer to the expected value of 3 than the median of the smallest data set.

In [201]:
# Mode of each data set; the resulting tuple is displayed as the cell output.
tuple(stats.mode(samples) for samples in (data_1, data_2, data_3))

(ModeResult(mode=-0.03193041151745968, count=1),
 ModeResult(mode=-0.045927166799357266, count=1),
 ModeResult(mode=-0.5923074772558081, count=1))

### The count for the mode is 1 in all cases because the data come from a continuous distribution, so every value is unique; `stats.mode` then reports the smallest value, i.e. the minimum of each data set. The larger data sets have lower minima because more samples reach further into the tails of the distribution.

In [202]:
# Minimum (q0), first quartile (q1), third quartile (q3) and maximum (q4) of
# each data set, obtained with one vectorised quantile call per data set.
q0_data_1, q1_data_1, q3_data_1, q4_data_1 = np.quantile(data_1, [0.0, 0.25, 0.75, 1.0])
q0_data_2, q1_data_2, q3_data_2, q4_data_2 = np.quantile(data_2, [0.0, 0.25, 0.75, 1.0])
q0_data_3, q1_data_3, q3_data_3, q4_data_3 = np.quantile(data_3, [0.0, 0.25, 0.75, 1.0])

print(
    f" q0 data_1: {q0_data_1}",
    f" q1 data_1: {q1_data_1}",
    f" q3 data_1: {q3_data_1}",
    f" q4 data_1: {q4_data_1}",
    f" q0 data_2: {q0_data_2}",
    f" q1 data_2: {q1_data_2}",
    f" q3 data_2: {q3_data_2}",
    f" q4 data_2: {q4_data_2}",
    f" q0 data_3: {q0_data_3}",
    f" q1 data_3: {q1_data_3}",
    f" q3 data_3: {q3_data_3}",
    f" q4 data_3: {q4_data_3}",
    sep="\n",
)

 q0 data_1: -0.03193041151745968
 q1 data_1: 2.2279452197525904
 q3 data_1: 3.56370059602341
 q4 data_1: 5.5520035252817195
 q0 data_2: -0.045927166799357266
 q1 data_2: 2.319556398799892
 q3 data_2: 3.6166793486722892
 q4 data_2: 6.575192289021015
 q0 data_3: -0.5923074772558081
 q1 data_3: 2.3257090283730433
 q3 data_3: 3.667896614667841
 q4 data_3: 6.958725219854904


### The main difference is the value of the maximum q4. The data sets with more data points reach higher values, which is also not surprising.

In [203]:
# Range (maximum minus minimum) of each data set. np.ptp ("peak to peak")
# computes this with vectorised reductions instead of the Python builtins
# max()/min(), which iterate over the array one element at a time.
range_1 = np.ptp(data_1)
range_2 = np.ptp(data_2)
range_3 = np.ptp(data_3)
print(f"range data_1: {range_1}")
print(f"range data_2: {range_2}")
print(f"range data_3: {range_3}")

range data_1: 5.583933936799179
range data_2: 6.621119455820372
range data_3: 7.5510326971107125


### This is also reflected in the range. The data sets with more data points show a larger range.

In [204]:
# Interquartile range: third quartile minus first quartile, per data set.
iqr_1, iqr_2, iqr_3 = (
    third - first
    for first, third in (
        (q1_data_1, q3_data_1),
        (q1_data_2, q3_data_2),
        (q1_data_3, q3_data_3),
    )
)
print(
    f"iqr data_1: {iqr_1}",
    f"iqr data_2: {iqr_2}",
    f"iqr data_3: {iqr_3}",
    sep="\n",
)

iqr data_1: 1.3357553762708196
iqr data_2: 1.297122949872397
iqr data_3: 1.3421875862947976


### The difference between the third and first quartile is quite similar for the three data sets.

In [205]:
# Sample variance (ddof=1, i.e. divided by n - 1) of each data set.
variance_1, variance_2, variance_3 = (
    np.var(samples, ddof=1) for samples in (data_1, data_2, data_3)
)

In [206]:
# Report the sample variances, one line per data set.
print(
    f"variance data_1: {variance_1}",
    f"variance data_2: {variance_2}",
    f"variance data_3: {variance_3}",
    sep="\n",
)

variance data_1: 1.1576254467489204
variance data_2: 0.9643114874538787
variance data_3: 0.9936064737706433


### The variance of the larger data sets is closer to the desired value of σ² = 1.

In [207]:
# Sample standard deviation (ddof=1, i.e. based on n - 1) of each data set.
std_1, std_2, std_3 = (
    np.std(samples, ddof=1) for samples in (data_1, data_2, data_3)
)

In [208]:
# Report the sample standard deviations, one line per data set.
print(
    f"std data_1: {std_1}",
    f"std data_2: {std_2}",
    f"std data_3: {std_3}",
    sep="\n",
)

std data_1: 1.0759300380363588
std data_2: 0.9819936290291698
std data_3: 0.9967981108382195


### The same is true for the standard deviations.

In [209]:
# Skewness of each data set, using scipy's default settings.
skew_1, skew_2, skew_3 = map(stats.skew, (data_1, data_2, data_3))

In [210]:
# Report the skewness values, one line per data set.
print(
    f"skew data_1: {skew_1}",
    f"skew data_2: {skew_2}",
    f"skew data_3: {skew_3}",
    sep="\n",
)

skew data_1: -0.12519702215626924
skew data_2: 0.05948190629790866
skew data_3: 0.04674485836843171


### The skew is largest in magnitude for the smallest data set, reflecting the sampling noise that comes with a small number of points; for a normal distribution the expected skew is 0.