# 1:5

A sample of 133 customers in an area was asked how many times they had shopped in a certain store in the last month.

In [1]:
import pandas as pd
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from math import sqrt

# Survey result: for each possible number of store visits (0..7),
# how many of the sampled customers reported exactly that many visits.
amount_times = list(range(8))
abs_freq = [12, 26, 38, 21, 17, 12, 5, 2]

# Column name -> values; the default 0..7 row index is kept on purpose.
table_columns = {
    "Amount Times": amount_times,
    "Absolute Frequency": abs_freq,
}
sample_table = pd.DataFrame(table_columns)

sample_table

Unnamed: 0,Amount Times,Absolute Frequency
0,0,12
1,1,26
2,2,38
3,3,21
4,4,17
5,5,12
6,6,5
7,7,2


Based on the table:

---
### a) Show Absolute and Relative Frequencies in a Bar Graph

In [2]:
# Relative frequency = share of all surveyed customers that falls into each row.
total_customers = sample_table["Absolute Frequency"].sum()
sample_table["Relative Frequency"] = sample_table["Absolute Frequency"].div(total_customers)
sample_table

Unnamed: 0,Amount Times,Absolute Frequency,Relative Frequency
0,0,12,0.090226
1,1,26,0.195489
2,2,38,0.285714
3,3,21,0.157895
4,4,17,0.12782
5,5,12,0.090226
6,6,5,0.037594
7,7,2,0.015038


In [3]:
# double checking that relative frequency was calculated correctly and adds up to 1 as expected:
relative_frequency_total = sample_table["Relative Frequency"].sum()

# Enforce the check instead of only eyeballing the output. A tolerance is used
# because a sum of floating-point fractions is not guaranteed to be exactly 1.0.
assert abs(relative_frequency_total - 1.0) < 1e-9, relative_frequency_total

relative_frequency_total

1.0

In [4]:
# plotting graphs side by side: absolute frequency on the left, relative on the right
fig = make_subplots(rows=1, cols=2)

# One bar trace per frequency column, built directly from the table
# (no intermediate plotly-express figures are needed just to read x/y back out).
for subplot_col, frequency_column in enumerate(["Absolute Frequency", "Relative Frequency"], start=1):
    fig.add_trace(
        go.Bar(
            x=sample_table["Amount Times"],
            y=sample_table[frequency_column],
            name=frequency_column,
        ),
        row=1,
        col=subplot_col,
    )
    # label each y axis so the subplot can be read without the legend
    fig.update_yaxes(title_text=frequency_column, row=1, col=subplot_col)

# both subplots share the same x variable
fig.update_xaxes(title_text="Amount Times")

fig.update_layout(title="Store Visits in 1 month", width=1100, height=300, margin=dict(l=10, t=60, b=10))

fig.show()

---
### b) Calculate Mean and Standard Deviation

In [5]:
# calculating mean manually

# (number of visits, number of customers) pairs, copied by hand from the table
visits_and_customers = [(0, 12), (1, 26), (2, 38), (3, 21), (4, 17), (5, 12), (6, 5), (7, 2)]

# total visits: every visit count weighted by how many customers reported it
# pandas equivalent: sum(sample_table["Amount Times"] * sample_table["Absolute Frequency"])
amount_visits = sum(visits * customers for visits, customers in visits_and_customers)
print(f"{amount_visits = }")

# sample size: all customers added up
# pandas equivalent: sum(sample_table["Absolute Frequency"])
amount_datapoints = sum(customers for _visits, customers in visits_and_customers)
print(f"{amount_datapoints = }")

mean = amount_visits / amount_datapoints
mean

amount_visits = 337
amount_datapoints = 133


2.5338345864661656

In [6]:
# redoing everything manually

# (number of visits, number of customers) pairs, copied by hand from the table
frequency_table = [(0, 12), (1, 26), (2, 38), (3, 21), (4, 17), (5, 12), (6, 5), (7, 2)]

# expand every row into its individual observations, e.g. (1, 26) -> [1, 1, ..., 1]
groups = [[visits] * customers for visits, customers in frequency_table]
x0, x1, x2, x3, x4, x5, x6, x7 = groups  # per-value lists stay available by name
datapoints = [point for group in groups for point in group]

# --- amount of datapoints in sample ---
# using pandas: sum(sample_table["Absolute Frequency"])
# manually: 12 + 26 + 38 + 21 + 17 + 12 + 5 + 2
n = len(datapoints)
print(f"{n = }")

# --- calculating mean ---
# using pandas: sum(sample_table["Amount Times"] * sample_table["Absolute Frequency"]) / n
# manually: (0 * 12 + 1 * 26 + ... + 7 * 2) / n
mean = sum(datapoints) / len(datapoints)
print(f"{mean = }")

# --- calculating variance ---
# starting with the sum of squared deviations from the mean
# definition:    sum((datapoint - mean)**2 for datapoint in datapoints)
# shortcut used: sum(datapoint**2 for datapoint in datapoints) - n * mean**2
# manually:      (12 * 0**2 + ... + 2 * 7**2) - n * mean**2
# Taken from the frequency table rather than from xN[0], so a value with
# zero customers would contribute 0 instead of raising IndexError.
sum_of_squares = sum(customers * visits**2 for visits, customers in frequency_table)
count = sum_of_squares - n * mean**2  # subtract mean**2 n times at the end instead of doing it for every x

print(f"{count = }")

# then dividing to get variance
variance = count / (n - 1)  # n - 1 because it's a sample (Bessel's correction)
print(f"{variance = }")

# --- calculating standard deviation ---
# which is the square root of variance
std = sqrt(variance)
std

n = 133
mean = 2.5338345864661656
count = 363.0977443609021
variance = 2.7507404875825916


1.6585356455568243