# Chapter 2: Everything you ever wanted to know about statistics


## Self-test Answers
**Page 40:**
1.
We came across some data about the number of friends that 11 people had on Facebook (22, 40, 53, 57, 93, 98, 103, 108, 116, 121, 252). We calculated the mean for these data as 96.64. Now calculate the sums of squares, variance and standard deviation.

In [48]:
# Manually computed solution
data = [22, 40, 53, 57, 93, 98, 103, 108, 116, 121, 252]
   
   # sum of squared errors (SSE)
print('Manual version')

mean = round(sum(data) / len(data), 2) # 96.64
print(f"Mean: {mean}")
sse = 0
for num in data:
    sqrt_error = (num - mean) * (num - mean)
    sse += sqrt_error
print(f"The mean is usually off by: {round(sse, 2)} squared friends")

    # Variance
variance = sse / (len(data) - 1) # divided by the degrees of freedom N-1
print(f"On average, the mean is off by: {round(variance, 2)} friends squared")

    # Standard deviation 
std_dev = variance ** 0.5 # get the sqrt to make it easier to read
print(f"On average, the mean is off by: {round(std_dev, 2)} friends")


# using pandas
import pandas as pd
import numpy as np

df = pd.DataFrame({'friends': data})
df['errors'] = df['friends'] - df['friends'].mean()

print('\n')
print('Pandas version')
print(f"Mean: {df['friends'].mean().round(2)}")
print(f"The mean is usually off by: {round(sum(df['errors'] ** 2),2)} squared friends")
print(f"On average, the mean is off by: {round(sum(df['errors'] ** 2) / (len(df['friends'])-1),2)} friends squared")
print(f"On average, the mean is off by: {round(np.sqrt(sum(df['errors'] ** 2) / (len(df['friends'])-1)),2)} friends")


Manual version
Mean: 96.64
The mean is usually off by: 37544.55 squared friends
On average, the mean is off by: 3754.45 friends squared
On average, the mean is off by: 61.27 friends


Pandas version
Mean: 96.64
The mean is usually off by: 37544.55 squared friends
On average, the mean is off by: 3754.45 friends squared
On average, the mean is off by: 61.27 friends


**Page 40** 2. Calculate these values again but excluding the outlier (252)

In [49]:
# detect outliers
def return_outlier(column):
    avg = column.mean()
    sd = column.std()
    low = avg - sd*2
    high = avg + sd*2
    return low, high

low, high = return_outlier(df['friends'])
df = df[(df['friends'] >= low) & (df['friends'] <= high)]
df['errors'] = df['friends'] - df['friends'].mean()
print('Pandas version, outliers removed')
print(f"Mean: {df['friends'].mean().round(2)}")
print(f"The mean is usually off by: {round(sum(df['errors'] ** 2),2)} squared friends")
print(f"On average, the mean is off by: {round(sum(df['errors'] ** 2) / (len(df['friends'])-1),2)} friends squared")
print(f"On average, the mean is off by: {round(np.sqrt(sum(df['errors'] ** 2) / (len(df['friends'])-1)),2)} friends")


Pandas version, outliers removed
Mean: 81.1
The mean is usually off by: 10992.9 squared friends
On average, the mean is off by: 1221.43 friends squared
On average, the mean is off by: 34.95 friends


Calculate the range but excluding the score of 252

In [56]:
# base python version
def get_min(data):
    min = data[0]
    for i in data:
        if i < min:
            min = i    
    return min

def get_max(data):
    max = data[0]
    for i in data:
        if i > max:
            max = i
    return max

def get_range(data):
    min = get_min(data)
    max = get_max(data)
    return max - min

print('Base python version')
print('min:', get_min(df['friends']))
print('max:', get_max(df['friends']))
print('range:', get_range(df['friends']))

# Pandas version
print('\n')
print('Pandas version')
print('min:', df['friends'].min())
print('max:', df['friends'].max())
print('range:', df['friends'].max() - df['friends'].min())

Base python version
min: 22
max: 121
range: 99


Pandas version
min: 22
max: 121
range: 99


### Page 47
In section 1.7.2.2 we came across some data about the number of friends that 11 people had on Facebook.
We calculated the mean for these data as 96.64 and standard deviation as 61.27. Calculate a 95% confidence interval for this mean.

Recalculate the confidence interval assuming that the sample size was 56.

In [62]:
# SE = s / sqrt(N)
se = 61.27 / 11 ** 0.5

low = round(96.64 - (se * 2.23),2)
high = round(96.64 + (se * 2.23),2)

print('low: ', low, "high: ", high)

se = 61.27 / 56 ** 0.5
low = round(96.64 - (se * 1.96),2)
high = round(96.64 + (se * 1.96),2)

print('low: ', low, "high: ", high)

low:  55.44 high:  137.84
low:  80.59 high:  112.69


Task 4: In Chapter 1 we used an example of the time taken for 21 heavy smokers to fall off a treadmill at the fastest setting (18, 16, 18, 24, 23, 22, 22, 23, 26, 29, 32, 34, 34, 36, 36, 43, 42, 49, 46, 46, 57). Calculate the sums of squares, variance, standard deviation, standard error and 95% confidence interval of these data. 1

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Load the data as an array

In [3]:
times = np.array(
    [18, 16, 18, 24, 23, 22, 22, 23, 26, 29, 32, 34, 34, 36, 36, 43, 42, 49, 46, 46, 57]
)

Mean, Mode and Median

In [9]:
def my_mean(arr):
    return round(sum(arr) / len(arr),2)

def my_median(arr):
    arr.sort()
    n = len(arr)
    if n % 2 == 0:
        return round((arr[n//2 - 1] + arr[n//2]) / 2,2)
    else:
        return arr[n//2]

from collections import Counter
def my_mode(arr):
    count = Counter(arr)
    return count.most_common(1)[0][0]

print("mean:", my_mean(times), "minutes")
print("mode:", my_mode(times), "minutes")
print("median:", my_median(times), "minutes")

mean: 32.19 minutes
mode: 18 minutes
median: 32 minutes


Sums of squares, variance, standard deviation, standard error and 95% confidence interval

In [14]:
def total_error(data):
    mean = my_mean(data)
    total_error = 0
    for i in data:
        total_error += i - mean
    return total_error


def my_sse(data):
    mean = my_mean(data)
    sse = 0
    for i in data:
        sse += (i - mean) ** 2
    return sse


def my_variance(data):
    sse = my_sse(data)
    return sse**0.5


def my_sd(data):
    return round(my_variance(data) / (len(data) - 1), 2)  # for sample sd

def my_se(data):
    return round((my_variance(data) / (len(data))) ** 0.5, 2)  # for sample sd


def confidence_interval(data):
    mean = my_mean(data)
    se = my_se(data)
    z = 1.96  # for 95% confidence interval
    margin_of_error = z * se
    lower_bound = round(mean - margin_of_error, 2)
    upper_bound = round(mean + margin_of_error, 2)
    return (lower_bound, upper_bound)

In [18]:
print("Sum of Squared Errors", round(my_sse(times), 2), "minutes squared")
print("Variance", round(my_variance(times), 2), "minutes")
print("Standard Deviation", my_sd(times), "minutes")
print("Standard Error", my_se(times), "minutes")
print("mean:", my_mean(times), "minutes")
print("95% Confidence Interval:", confidence_interval(times),"minutes")

Sum of Squared Errors 2685.24 minutes squared
Variance 51.82 minutes
Standard Deviation 2.59 minutes
Standard Error 1.57 minutes
mean: 32.19 minutes
95% Confidence Interval: (29.11, 35.27) minutes


Normalized data, z-score calucation