In [6]:
from scipy.stats import norm
from math import sqrt

The following code has been adapted from Thomas Nield's book *Essential Math for Data Science*. The case study estimates the population mean weight of golden retrievers from a sample of 31 dogs (a sample size large enough to rely on the central limit theorem).

# Golden Retriever Weight: CDF, Z-Values, and Confidence Intervals

In [10]:
# Probability that a golden retriever weighs between 62 and 66 pounds (about 49.2%).

mean, std_dev = 64.43, 2.99

# Area under the normal curve between the two weights: CDF(66) - CDF(62).
x = norm.cdf(66, loc=mean, scale=std_dev) - norm.cdf(62, loc=mean, scale=std_dev)
x

0.4920450147062894

In [11]:
# First, determine the critical z-values of the standard normal distribution.

def critical_z_value(p):
    """Return the (lower, upper) critical z-values enclosing central area ``p``.

    The bounds are the standard-normal quantiles that leave ``(1 - p) / 2``
    of the probability in each tail.

    Args:
        p: Confidence level as a probability between 0 and 1 (e.g. 0.95).

    Returns:
        A ``(lower, upper)`` tuple of z-values.

    Raises:
        ValueError: If ``p`` is not within [0, 1] (including NaN).
    """
    # Without this guard, ppf() silently returns NaN for an invalid
    # probability and the NaN propagates into any interval built from it.
    if not 0.0 <= p <= 1.0:
        raise ValueError("p must be a probability between 0 and 1, got {!r}".format(p))

    norm_dist = norm(loc=0.0, scale=1.0)
    left_tail_area = (1.0 - p) / 2.0
    upper_area = 1.0 - left_tail_area
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)

print(critical_z_value(p=.95))

(-1.959963984540054, 1.959963984540054)


In [12]:
# Next, use the critical z-values to build a confidence interval around the sample mean.

def confidence_interval(p, sample_mean, sample_std, n):
    """Return the (lower, upper) bounds of the confidence interval for the mean.

    Args:
        p: Confidence level passed to ``critical_z_value`` (e.g. 0.95).
        sample_mean: Mean of the sample.
        sample_std: Standard deviation of the sample.
        n: Number of observations in the sample.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple centred on ``sample_mean``.
    """
    z_low, z_high = critical_z_value(p)
    # Standard error of the mean, shared by both margins.
    standard_error = sample_std / sqrt(n)
    margin_low = z_low * standard_error
    margin_high = z_high * standard_error
    return sample_mean + margin_low, sample_mean + margin_high

print(confidence_interval(p=.95, sample_mean=64.43, sample_std=2.99, n=31))

(63.3774604290193, 65.4825395709807)


Based on the sample of 31 golden retriever weights with a sample mean of 64.43 and a sample standard deviation of 2.99, I am 95% confident that the population mean lies between 63.4 and 65.5 pounds.

# Drug Trial: P-values and One-Tailed Test

Null Hypothesis H0: new drug has no effect on recovery

H1: the drug succeeded in lowering the duration of a cold

In [13]:
# Baseline for the one-tailed test: recovery time from a cold is modelled as
# normal with a mean of 18 days and a standard deviation of 1.5 days.
# Probability that recovery takes between 15 and 21 days:

mean, std_dev = 18, 1.5

x = norm.cdf(21, loc=mean, scale=std_dev) - norm.cdf(15, loc=mean, scale=std_dev)
x

0.9544997361036416

In [14]:
# The new drug's test group averaged 16 days of recovery.
# Find the recovery time below which only 5% of the baseline distribution
# falls: the critical value for the one-tailed test.

mean, std_dev = 18, 1.5

x = norm.ppf(0.05, loc=mean, scale=std_dev)
x

15.53271955957279

If the treated group averages 15.53 days of recovery or fewer, the drug's effect is considered statistically significant. However, our sample mean recovery time is 16 days, so the significance test fails.

In [15]:
# One-tailed p-value: probability of a recovery time of 16 days or fewer
# under the baseline distribution (uses `mean` and `std_dev` from the previous cell).
p_value = norm.cdf(16, loc=mean, scale=std_dev)
p_value

0.09121121972586788

Since the p-value (0.0912) is greater than the statistical significance threshold of 0.05, we do not consider the drug trial a success. We fail to reject the null hypothesis.

# Z-Score to X Value

In [None]:
# Helper functions to convert an x value to a z-score and a z-score back to an x value

def z_score(x, mean, std):
    """Return the number of standard deviations ``x`` lies from ``mean``.

    Args:
        x: Observed value.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        ``(x - mean) / std``.
    """
    deviation = x - mean
    return deviation / std

def z_to_x(z, mean, std):
    """Convert a z-score back to a value on the original scale.

    Args:
        z: Z-score to convert.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        ``z * std + mean``.
    """
    offset = z * std
    return offset + mean

# Round-trip example: convert an x value to its z-score, then back to x.
mean, std_dev, x = 140000, 3000, 150000

z = z_score(x, mean, std_dev)
back_to_x = z_to_x(z, mean, std_dev)

# One print call, one result per line.
print(
    "Z-score: {}".format(z),
    "Back to X: {}".format(back_to_x),
    sep="\n",
)