In [None]:
# Python Standard Library
import math
import sys

# 3rd party, might need to conda or pip install
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from scipy.stats import norm, zscore
import scipy
import seaborn as sns

# Report the library versions in use so results can be reproduced in a matching environment.
print("matplotlib:", matplotlib.__version__)
print("numpy:", np.__version__)
print("scipy:", scipy.__version__)
print("seaborn:", sns.__version__)
print("pandas:", pd.__version__)

In [None]:
print("python itself:", sys.version)
# Make modules in the local ./python subfolder importable.  The membership
# check keeps the cell idempotent: re-running it does not append the same
# entry to sys.path again.
if "./python" not in sys.path:
    sys.path.append("./python")  # local subfolder

In [None]:
from statsfun import pascal

## Pascal's Triangle

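The rows of Pascal's Triangle each form a binomial distribution: dividing the entries of row $n$ by their sum, $2^n$, gives the probabilities of a binomial distribution with $n$ trials and $p = 0.5$.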

In [None]:
# Pick a row of Pascal's Triangle; `row` and `data` are reused by the plotting cell below.
# NOTE(review): pascal() presumably returns the coefficients of that row — confirm against statsfun.
row  = 15
data = pascal(row)
data

In [None]:
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline

In [None]:
# Show the same row of Pascal's Triangle twice, side by side: as a line plot
# and as a bar chart, sharing one y-axis so the heights are directly comparable.
fig = plt.figure(figsize=(10, 5))

# Left panel: line plot.  The subplot position is given as integers
# (rows, columns, index); string specs such as '121' are deprecated since
# Matplotlib 3.3.
ax1 = fig.add_subplot(1, 2, 1)
ax1.set_facecolor('black')
ax1.set_title("Row {} of Pascal's Triangle".format(row))
ax1.set_xlabel('Index of Coefficient')
ax1.set_ylabel('Value')
ax1.plot(data, color="orange")

# Right panel: bar chart drawn on the same y-scale as the left panel.
ax2 = fig.add_subplot(1, 2, 2, sharey=ax1)
ax2.set_facecolor('black')
# The shared y-axis is already labelled on the left, so hide the duplicate tick labels.
plt.setp(ax2.get_yticklabels(), visible=False)
ax2.set_title("Bar View")
ax2.set_xlabel('Index of Coefficient')
ax2.bar(range(len(data)), data, color="orange")

fig.tight_layout()
plt.show()

## Standard Normal Distribution

In [None]:
# Standard normal curve: mean 0, standard deviation 1.
# The total area under a pdf equals 1.
domain = np.linspace(start=-4, stop=4, num=1000)  # 1000 evenly spaced points, both endpoints included
y = norm.pdf(domain, loc=0, scale=1)              # norm comes from scipy.stats
plt.plot(domain, y);

## IQ Data

IQ data is often used to teach normal distribution concepts, because IQ scores are scaled to have specific characteristics.  The mean is defined to be 100 and the standard deviation to be 15 points, meaning every 15 points away from 100 represents one standard deviation, or one unit of z-score.

The z-score expresses a score in z-units, i.e. standard deviation units.  The probability of having a z-score of 3 or above is the same as the probability of having an IQ score of 100 + 15 + 15 + 15 = 145 or above.

Find the cumulative probability by calling `norm.cdf` with the actual IQ score, given a normal distribution with $\mu$ = 100 and $\sigma$ = 15.

In [None]:
# IQ bell curve.  The text defines IQ with mean 100 and standard deviation 15,
# so the pdf must use sigma = 15 (not 5) to match it.
domain = np.linspace(0, 200, 1000)
y = norm.pdf(domain, 100, 15)
plt.plot(domain, y);

In [None]:
def z_score(x, mu, sigma, n=1):
    """Return the distance of ``x`` from ``mu`` in units of ``sigma / sqrt(n)``.

    Parameters
    ----------
    x : value to standardize
    mu : mean of the distribution
    sigma : standard deviation of the distribution
    n : sigma is divided by sqrt(n) (the standard-error form); with the
        default of 1 the result is the plain z-score ``(x - mu) / sigma``

    Returns
    -------
    ``(x - mu) / (sigma / sqrt(n))``
    """
    return (x - mu)/(sigma/math.sqrt(n))

# IQ has sigma = 15, so a score of 115 is one standard deviation above the mean: z == 1.0
z = z_score(115, 100, 15)
z

Let's create a standard normal distribution.  With the z-score in hand, its cumulative probability gives the chance of having an IQ of 115 or below: about 84%.

In [None]:
# Standard normal distribution (mean 0, standard deviation 1); its cdf at z is
# the probability of observing a z-score of z or below.
d = norm(loc=0, scale=1)
d.cdf(z)

In [None]:
# The same probability computed directly on the IQ scale.  The text defines IQ
# with mean 100 and standard deviation 15, so scale must be 15.
d = norm(loc=100.0, scale=15.0)
d.cdf(115)  # about 0.84, the 84% quoted in the text above

The probability of having an IQ of 115 or above is simply 1 minus the previous result.

In [None]:
# Upper tail: P(IQ >= 115) is the complement of the cumulative probability at 115.
d = norm(100.0, 15.0)
1.0 - d.cdf(115)

In [None]:
# z-score of an IQ of 93.  NOTE: n is left at z_score's default of 1, so sigma
# is used unscaled — no sample-size adjustment is actually applied here.
z = z_score(93, 100, 15)
z

In [None]:
# Probability of a z-score at or below the current value of z,
# read off the standard normal distribution.
d = norm(loc=0, scale=1)
d.cdf(z)

In [None]:
# Same probability without standardizing first: evaluate the IQ-scale cdf
# (mean 100, standard deviation 15) at 93.  Again, the same answer.
d = norm(loc=100, scale=15)
d.cdf(93)

In [None]:
# IQ 115 again: standardize it, then look the z-score up on the standard normal curve.
d = norm(loc=0, scale=1)
z = z_score(115, mu=100, sigma=15)
d.cdf(z)

In [None]:
# Display the z-score computed in the previous cell.
z

In [None]:
# ppf is the inverse of cdf: it maps a cumulative probability back to the value that produces it.
d.ppf(0.8413447460685429)  # going backwards from cumulative population to z-score

# Three Sigma Rule

* Plus/minus 1 sigma from mean:  about 68.2% of a population
* Plus/minus 2 sigmas from mean: about 95.4% of a population
* Plus/minus 3 sigmas from mean: about 99.7% of a population

In [None]:
# Share of the population within one standard deviation of the mean.
d = norm(loc=100, scale=15)
d.cdf(100 + 1*15) - d.cdf(100 - 1*15)

In [None]:
# Share of the population within two standard deviations of the mean.
d = norm(loc=100, scale=15)
d.cdf(100 + 2*15) - d.cdf(100 - 2*15)

In [None]:
# Share of the population within three standard deviations of the mean.
d = norm(loc=100, scale=15)
d.cdf(100 + 3*15) - d.cdf(100 - 3*15)

### Shading Areas Under the Curve

In [None]:
def draw_z_score(x, cond, mu, sigma, title):
    """Plot a normal pdf over ``x`` and shade the part selected by ``cond``.

    Parameters
    ----------
    x : array of points at which the pdf is evaluated
    cond : index (e.g. a boolean mask) applied to ``x`` to pick the shaded points
    mu, sigma : mean and standard deviation passed to ``norm.pdf``
    title : title placed on the figure
    """
    shaded_x = x[cond]
    curve = norm.pdf(x, mu, sigma)

    plt.plot(x, curve)
    # Fill from the x-axis (height 0) up to the pdf over the selected points only.
    plt.fill_between(shaded_x, 0, norm.pdf(shaded_x, mu, sigma))
    plt.title(title)
    plt.show()

In [None]:
# Population share between IQ 85 and IQ 115.  Relies on `d` still being the
# norm(100, 15) distribution assigned in an earlier cell.
d.cdf(115)-d.cdf(100-15)

In [None]:
# Shade the upper tail of the IQ distribution (mean 100, standard deviation 15).
# 115 is an IQ score, not a z-score, so the title is labelled accordingly.
domain = np.arange(0, 200, 0.001)
draw_z_score(domain, domain > 115, 100, 15, 'IQ > 115')

In [None]:
from numpy import random

# Draw 1000 simulated IQ scores from a seeded generator so that re-running the
# notebook reproduces the same figure.
iq_sample = random.default_rng(42).normal(loc=100, scale=15, size=1000)
# sns.distplot is deprecated since seaborn 0.11; a density-scaled histplot with
# a KDE overlay is its documented replacement.
sns.histplot(iq_sample, kde=True, stat="density")
plt.show()

In [None]:
# Lower tail of the standard normal curve: shade everything to the left of z0.
z0 = -0.75
x = np.arange(-3, 3, 0.001)
draw_z_score(x, cond=x < z0, mu=0, sigma=1, title='z<-0.75')

In [None]:
# Central band of the standard normal curve: shade the points strictly between -z0 and z0.
z0 = 0.75
x = np.arange(-3, 3, 0.001)
draw_z_score(x, cond=(x > -z0) & (x < z0), mu=0, sigma=1, title='-0.75<z<0.75')

In [None]:
# Upper tail of the standard normal curve: shade everything to the right of z0.
z0 = 0.75
x = np.arange(-3, 3, 0.001)
draw_z_score(x, cond=x > z0, mu=0, sigma=1, title=' z> 0.75')

![overview](http://greenteapress.com/thinkstats2/html/thinkstats2026.png)

Figure 6.2: A framework that relates representations of distribution functions.

*Think Stats* by Allen B. Downey
ISBN-13: 978-1491907337
ISBN-10: 1491907339 

[Hypothesis Testing](http://greenteapress.com/thinkstats2/html/thinkstats2010.html#sec95)
*Ibid.*