In [None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import pandas as pd

# Fetch the Galton height records (CSV hosted on mosaic-web.org) into a DataFrame.
df = pd.read_csv("http://www.mosaic-web.org/go/datasets/galton.csv")

# Announce the data set, then preview its first four rows.
print("This is the famous Galton Height Data collection set of adult heights and their parents heights from 1850's")
print(df.head(n=4))


In [None]:
# df.describe() returns a DataFrame of summary statistics, so it is computed
# once here and an individual column is then selected from it by name.
summary_stats = df.describe()
print(summary_stats, "\n")
print(summary_stats["height"])


In [None]:
# Centre, spread and quartiles of the height column.
num_records = df.shape[0]
mu = df["height"].mean()
sigma = df["height"].std()

# q1 / q3 are the 25th and 75th percentiles: the heights below which
# 25% and 75% of the records fall.
q1, q3 = np.percentile(df["height"], [25, 75])
iqr = q3 - q1  # inter-quartile range

# Tukey outlier fences: 1.5 * IQR below the first quartile and
# 1.5 * IQR above the third quartile (measured from the quartiles, not the mean).
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [None]:
# Report the summary statistics computed in the previous cell.
print('mean height %.3f' % mu)
print('std height %.3f' % (sigma))
print('Quartile 1 %.2f' % q1)
print('Quartile 3 %.2f' % q3)
print('Inter Quartile Range %.2f' % iqr)
print('Tukey Limit Boundaries Lower: %.3f Upper: %.3f' % (lower_bound, upper_bound))

# 100 evenly spaced points covering mean +/- 4 standard deviations; used to
# draw the normal curve fitted with the sample mean and standard deviation.
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 100)

# density=True normalises the histogram to unit area so it shares a y-scale
# with the normal pdf drawn on top of it.
n, bins, patches = plt.hist(df.height, 50, density=True, facecolor='green', alpha=0.75)
plt.plot(x, stats.norm.pdf(x, mu, sigma))
plt.plot([mu, mu], [0, stats.norm.pdf(mu, mu, sigma)], 'k')  # vertical marker at the mean

# Bands of 1, 2 and 3 standard deviations around the mean. The loop starts at 1:
# a "0 std" band is a zero-width interval that only matches heights exactly
# equal to the mean and draws a degenerate (single point) line.
for i in range(1, 4):
    lower = mu - i*sigma
    upper = mu + i*sigma
    record_count = len(df[(df.height>=lower) & (df.height<=upper)])  # number of records inside the band
    # The label says "percentage", so scale the fraction of records to 0-100.
    print('+/- %d std: %.3f-%.3f num_records %d percentage %.3f' % (i, lower, upper, record_count, 100 * record_count/num_records))
    # Horizontal chord across the normal curve at the height of the pdf at the band edges.
    plt.plot([lower, upper], [stats.norm.pdf(lower, mu, sigma), stats.norm.pdf(upper, mu, sigma) ], 'k')

plt.xlabel('height (inches)')
plt.ylabel('density')
plt.title('Heights with fitted normal curve')
plt.grid(True)
plt.show()

In [None]:
# stats.norm.pdf returns a probability *density*, not a probability, so
# "1 - pdf(x)" is not a meaningful quantity. Use the fitted normal's cdf
# (probability of being at or below x) and sf (survival function, 1 - cdf).
print('Avg height %.3f inches: %.3f %.3f' % (mu, stats.norm.cdf(mu, mu, sigma), stats.norm.sf(mu, mu, sigma)))
# Probability under the fitted normal of a height at or below 74 inches.
print('my height 74 inches: %.5f' % (stats.norm.cdf(74, mu, sigma)))
# Empirical counterpart: percentile rank of 74 inches among the recorded heights.
print('Taller then %.5f%%' % stats.percentileofscore(df.height, 74, kind='rank'))


In [None]:
# Split the data by the 'sex' column; this cell keeps the rows marked 'M'
# and shows the summary statistics of their heights.
df_m = df.loc[df['sex'] == 'M']
df_m.describe().height

In [None]:
# Rows whose 'sex' column is 'F', followed by the summary statistics of their heights.
df_w = df.loc[df['sex'] == 'F']
df_w.describe().height

In [None]:
def outliers_z_score(data, threshold=3):
    """Locate values whose absolute z-score exceeds ``threshold``.

    The z-score of each value is ``(value - mean) / std`` where ``mean`` and
    ``std`` are computed over all of ``data``. ``std`` is the array's ``.std()``,
    i.e. NumPy's default (population, ddof=0) standard deviation.

    Parameters
    ----------
    data : array-like of numbers
        Values to screen; converted to a float NumPy array.
    threshold : float, optional
        Absolute z-score above which a value is reported (default 3).

    Returns
    -------
    tuple of numpy.ndarray
        Result of ``np.where`` on the outlier mask: one array of positional
        indices per dimension of ``data``. Empty input, or input with zero
        standard deviation (all values equal), yields empty index arrays.
    """
    values = np.asarray(data, dtype=float)

    # No data: nothing can be an outlier, and taking the mean of an empty
    # array would only produce NaN plus a RuntimeWarning.
    if values.size == 0:
        return np.where(np.zeros(values.shape, dtype=bool))

    mean = values.mean()
    std = values.std()

    # All values identical: z-scores would be 0/0. Report no outliers
    # explicitly instead of relying on NaN comparisons.
    if std == 0:
        return np.where(np.zeros(values.shape, dtype=bool))

    # Vectorised z-scores for the whole array at once.
    z_scores = (values - mean) / std
    return np.where(np.abs(z_scores) > threshold)
