# Alan Donahue Ch 9 Exercise

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [120]:
from __future__ import print_function, division

%matplotlib inline

import numpy as np

import random

import thinkstats2
import thinkplot

In [121]:
class HypothesisTest(object):
    """Base class for simulation-based hypothesis tests.

    Subclasses must override TestStatistic and RunModel; MakeModel is an
    optional hook that runs once, before the observed statistic is computed.
    """

    def __init__(self, data):
        """Stores the data, builds the model, and computes the observed statistic.

        data: data in whatever form is relevant to the subclass
        """
        self.data = data
        # MakeModel runs first so TestStatistic may rely on attributes it sets.
        self.MakeModel()
        self.actual = self.TestStatistic(data)

    def PValue(self, iters=1000):
        """Computes the fraction of simulated statistics >= the observed one.

        The simulated statistics are kept in self.test_stats.

        iters: number of simulated data sets to generate

        returns: float p-value
        """
        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]

        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count / iters

    def TestStatistic(self, data):
        """Computes the test statistic; must be overridden.

        data: data in whatever form is relevant to the subclass

        raises: NotImplementedError if the subclass does not override it
        """
        # The built-in exception replaces a name that was never defined in
        # this file, which made a missing override fail with NameError.
        raise NotImplementedError(
            "%s must override TestStatistic" % type(self).__name__)

    def MakeModel(self):
        """Builds a model of the null hypothesis; optional hook, no-op here."""
        pass

    def RunModel(self):
        """Generates one simulated data set; must be overridden.

        returns: simulated data in the form TestStatistic accepts

        raises: NotImplementedError if the subclass does not override it
        """
        raise NotImplementedError(
            "%s must override RunModel" % type(self).__name__)

In [122]:
class DiffMeansPermute(thinkstats2.HypothesisTest):
    """Permutation test on the absolute difference between two group means."""

    def TestStatistic(self, data):
        """Returns |mean(first) - mean(second)| for a pair of groups.

        data: pair of sequences that provide a .mean() method
        """
        first_group, second_group = data
        return abs(first_group.mean() - second_group.mean())

    def MakeModel(self):
        """Records both group sizes and pools the two groups into one array."""
        first_group, second_group = self.data
        self.n = len(first_group)
        self.m = len(second_group)
        self.pool = np.hstack((first_group, second_group))

    def RunModel(self):
        """Shuffles the pool in place and splits it back at the first group's size.

        returns: pair of arrays (first n items, remaining items)
        """
        np.random.shuffle(self.pool)
        split = self.n
        return self.pool[:split], self.pool[split:]

In [123]:
import first

# first.MakeFrames() returns three objects, unpacked here as live, firsts and
# others (presumably all live births, first babies, and the rest -- confirm
# against the `first` module).
live, firsts, others = first.MakeFrames()
# Pair of prglngth value arrays (firsts, others): the two-group form that the
# permutation tests in this notebook unpack as `group1, group2`.
data = firsts.prglngth.values, others.prglngth.values

In [124]:
class DiffStdPermute(DiffMeansPermute):
    """Permutation test on the signed difference in standard deviation.

    Pooling and shuffling are inherited from DiffMeansPermute; only the
    statistic differs, and it is one-sided (no absolute value is taken).
    """

    def TestStatistic(self, data):
        """Returns std(first) - std(second) for a pair of groups.

        data: pair of sequences that provide a .std() method
        """
        first_group, second_group = data
        spread_first = first_group.std()
        spread_second = second_group.std()
        return spread_first - spread_second

In [125]:
class CorrelationPermute(thinkstats2.HypothesisTest):
    """Permutation test on the absolute correlation between two sequences."""

    def TestStatistic(self, data):
        """Returns the absolute value of thinkstats2.Corr for the pair.

        data: pair of sequences (xs, ys)
        """
        left, right = data
        return abs(thinkstats2.Corr(left, right))

    def RunModel(self):
        """Returns the data with the first sequence randomly permuted.

        np.random.permutation returns a new array, so self.data is not
        modified; the second sequence is passed through unchanged.
        """
        left, right = self.data
        return np.random.permutation(left), right

In [126]:
class PregLengthTest(thinkstats2.HypothesisTest):
    """Chi-squared permutation test on the distribution of pregnancy lengths.

    Only the values 35..43 are compared; the expected probabilities come from
    the pooled data of both groups.
    """

    def MakeModel(self):
        """Pools both groups and computes the expected probability per value."""
        firsts, others = self.data
        self.n = len(firsts)
        self.pool = np.hstack((firsts, others))

        pmf = thinkstats2.Pmf(self.pool)
        self.values = range(35, 44)
        self.expected_probs = np.array(pmf.Probs(self.values))

    def RunModel(self):
        """Shuffles the pool in place and splits it at the first group's size.

        returns: pair of arrays (first n items, remaining items)
        """
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data

    def TestStatistic(self, data):
        """Returns the sum of the chi-squared statistics of the two groups.

        data: pair of sequences of pregnancy lengths
        """
        firsts, others = data
        stat = self.ChiSquared(firsts) + self.ChiSquared(others)
        return stat

    def ChiSquared(self, lengths):
        """Returns the chi-squared statistic of one group against the pool.

        lengths: sequence of pregnancy lengths

        Values whose expected count is zero are skipped. NOTE(review): this
        assumes Pmf.Probs reports 0 for a value absent from the pool -- confirm
        against thinkstats2. In that case the observed count is also zero, so
        the term would be 0/0 = NaN and would corrupt the comparisons made by
        PValue; dropping it leaves the statistic unchanged whenever every
        value is present.
        """
        hist = thinkstats2.Hist(lengths)
        observed = np.array(hist.Freqs(self.values))
        expected = self.expected_probs * len(lengths)
        populated = expected > 0
        terms = ((observed[populated] - expected[populated])**2
                 / expected[populated])
        return sum(terms)

In [127]:
def FalseNegRate(data, num_runs=1000):
    """Estimates the false negative rate of DiffMeansPermute by resampling.

    Each run resamples both groups with thinkstats2.Resample, runs a
    permutation test with 101 iterations, and counts the run as a false
    negative when the p-value exceeds 0.05.

    data: pair of sequences
    num_runs: how many experiments to simulate

    returns: float false negative rate
    """
    first_group, second_group = data
    misses = 0

    for _ in range(num_runs):
        resampled = (thinkstats2.Resample(first_group),
                     thinkstats2.Resample(second_group))
        test = DiffMeansPermute(resampled)
        if test.PValue(iters=101) > 0.05:
            misses += 1

    return misses / num_runs

## Exercises

**Exercise:** As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real. Conversely, as sample size decreases, the test is less likely to be positive even if the effect is real.

To investigate this behavior, run the tests in this chapter with different subsets of the NSFG data. You can use `thinkstats2.SampleRows` to select a random subset of the rows in a DataFrame.

What happens to the p-values of these tests as sample size decreases? What is the smallest sample size that yields a positive test?

In [128]:
import first

def comparePregLength(df, iters = 1000):
    """Runs the difference-in-means permutation test on pregnancy length.

    Rows are split on birthord == 1 versus birthord != 1 and the prglngth
    values of the two groups are compared with DiffMeansPermute.

    df: DataFrame with birthord and prglngth columns
    iters: number of permutations used for the p-value

    returns: float p-value
    """
    first_lengths = df[df.birthord == 1].prglngth.values
    other_lengths = df[df.birthord != 1].prglngth.values

    test = DiffMeansPermute((first_lengths, other_lengths))
    return test.PValue(iters=iters)
    
    
    
def compareWeight(df, iters = 1000):
    """Runs the difference-in-means permutation test on total birth weight.

    Rows are split on birthord == 1 versus birthord != 1; missing
    totalwgt_lb values are dropped from each group before the two groups are
    compared with DiffMeansPermute.

    df: DataFrame with birthord and totalwgt_lb columns
    iters: number of permutations used for the p-value

    returns: float p-value
    """
    first_weights = df[df.birthord == 1].totalwgt_lb.dropna().values
    other_weights = df[df.birthord != 1].totalwgt_lb.dropna().values

    test = DiffMeansPermute((first_weights, other_weights))
    return test.PValue(iters=iters)

In [129]:
def chiSquaredTest(df, iters = 1000):
    """Runs the chi-squared permutation test on pregnancy length.

    Rows are split on birthord == 1 versus birthord != 1 and the prglngth
    values of the two groups are passed to PregLengthTest.

    df: DataFrame with birthord and prglngth columns
    iters: number of permutations used for the p-value

    returns: float p-value
    """
    first_lengths = df[df.birthord == 1].prglngth.values
    other_lengths = df[df.birthord != 1].prglngth.values

    test = PregLengthTest((first_lengths, other_lengths))
    return test.PValue(iters=iters)
    

In [130]:
def correlationTest(df, iters = 1000):
    """Runs the permutation test on the correlation of agepreg and totalwgt_lb.

    Rows missing either column are dropped first, so the two value arrays
    stay aligned row for row.

    df: DataFrame with agepreg and totalwgt_lb columns
    iters: number of permutations used for the p-value

    returns: float p-value
    """
    complete = df.dropna(subset=['agepreg', 'totalwgt_lb'])
    ages = complete.agepreg.values
    weights = complete.totalwgt_lb.values

    return CorrelationPermute((ages, weights)).PValue(iters=iters)

In [131]:
# Start from the full number of rows in `live`; the loop halves n each pass.
n = len(live)

# Six rounds with sample sizes n, n//2, n//4, ...
for i in range(6):
    # Draw n rows from `live` (presumably a random subset, per the exercise
    # text -- confirm against thinkstats2.SampleRows).
    sample = thinkstats2.SampleRows(live, n)
    # Each helper is called without `iters`, so its default of 1000 applies.
    prgLngth = comparePregLength(sample)
    weight = compareWeight(sample)
    chi = chiSquaredTest(sample)
    corr = correlationTest(sample)
    
    # One tab-separated row: sample size followed by the four p-values.
    print(f"{n}:\t{prgLngth}\t{weight}\t{chi}\t{corr}")
    
    # Integer-halve the sample size for the next round.
    n = n // 2


9148:	0.163	0.0	0.0	0.0
4574:	0.973	0.001	0.0	0.0
2287:	0.158	0.257	0.002	0.013
1143:	0.039	0.161	0.011	0.262
571:	0.141	0.446	0.36	0.112
285:	0.821	0.494	0.067	0.266


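The pattern is still a bit erratic, because each row uses a different random sample and only 1000 permutations, but the overall trend is clear: as the sample size becomes smaller, the p-values tend to rise above 0.05 and the tests become negative. In this run the difference in birth weight is significant only down to n = 4574, the correlation test down to n = 2287, and the chi-squared test down to n = 1143. The difference in mean pregnancy length is not significant even with the full sample; its single p-value below 0.05 (0.039 at n = 1143) is most likely a false positive.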