In [78]:
import pandas as pd
import numpy as np
from scipy import stats
import math
from pydataset import data
from env import get_db_url

### For each of the following questions, formulate a null and alternative hypothesis (be as specific as you can be), then give an example of what a true positive, true negative, type I and type II errors would look like.



- **Is the website redesign any good?**

>**H<sub>o</sub>:** website redesign is not good, click rate has no significant change
>
>>False Negative **(type II error)** Click rate has not changed, but another metric has improved that we're not tracking
>
>**H<sub>a</sub>:** website redesign is good, click rate has significantly improved
>
>>False Positive **(type I error)** Click rate has signifaicantly improved but not due to website redesign


- **Is our television ad driving more sales?**

>**H<sub>o</sub>:** Ad has no effect on sales, no significant increase
>
>>False Negative **(type II error)** sales have not risen, but due to other causes
>
>**H<sub>a</sub>:** Ad has boosted sales
>
>>False Positive **(type I error)** Sales have risen, but due to other causes

- **Has the network latency gone up since we switched internet service providers?**

>**H<sub>o</sub>:** Network latency has not been affected by new internet provider
>
>>False Negative **(type II error)** latency has risen but measurment device is not working
>
>**H<sub>a</sub>:** Network latency has risen significantly
>
>>False Positive **(type I error)** latency has not risen but measurment device is not working

# T-tests

- Ace Realty wants to determine whether the average time it takes to sell homes is different for its two offices. A sample of 40 sales from office #1 revealed a mean of 90 days and a standard deviation of 15 days. A sample of 50 sales from office #2 revealed a mean of 100 days and a standard deviation of 20 days. Use a .05 level of significance.

In [3]:
#metrics
dist_1 = stats.norm(90, 15).rvs(40)
dist_2 = stats.norm(100,20).rvs(50)

n1 = 40
n2 = 50

μ1 = 90
μ2 = 100

σ1 = 15
σ2 = 20

alpha = .05

In [4]:
#two sample t-test
#h0: time1 = time2
#ha: time1 ≠ time2
deg_f = n1 + n2 - 2
σ_pool = math.sqrt(((n1 - 1)*(σ1 ** 2) + (n2 - 1)*(σ2 ** 2)) / (deg_f))

t = (μ1 - μ2) / (σ_pool * math.sqrt(1/n1 + 1/n2))
t

-2.6252287036468456

In [5]:
p = stats.t(deg_f).cdf(t)
p

0.005104926224619695

In [6]:
p > alpha

False

### Load the mpg dataset and use it to answer the following questions:


In [96]:
mpg = data('mpg')

In [97]:
mpg['avg_mpg'] = (mpg.hwy+mpg.cty)/2

In [98]:
mpg.head(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,avg_mpg
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0


In [119]:
volks_hwy_mpg = mpg[mpg['manufacturer'] == 'volkswagen'].hwy
volks_hwy_mpg
stats.ttest_1samp(volks_hwy_mpg, mpg.hwy.mean())

Ttest_1sampResult(statistic=5.652041311757083, pvalue=6.068818710469792e-06)

In [116]:
compact_mpg = mpg[mpg['class'] == 'compact'].cty
midsize_mpg = mpg[mpg['class'] == 'midsize'].cty
stats.ttest_ind(compact_mpg, midsize_mpg)

Ttest_indResult(statistic=2.2845195147123536, pvalue=0.024803276085898066)

- Is there a difference in fuel-efficiency in cars from 2008 vs 1999?


In [99]:
μ = mpg.groupby('year').avg_mpg.mean()
σ = mpg.groupby('year').avg_mpg.std()
n = mpg.groupby('year').avg_mpg.count()

In [100]:
#Ho: mu_2008 == mu_1999
#Ha: mu_2008 != mu_1999
deg_f = n[1999] + n[2008] - 2
σ_pool = math.sqrt(((μ[1999] - 1)*(σ[1999] ** 2) + (μ[2008] - 1)*(σ[2008] ** 2)) / (deg_f))

t = (μ[1999] - μ[2008]) / (σ_pool * math.sqrt(1/n[1999] + 1/n[2008]))
t

0.5404265072353494

In [101]:
p = stats.t(deg_f).cdf(t)
p

0.7052893195269174

- Are compact cars more fuel-efficient than the average car?

In [102]:
#ho: compact_mpg <= avg_mpg
#ha: compact_mpg <  avg_mpg

In [103]:
μ_all = mpg.avg_mpg.mean()
σ_all = mpg.avg_mpg.std()
n_all = mpg.avg_mpg.count()

μ_com = mpg[mpg['class'] == 'compact'].avg_mpg.mean()
σ_com = mpg[mpg['class'] == 'compact'].avg_mpg.std()
n_com = mpg[mpg['class'] == 'compact'].avg_mpg.count()


In [104]:
deg_f = n_com + n_all - 2
σ_pool = math.sqrt(((μ_com - 1)*(σ_com ** 2) + (μ_all - 1)*(σ_all ** 2)) / (deg_f))

t = (μ_com - μ_all) / (σ_pool * math.sqrt(1/n_com + 1/n_all))
t

15.22973882764796

In [105]:
p = stats.t(deg_f).cdf(t)
p

1.0

- Do manual cars get better gas mileage than automatic cars?

In [106]:
#Ho: standard mpg = automatic mpg
#Ha: standard mpg > automatic mpg
mpg['simple_trans'] = mpg.trans.str.startswith('manual')
mpg.head(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,avg_mpg,simple_trans
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5,False
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0,True
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5,True
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5,False
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0,False


In [107]:
μ = mpg.groupby('simple_trans').avg_mpg.mean()
σ = mpg.groupby('simple_trans').avg_mpg.std()
n = mpg.groupby('simple_trans').avg_mpg.count()

In [108]:
deg_f = n[True] + n[False] - 2
σ_pool = math.sqrt(((μ[True] - 1)*(σ[False] ** 2) + (μ[True] - 1)*(σ[False] ** 2)) / (deg_f))

t = (μ[True] - μ[False]) / (σ_pool * math.sqrt(1/n[True] + 1/n[False]))
t

11.107655934792621

In [69]:
p = stats.t(deg_f).cdf(t)
p

1.0

# Correlation



### Class Example
 - is there a correlation between city mpg and displacement?
 - $H_o$ there is a correleation
 - $H_a$ there is no correlation

In [126]:
mpg = data('mpg')
stats.pearsonr(mpg['cty'], mpg['displ'])

(-0.7985239689348551, 4.737914890205637e-53)

### Use the telco_churn data.


In [172]:
#get db url
url = get_db_url('telco_churn')
telco = pd.read_sql('''
    SELECT customer_id, tenure, monthly_charges, total_charges, churn, phone_service as phone, internet_service_type_id as internet
    FROM customers''',
    url)
telco['internet'] = ((telco.internet == 1) | (telco.internet == 2))
telco['phone_and_internet'] = (telco.internet) & (telco.phone)
telco.sort_values('total_charges').tail(7032)

Unnamed: 0,customer_id,tenure,monthly_charges,total_charges,churn,phone,internet,phone_and_internet
4386,6180-YBIQI,5,24.30,100.2,No,No,True,True
2222,3178-FESZO,1,100.25,100.25,Yes,Yes,True,True
1771,2587-YNLES,6,20.10,100.35,No,Yes,False,False
5542,7802-EFKNY,5,24.95,100.4,Yes,No,True,True
5126,7216-EWTRS,1,100.80,100.8,Yes,Yes,True,True
3315,4719-UMSIY,6,19.65,100.9,No,Yes,False,False
5762,8104-OSKWT,12,79.80,1001.2,No,Yes,True,True
4983,7009-PCARS,55,19.10,1001.5,No,Yes,False,False
4764,6705-LNMDD,20,50.00,1003.05,No,Yes,True,True
5844,8219-VYBVI,39,25.00,1004.35,No,Yes,False,False


- Does tenure correlate with monthly charges? 

In [130]:
x = telco.tenure
y = telco.monthly_charges
corr, p = stats.pearsonr(x,y)
{'correlation': corr, 'probability' : p}

{'correlation': 0.24789985628615002, 'probability': 4.0940449915016345e-99}

- Total charges?


In [182]:
no_null_telco = telco.sort_values('total_charges').tail(7032)
x = no_null_telco.tenure
y = no_null_telco.total_charges.apply(float)
corr, p = stats.pearsonr(x,y)
{'correlation': corr, 'probability' : p}

{'correlation': 0.825880460933202, 'probability': 0.0}

- What happens if you control for phone and internet service?

In [181]:
x = telco[telco['phone_and_internet'] == True].tenure
y = telco[telco['phone_and_internet'] == True].monthly_charges
corr, p = stats.pearsonr(x,y)
{'correlation': corr, 'probability' : p}

{'correlation': 0.3723066263198703, 'probability': 5.890963580482675e-181}

In [184]:
x = no_null_telco[no_null_telco['phone_and_internet'] == True].tenure
y = no_null_telco[no_null_telco['phone_and_internet'] == True].total_charges.apply(float)
corr, p = stats.pearsonr(x,y)
{'correlation': corr, 'probability' : p}

{'correlation': 0.9319346187418098, 'probability': 0.0}