In [None]:
from math import sqrt
from scipy import stats

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pydataset import data
import statistics

# Significance level for the hypothesis tests in this notebook
alpha = 0.05

#### 1. Ace Realty wants to determine whether the average time it takes to sell homes is different for its two offices. A sample of 40 sales from office #1 revealed a mean of 90 days and a standard deviation of 15 days. A sample of 50 sales from office #2 revealed a mean of 100 days and a standard deviation of 20 days. Use a .05 level of significance.

What are we comparing?
- average time (numeric continuous values) to sell for two different groups (categories)
- One sample or two sample? 
- One tailed or two tailed?

Form a hypothesis:

$H_0$ = There is no difference in average time to sell between the two offices  
$H_a$ = There is a difference in average time to sell between the two offices

Significance level $\alpha$ = 0.05

In [None]:
# Simulate 100,000 draws from Normal(mean=90, sd=15) -- the office #1 summary
# statistics -- and plot their histogram.
# A fixed random_state makes the figure identical on every Restart & Run All.
pd.Series(stats.norm(90, 15).rvs(size=100_000, random_state=42)).hist()

In [None]:
# Evaluate the normal density implied by each office's mean / standard
# deviation on a shared integer grid of days (50 .. 149).

x = np.arange(50, 150)

y1, y2 = (
    stats.norm(loc=mean_days, scale=sd_days).pdf(x)
    for mean_days, sd_days in ((90, 15), (100, 20))
)

In [None]:
# Overlay the two density curves, then mark each office's mean (90 and 100
# days) with a dotted vertical line.
for density, office_name in ((y1, 'office 1'), (y2, 'office 2')):
    plt.plot(x, density, label=office_name)
plt.axvline(x=90, ls=':')
plt.axvline(x=100, ls=':', color='orange')

plt.legend()

T-test using descriptive stats: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html



In [None]:
# Two-sample t-test computed directly from the summary statistics
# (Welch's version, since equal_var=False).
α = 0.05

t, p = stats.ttest_ind_from_stats(
    mean1=90, std1=15, nobs1=40,
    mean2=100, std2=20, nobs2=50,
    equal_var=False,
)
t, p

In [None]:
# Derive the conclusion from the computed p-value and α instead of
# hard-coding "reject", so the text stays correct if the inputs change.
if p < α:
    print(f'''
Because the p-value ({p}) is less than alpha value ({α}), we reject the null hypothesis''')
else:
    print(f'''
Because the p-value ({p}) is not less than alpha value ({α}), we fail to reject the null hypothesis''')

### 2. Load the mpg dataset and use it to answer the following questions:

a. Is there a difference in fuel-efficiency in cars from 2008 vs 1999?  
b. Are compact cars more fuel-efficient than the average car?  
c. Do manual cars get better gas mileage than automatic cars?

In [None]:
# Load the mpg dataset through pydataset and preview the first three rows.
mpg = data("mpg")
mpg.head(n=3)

#### 1.  Is there a difference in fuel-efficiency in cars from 2008 vs 1999?

Comparing fuel economy of two different sub-groups (2-sample, 2-tailed t-test)


$H_0$: there is no difference in fuel-efficiency in cars from 2008 vs 1999  
$H_a$: there is a difference in fuel-efficiency in cars from 2008 vs 1999

Calculate average fuel economy assuming 50% highway and 50% city driving


- Should I use arithmetic mean or harmonic mean for average mpg?
    - Arithmetic Mean: fe_am = (cty + hwy)/2
    - Harmonic Mean: fe_hm = 2/(1/cty + 1/hwy)

In [None]:
# Row-wise harmonic mean of the city and highway mpg columns, stored as the
# per-car average fuel economy.
mpg['avg_fe'] = stats.hmean(mpg.loc[:, ['cty', 'hwy']], axis=1)
mpg.head(n=5)

In [None]:
# One Series of average fuel economy per model year.
fe_2008 = mpg.loc[mpg['year'] == 2008, 'avg_fe']
fe_1999 = mpg.loc[mpg['year'] == 1999, 'avg_fe']

In [None]:
# Plot the distribution of average fuel economy for 2008 cars
fe_2008.hist()

In [None]:
# Plot the distribution of average fuel economy for 1999 cars
fe_1999.hist()

In [None]:
# How many observations are in each sample?
# (If N > 30 in each, we treat the t-test's normality condition as met.)

fe_2008.count(), fe_1999.count()

In [None]:
# Are the variances about the same for both samples? (Author's note: yes.)

fe_2008.var(), fe_1999.var()

In [None]:
# Levene's test for equal variances: returns a p-value; a small p-value
# means the variances are unequal.
stats.levene(fe_2008, fe_1999)

In [None]:
# Two-sample t-test on the 2008 vs 1999 samples (equal variances assumed,
# which is scipy's default); display the t-statistic and two-sided p-value.

t, p = stats.ttest_ind(a=fe_2008, b=fe_1999, equal_var=True)
t, p

In [None]:
# Derive the conclusion from p and alpha instead of hard-coding the outcome,
# and report the actual alpha rather than a literal ".05".
if p < alpha:
    print(f'''
Because p ({p:.3f}) < alpha ({alpha}), we reject the null\
 hypothesis that there is no difference in fuel-efficiency in cars\
 from 2008 and 1999.
''')
else:
    print(f'''
Because p ({p:.3f}) >= alpha ({alpha}), we fail to reject the null\
 hypothesis that there is no difference in fuel-efficiency in cars\
 from 2008 and 1999.
''')

In [None]:
# Sample means of average fuel economy for 2008 and 1999 cars
fe_2008.mean(), fe_1999.mean()

In [None]:
# Side-by-side histogram of average fuel economy for the two model years.
plt.hist(
    x=[fe_1999, fe_2008],
    label=["1999 cars", "2008 cars"],
)
plt.legend(loc="upper right")

#### 2. Are compact cars more fuel-efficient than the average car?


Comparing fuel economy of one group with population mean  (1-sample, 1-tailed t-test)

$H_0$: There is no difference in fuel-efficiency between compact cars and the population average fuel-efficiency  
$H_a$: Compact cars are more fuel efficient than the average car

To reject $H_0$ in favor of $H_a$, we need t > 0 and p/2 < 0.05

In [None]:
# Average fuel economy of compact cars, plus the mean over all cars that the
# one-sample test uses as the population value.
fe_compact = mpg.loc[mpg['class'] == 'compact', 'avg_fe']
μ = mpg['avg_fe'].mean()

In [None]:
# Look at the distribution of fuel economy of compact cars. It also shows > 30 observations,
# so we meet the normality assumption for the t-test.

fe_compact.hist()

In [None]:
# One-sample t-test of compact-car fuel economy against the overall mean;
# display the t-statistic and the two-sided p-value.

t, p = stats.ttest_1samp(a=fe_compact, popmean=μ)
t, p

In [None]:
# One-tailed decision: "compact cars are MORE efficient" needs a positive
# t-statistic as well as a halved two-sided p-value below alpha, so both
# conditions are checked before claiming rejection.
if t > 0 and p / 2 < alpha:
    print(f'''
Because t ({t:.3f}) > 0 and p/2 ({p/2:.12f}) < alpha ({alpha}), we reject the null hypothesis that there is no difference in fuel-efficiency between compact cars and the overall average.
''')
else:
    print(f'''
Because we do not have both t ({t:.3f}) > 0 and p/2 ({p/2:.12f}) < alpha ({alpha}), we fail to reject the null hypothesis that there is no difference in fuel-efficiency between compact cars and the overall average.
''')

#### 3. Do manual cars get better gas mileage than automatic cars?

- One-sample or two-sample t test?
- 1-tailed or 2-tailed?

$H_0$: there is no difference in fuel-efficiency between manual cars and automatic transmission cars  
$H_a$: manual cars are more fuel-efficient than automatic transmission cars (1-tailed, so we need t > 0 and p/2 < 0.05)

In [None]:
# Split average fuel economy by whether `trans` mentions 'auto' or 'manual'.
fe_auto = mpg.loc[mpg['trans'].str.contains('auto'), 'avg_fe']
fe_manual = mpg.loc[mpg['trans'].str.contains('manual'), 'avg_fe']

In [None]:
# Look at the distribution for automatic-transmission cars. N > 30

fe_auto.hist()

In [None]:
# Look at the distribution for manual-transmission cars. N > 30

fe_manual.hist()

In [None]:
# Compare the sample variances of the automatic and manual groups
fe_auto.var(), fe_manual.var()

In [None]:
# Levene's test for equal variances: returns a p-value; a small p-value means
# unequal variances, which is what would justify the Welch t-test
# (equal_var=False) run in the next cell.
stats.levene(fe_auto, fe_manual)

In [None]:
# Welch's t-test (manual first, so t > 0 means manual averages higher);
# display t, the halved (one-tailed) p-value, and alpha.
t, p = stats.ttest_ind(a=fe_manual, b=fe_auto, equal_var=False)
t, p / 2, alpha

In [None]:
# One-tailed decision: "manual is better" requires t > 0 (manual sample mean
# above the automatic one) in addition to p/2 < alpha, so both are checked
# instead of hard-coding the rejection.
if t > 0 and p / 2 < alpha:
    print(f'''
Because t ({t:.3f}) > 0 and p/2 ({p/2:.6f}) < alpha ({alpha}), we reject the null hypothesis that there is no difference in gas mileage between manual and automatic cars
''')
else:
    print(f'''
Because we do not have both t ({t:.3f}) > 0 and p/2 ({p/2:.6f}) < alpha ({alpha}), we fail to reject the null hypothesis that there is no difference in gas mileage between manual and automatic cars
''')

In [None]:
# Label each car: rows whose `trans` contains 'auto' are automatic,
# everything else is labelled manual.
mpg['transmission_type'] = np.where(
    mpg['trans'].str.contains('auto'),
    'Auto Transmission',
    'Manual Transmission',
)

In [None]:
# Bar chart of the mean average fuel economy for each transmission type.
(
    mpg
    .groupby('transmission_type')['avg_fe']
    .mean()
    .plot.bar()
)
plt.ylabel('Average mileage')
plt.xlabel('')
plt.xticks(rotation=0)
plt.title('Is mileage different by transmission type?')