# Inference Statistics Notebook
NY Taxi Cab Trip Duration

# 1. Load the Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st # some useful stuff
from scipy.stats import t

###  You will need to get the following data and put it into the jupyter directory to run this notebook
* NYC Taxi 2016: https://www.kaggle.com/c/nyc-taxi-trip-duration/data


In [3]:
# https://towardsdatascience.com/getting-started-with-google-colab-f2fff97f594c
#from google.colab import drive
#drive.mount('/gdrive',force_remount=True)

Mounted at /gdrive


In [3]:

df = pd.read_csv('./nyc_taxi_train_2016.csv')
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
df.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [5]:
# 
population = df['pickup_latitude']
mu_p = population.mean()
sigma_p = population.std(ddof=0)
print('Caution: you dont always know the population μ and σ')
print("mu_p: ", mu_p, ", sigma_p:", sigma_p)


Caution: you dont always know the population μ and σ
mu_p:  40.750920908391734 , sigma_p: 0.032881174986482624


### Population Statistics

* population mean, μ = 40.75
* population σ = 0.0329

### Assumptions

Assume the scores the population samples (latitudes) are normally distributed data.

# 2. t-Statistic Confidence Interval

You receive a sample of N=2 latitudes from the population. What is the confidence interval of the sample mean.

In [6]:
n = 100
alpha = 0.05 # 95% confidence
x_sample = df['pickup_latitude'].sample(n = n)
print(x_sample)

766324     40.755150
417696     40.768528
152237     40.800297
235439     40.754917
316480     40.719379
             ...    
1418208    40.760918
848289     40.752968
674462     40.740898
561968     40.766834
94847      40.768669
Name: pickup_latitude, Length: 100, dtype: float64


In [7]:

mu=mu_p
x_bar=x_sample.mean()
x_s = x_sample.std()           # sample standard deviation ... note, the sample mean (df = n-1) is different than population mean
s=np.std(x_sample,ddof=1)      # sample standard deviation with numpy (ddof = 1, otherwise defaults to ddof = 0 the population mean)
n = len(x_sample)
SE = s / np.sqrt(n) 
ddof = n - 1
t_critical = t.ppf(1-alpha, df = ddof )
t_statistic = (x_bar - mu)/SE
p_value=2*t.sf(t_statistic, ddof)
c_top = x_bar + t_critical * SE
c_bottom = x_bar - t_critical *SE
p_confidence = (1 - alpha)*100

print('Population Statistics')
print('mu_p = %f , sigma_p = %f' %(mu_p, sigma_p))
print('x_sample =', x_sample.values)
print('\nSample Statistics')
print('x_sample =', x_sample.values)
print('n = ', n)
print('df =', ddof)
print('x_bar = %1.4f' %x_bar)
print('x_s = %1.4f, s =%1.4f' %(x_s, s))
print('t_critical = %1.4f' %t_critical)
print('t_statistic = %1.4f' %t_statistic)
print('SE = %1.4f' %SE)
print('p_value = %1.4f' %p_value)  # one-tail p-value

print('The %1.3f %c confidence interval = (%1.4f , %1.4f)' %( p_confidence, '%',c_bottom, c_top) )

# confidence at n = 2 very small is a bit bogus the 95% confidence can possibly totally not include the actual mean because got unlucky
# try n = 10 and you see much better results

Population Statistics
mu_p = 40.750921 , sigma_p = 0.032881
x_sample = [40.75514984 40.76852798 40.80029678 40.75491714 40.71937943 40.74504089
 40.73035049 40.74847794 40.70961761 40.76991653 40.76084137 40.75196075
 40.77970505 40.77485657 40.78976059 40.76418304 40.74412537 40.78069305
 40.72893143 40.70505524 40.73348618 40.75665665 40.72219849 40.74279785
 40.74172592 40.76224899 40.76580048 40.77803802 40.75478745 40.76787949
 40.73800659 40.73704147 40.75043869 40.72333145 40.76478958 40.76203918
 40.76558304 40.73724747 40.7491684  40.73079681 40.78291321 40.77909088
 40.75118256 40.74429321 40.7231369  40.75940323 40.74858856 40.76259232
 40.75533295 40.77376175 40.75831223 40.73825073 40.75307083 40.73846817
 40.77176666 40.77986526 40.73563766 40.75962448 40.77408218 40.75827408
 40.73691559 40.71906662 40.79418182 40.76358032 40.74404144 40.73271561
 40.77477264 40.76057816 40.77132797 40.74790955 40.71688843 40.75233078
 40.74520874 40.75095367 40.64689636 40.73652267 40.7

# 3. z-Test Hypothesis Test

You receive a sample of N=10 latitudes from the population and you are trying to assess if the sample is representative. Is the sample mean representative of the NY Taxi population pickup_latitude true mean within a 95% confidence (α = 0.05) or does it differ from the population mean? You have prior knowledge that the popuulation has a standar deviation of σ = 0.032881174986483304

To test this question we will use what is refered to as a one-sample z-test. First we state the null hypothesis and alternative hypothesis like this;

* H0: The sample is from the NY Taxi population, x_bar = μ.  
* HA: The sample is not from the NY Taxi population, x_bar != (not equal) μ.

Notice that this is a two tail test. If the the z_statistic falls inside of the +/- z_critical then the sample mean is representative of the population mean. Otherwise, if the z_statistic falls within a tail it is unlikely to come from a representative sample. 

Equivalently, we can state the condition in terms of p-value. If the p-value ≥ α then accept the null hypothosis that the sample mean is representative of the population mean. There is not sufficient evidence that the sample is non-representative. Alternatively, if the p-value < α then there is sufficient evidence that the population is non-representative. 


In [8]:
# Sample the population ... insert an arbitrary bias
n=10
# use one of the two samples below for illustration
  # let's take out the center of the population from the sample ... spreads out tails 
x_sample = df[(df['pickup_latitude'] < 40 ) | (df['pickup_latitude'] > 42 ) ]['pickup_latitude'].sample(n = n)
#x_sample = df['pickup_latitude'].sample(n = n)
print(x_sample)

646333     39.441147
275644     34.712234
1427116    38.946033
1416336    39.923027
952661     36.029301
1387351    39.457882
193116     37.777771
233451     38.898849
521870     42.458942
801241     38.898849
Name: pickup_latitude, dtype: float64


In [9]:
# compute the z-test
#  note, we know the population sigma from above

x_bar=x_sample.mean()
n = len(x_sample)
alpha=0.05
z_critical = st.norm.ppf(1-alpha)
SE = sigma_p/np.sqrt(n)
z_statistic = (x_bar - mu)/SE
p_value = 2*st.norm.sf(abs(z_statistic))


print('Population Statistics')
print('mu = %f , sigma = %f' %(mu_p, sigma_p))
print('x_sample =', x_sample.values)
print('\nSample Statistics')
print('n =',n)
print('x_bar = %1.4f' %x_bar)
print('(- z_critical  z_critical)= (%1.4f , %1.4f)' %(-z_critical,z_critical))
print('z_statistic = %1.4f' %(z_statistic))
print('p_value = %1.4f' %(p_value))

print('\nConclusion')
if ( p_value > alpha ):
  print('Because p-value ≥ α, we fail to reject the null hypothesis. There is not sufficient evidence to conclude that the sample mean differs from the population.')
else:
  print('Because p < α, we reject the null hypothesis. There is sufficient evidence to conclude that the sample mean is not representative of the population.')



Population Statistics
mu = 40.750921 , sigma = 0.032881
x_sample = [39.44114685 34.7122345  38.94603348 39.92302704 36.02930069 39.45788193
 37.777771   38.89884949 42.45894241 38.89884949]

Sample Statistics
n = 10
x_bar = 38.6544
(- z_critical  z_critical)= (-1.6449 , 1.6449)
z_statistic = -201.6281
p_value = 0.0000

Conclusion
Because p < α, we reject the null hypothesis. There is sufficient evidence to conclude that the sample mean is not representative of the population.


# 4. t-test Hypothesis Test

You receive a sample of N=10 latitudes. You suspect that the population mean latitude is greater than 40.74, but you need to prove your assertion. Ensure that your conclusion is stated with a 95% confidence (α = 0.05). You do not know the population standard deviation.

To test this question we will use what is refered to as a one-sample t-test. First we state the null hypothesis and alternative hypothesis like this;

* H0: μ ≤ 40.74   null, contrary to assertion
* HA: μ > 40.74   the assertion

Notice that this is a one-tail test. If the the t_statistic falls inside outside +/- t_critical then the you fail to reject the null hypothesis.  if the t_statistic falls to the rigth of t_critical then you reject the null hypothesis.

Equivalently, we can state the condition in terms of p-value. If the p-value ≥ α then accept the null hypothosis. Alternatively, if the p-value < α then reject the null hypothesis, since there is sufficient evidence to prove the alternative. 

In [10]:
# Try different values of n to see how it affects the results
n=10
x_sample = df['pickup_latitude'].sample(n = n)
print(x_sample)

604208     40.748791
172140     40.716843
601687     40.774261
1327751    40.729778
1162820    40.787754
863817     40.780315
1095638    40.783508
541392     40.766041
479641     40.720001
1049049    40.722534
Name: pickup_latitude, dtype: float64


In [11]:
mu=40.74
x_bar=x_sample.mean()
s=np.std(x_sample,ddof=1) # sample standard deviation  with numpy (ddof = 1, otherwise defaults to ddof = 0 the population mean)
n = len(x_sample)
SE = s/ np.sqrt(n) 
ddof = n - 1

t_critical = t.ppf(1-alpha, df = ddof )
t_statistic = (x_bar - mu)/SE
p_value=t.sf(t_statistic, ddof)
c_top = x_bar + t_critical * SE
c_bottom = x_bar - t_critical * SE
p_confidence = (1 - alpha)*100

print('population statistics')
print('mu_p = %f , sigma_p =%f' %(mu_p, sigma_p))

print('\nassertion')
print('mu = %f' %mu)
print('\nsample statistics')
print('x_sample =', x_sample)
print('n = ', n)
print('df =', ddof)
print('x_bar = %1.4f' %x_bar)
print('s = %1.4f' %s)
print('t_critical = %1.4f' %t_critical)
print('t_statistic = %1.4f' %t_statistic)
print('SE = %1.4f' %SE)
print('p_value = %1.4f' %p_value)  # one-tail p-value

print('The %1.3f %c confidence interval = (%1.4f , %1.4f)' %( p_confidence, '%',c_bottom, c_top) )


population statistics
mu_p = 40.750921 , sigma_p =0.032881

assertion
mu = 40.740000

sample statistics
x_sample = 604208     40.748791
172140     40.716843
601687     40.774261
1327751    40.729778
1162820    40.787754
863817     40.780315
1095638    40.783508
541392     40.766041
479641     40.720001
1049049    40.722534
Name: pickup_latitude, dtype: float64
n =  10
df = 9
x_bar = 40.7530
s = 0.0287
t_critical = 1.8331
t_statistic = 1.4329
SE = 0.0091
p_value = 0.0928
The 95.000 % confidence interval = (40.7364 , 40.7696)


# 5. t-Test Two Sample Hypothesis Test

We recall from the EDA exercise, vendor 2 takes long trip durations in addition to short trip durations. Vendor 1 only seems have short trip durations. An experiment is conducted to statistically substantiate that vendor 2 has a higher mean trip duration at least 250 seconds bigger than vendor 1. 

Two randomly chosen groups are sampled from the population. Use a significance level of α < 0.05 (95% confidence) to substantiate your conclusion.

Let μ1 represent the population mean for the vendor_id  = 1
Let μ2 represent the population mean for vendor_id = 2

We define the hypothesis as follows

* null hypothesis: H0: (μ2 - μ 1) - 250 = 0
* alternative hypothesis: Ha : μ2 - μ1 - 250 != 0

This is a two-sample (independent, "unpaired"), 2-sided t-test

The null hypothesis states that the expected difference is 250, while the asertion claims that the difference in the population mean is different than 250.


In [12]:
# try changing the number of samples to see how it affects the results below
n1=1000
n2=1500

x_sample1 = df[(df['vendor_id'] == 1 )]['trip_duration'].sample(n = n1) 
x_sample2 = df[(df['vendor_id'] == 2 )]['trip_duration'].sample(n = n2) 

mu1 = df[(df['vendor_id'] == 1 )]['trip_duration'].mean()
mu2 = df[(df['vendor_id'] == 2 )]['trip_duration'].mean()
delta_mu = mu2 - mu1

print('mu1 = %1.4f' %mu1)
print('mu2 = %1.4f' %mu2)
print('delta_mu = %1.4f' %delta_mu)


mu1 = 845.4382
mu2 = 1058.6432
delta_mu = 213.2049


In [13]:
deltaX = 250
n1 = len(x_sample1)
n2 = len(x_sample2)
x1_bar = x_sample1.mean()
x2_bar = x_sample2.mean()
s1=np.std(x_sample1,ddof=1) # sample standard deviation  with numpy (ddof = 1, otherwise defaults to ddof = 0 the population mean)
s2=np.std(x_sample2,ddof=1) # sample standard deviation  with numpy (ddof = 1, otherwise defaults to ddof = 0 the population mean)

SE = np.sqrt(s1**2 / n1  + s2**2 / n2)
t_statistic = (x1_bar - x2_bar - deltaX ) / SE
p_value = 2*st.norm.sf(abs(t_statistic))

print('p_value = %1.4f' %(p_value))


print('\nConclusion')
if ( p_value > alpha ):
  print('Because p-value ≥ α, we fail to reject the null hypothesis. There is not sufficient evidence to conclude that the trip duration is different than %1.2f s' %deltaX )
else:
  print('Because p < α, we reject the null hypothesis. There is sufficient evidence to conclude that the difference in trip duration is not equal to %1.2f' %deltaX)

p_value = 0.0000

Conclusion
Because p < α, we reject the null hypothesis. There is sufficient evidence to conclude that the difference in trip duration is not equal to 250.00
