In [1]:
%%bash

# Line count of the 100-trip sample file (the header line is counted too)
wc -l sample100.csv
# Show the header and the first data rows
head sample100.csv

101 sample100.csv
VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2016-12-15 12:18:37,2016-12-15 12:28:45,6,.50,1,N,141,237,2,7.5,0,0.5,0,0,0.3,8.3
1,2016-12-20 22:39:23,2016-12-20 22:50:27,1,2.00,1,N,246,164,1,9.5,0.5,0.5,3,0,0.3,13.8
1,2016-12-09 23:02:06,2016-12-09 23:11:03,1,1.10,1,N,114,79,1,7.5,0.5,0.5,1.75,0,0.3,10.55
1,2016-12-11 15:40:19,2016-12-11 16:09:53,1,11.90,1,N,138,224,1,34,0,0.5,8.05,5.54,0.3,48.39
2,2016-12-19 11:34:28,2016-12-19 11:47:51,1,1.96,1,N,261,144,2,10.5,0,0.5,0,0,0.3,11.3
2,2016-12-29 14:12:30,2016-12-29 14:22:57,5,1.81,1,N,237,143,2,9,0,0.5,0,0,0.3,9.8
1,2016-12-19 19:30:59,2016-12-19 19:38:48,1,1.50,1,N,237,239,1,7.5,1,0.5,1.85,0,0.3,11.15
2,2016-12-28 14:39:49,2016-12-28 14:50:42,2,1.47,1,N,43,237,2,9,0,0.5,0,0,0.3,9.8
2,2016-12-31 20:15:07,2016-12-31 20:19:

In [3]:
## libraries 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.proportion import samplesize_confint_proportion
from statsmodels.stats.weightstats import _tconfint_generic



In [4]:
## proportion of trips in the 100-row sample that recorded a positive tip
is_tipped = pd.read_csv('sample100.csv')['tip_amount'].gt(0)
ph = is_tipped.mean()  # mean of a boolean mask = share of True values
ph



0.66000000000000003

In [5]:
## 95% confidence interval for the tipped-trip proportion (alpha = 0.05)

proportion_confint(is_tipped.sum(), is_tipped.size, alpha=0.05)

(0.56715478691048404, 0.75284521308951602)

In [6]:
## smallest sample size for which the confidence interval is only 2% wide
## (the second argument is the half-width of the interval: 0.01)

int(
    np.ceil(
        samplesize_confint_proportion(is_tipped.mean(), 0.01)
    )
)




8621

In [7]:
## repeat the proportion estimate and its 95% confidence interval on the 10000-row sample
is_tipped = pd.read_csv('sample10000.csv')['tip_amount'].gt(0)
ph = is_tipped.mean()
print(ph)

proportion_confint(is_tipped.sum(), is_tipped.size, alpha=0.05)



0.6121


(0.60254965257891324, 0.62165034742108671)

### We can clearly see that once the sample size exceeds 8621, the confidence interval becomes narrow (about 2% wide). Passing the desired half-width (half of the target interval width: here 0.01) to `samplesize_confint_proportion` gives an estimate of the minimum sample size.

### Average taxi trip duration

In [35]:
# 100 sample size: 95% two-sided t-interval for the mean trip duration (minutes)

sample100 = pd.read_csv('sample100.csv')

t1 = pd.to_datetime(sample100.tpep_pickup_datetime)
t2 = pd.to_datetime(sample100.tpep_dropoff_datetime)

# Trip duration in whole minutes, stored as floats.
# `.astype('timedelta64[m]')` is no longer supported as of pandas 2.0 (only the
# s/ms/us/ns resolutions are), so derive minutes from the elapsed seconds;
# np.trunc keeps the whole-minute values that the old cast produced.
sample100['duration'] = np.trunc((t2 - t1).dt.total_seconds() / 60)
# sample mean
mu = sample100['duration'].mean()
# standard error of the mean: sample std (ddof=1) divided by sqrt(n)
s = sample100['duration'].std(ddof=1)/np.sqrt(len(sample100['duration']))

# t-interval with n - 1 degrees of freedom, alpha = 0.05
_tconfint_generic(mu, s, len(sample100['duration'])-1, 0.05, 'two-sided')

(11.209581486350903, 16.130418513649097)

In [36]:
# 10000 sample size: 95% two-sided t-interval for the mean trip duration (minutes)

sample10000 = pd.read_csv('sample10000.csv')

t1 = pd.to_datetime(sample10000.tpep_pickup_datetime)
t2 = pd.to_datetime(sample10000.tpep_dropoff_datetime)

# Trip duration in whole minutes, stored as floats.
# `.astype('timedelta64[m]')` is no longer supported as of pandas 2.0 (only the
# s/ms/us/ns resolutions are), so derive minutes from the elapsed seconds;
# np.trunc keeps the whole-minute values that the old cast produced.
sample10000['duration'] = np.trunc((t2 - t1).dt.total_seconds() / 60)
# sample mean
mu = sample10000['duration'].mean()
# standard error of the mean: sample std (ddof=1) divided by sqrt(n)
s = sample10000['duration'].std(ddof=1)/np.sqrt(len(sample10000['duration']))

# t-interval with n - 1 degrees of freedom, alpha = 0.05
_tconfint_generic(mu, s, len(sample10000['duration'])-1, 0.05, 'two-sided')

(15.344696311597962, 17.740303688402037)

### Above we can see that changing the sample size from 100 to 10000 should narrow the confidence interval by about 10 times, but the improvement is far smaller. This is due to outliers in the sample10000 data and can be handled by using medians instead of means.

### Medians

In [38]:
# median trip duration of each sample (same units as the 'duration' column)
median_100, median_10000 = (
    frame['duration'].median() for frame in (sample100, sample10000)
)
print(median_100, median_10000)

10.0 11.0


In [56]:
def get_bootstrap_samples(data, n_samples, rng=None):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array-like
        Observations to resample; converted with ``np.asarray`` so lists and
        pandas Series work as well as NumPy arrays. Resampling is done along
        the first axis.
    n_samples : int
        Number of bootstrap resamples to draw.
    rng : None, int, or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) keeps the original
        behaviour of drawing from NumPy's global random state, so
        ``np.random.seed`` still controls the result. Any other value is
        passed to ``np.random.default_rng`` for a reproducible draw.

    Returns
    -------
    numpy.ndarray
        Array whose first two dimensions are ``(n_samples, len(data))``; each
        row is one resample of the original observations.

    Raises
    ------
    ValueError
        If ``data`` is empty (there is nothing to resample).
    """
    values = np.asarray(data)
    n_obs = len(values)
    if n_obs == 0:
        raise ValueError("cannot draw bootstrap samples from empty data")

    shape = (n_samples, n_obs)
    if rng is None:
        # legacy global-state draw, identical to the original implementation
        indices = np.random.randint(0, n_obs, shape)
    else:
        indices = np.random.default_rng(rng).integers(0, n_obs, size=shape)
    return values[indices]

def stat_intervals(stat, alpha):
    """Return the two-sided percentile interval of ``stat`` at level ``1 - alpha``.

    The bounds are the ``100 * alpha / 2`` and ``100 * (1 - alpha / 2)``
    percentiles of the values in ``stat``, returned as a length-2 array
    ``[lower, upper]``.
    """
    lower_pct = 100 * alpha / 2.0
    upper_pct = 100 * (1 - alpha / 2.0)
    return np.percentile(stat, [lower_pct, upper_pct])

In [72]:
# Bootstrap 95% interval for the median trip duration of the 100-row sample.
# get_bootstrap_samples draws from NumPy's global random state, so seed it to
# make the interval reproducible across runs.
np.random.seed(0)
# one median per bootstrap resample (each row of the returned array is a resample)
median_duration = np.median(get_bootstrap_samples(sample100['duration'].values, 1000), axis=1)
confidence_interval_100 = stat_intervals(median_duration, 0.05)
print('confidence_interval_100 = ', confidence_interval_100)

confidence_interval_100 =  [  8.   11.5]


In [63]:
# Bootstrap 95% interval for the median trip duration of the 10000-row sample.
# get_bootstrap_samples draws from NumPy's global random state, so seed it to
# make the interval reproducible across runs.
np.random.seed(0)
# one median per bootstrap resample (each row of the returned array is a resample)
median_duration = np.median(get_bootstrap_samples(sample10000['duration'].values, 1000), axis=1)
confidence_interval_10000 = stat_intervals(median_duration, 0.05)
print('confidence_interval_10000 = ', confidence_interval_10000)

confidence_interval_10000 =  [ 11.  11.]


### Here we can see that the confidence interval narrows sharply when the sample size is increased 100 times (from 100 to 10000); the square-root-of-N rule predicts a roughly 10-fold decrease in width. Note the parallel between the two approaches: for the mean, the confidence interval is calculated from the standard deviation; for the median, the confidence interval is calculated from bootstrap samples.

## sample estimates Q&A

Q: 
Using the data dictionary, check how many passengers in the sample paid for their ride with cash.

In [73]:
# load the 10000-row sample used by the Q&A cells below
df = pd.read_csv('sample10000.csv')


In [74]:
# preview the first rows to check the column layout
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-12-19 20:13:04,2016-12-19 20:16:58,1,0.87,1,N,238,142,1,5.0,0.5,0.5,1.5,0.0,0.3,7.8
1,1,2016-12-27 19:28:48,2016-12-27 19:30:31,1,0.1,1,N,231,144,2,3.0,1.0,0.5,0.0,0.0,0.3,4.8
2,1,2016-12-17 00:57:37,2016-12-17 01:10:37,3,2.3,1,N,90,233,1,11.0,0.5,0.5,2.45,0.0,0.3,14.75
3,1,2016-12-01 20:32:54,2016-12-01 20:52:17,1,5.0,1,N,13,143,1,18.5,0.5,0.5,3.95,0.0,0.3,23.75
4,2,2016-12-12 19:25:14,2016-12-12 19:34:15,2,1.43,1,N,164,79,1,7.5,1.0,0.5,1.0,0.0,0.3,10.3


In [80]:
## proportion
is_tipped = df.payment_type==2
ph = is_tipped.mean()
int(ph*10000)


3543

In [81]:
## proportion 95% confidence interval

# alpha = 0.01 gives a 99% confidence interval for the proportion
proportion_confint(is_tipped.sum(), len(is_tipped), alpha=0.01)

(0.34197979174644566, 0.36662020825355435)

In [87]:
# sample mean of the trip distance in the 10000-row sample
mu = df['trip_distance'].mean()

print(mu)



2.881264


In [88]:
# `sd` holds the standard error of the mean: sample std (ddof=1) over sqrt(n)
sd = df['trip_distance'].std(ddof=1) / np.sqrt(df['trip_distance'].size)
print(sd)
# 95% two-sided t-interval around `mu` with n - 1 degrees of freedom
_tconfint_generic(mu, sd, df['trip_distance'].size - 1, 0.05, 'two-sided')

0.0358392541139


(2.8110118487985973, 2.9515161512014023)