In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_taxi_data(filename):
    return (pd.read_csv(filename,
                       usecols=["tpep_pickup_datetime",
                               "passenger_count",
                               "trip_distance",
                               "fare_amount",
                               "extra",
                               "mta_tax",
                               "tip_amount",
                               "tolls_amount",
                               "improvement_surcharge",
                               "total_amount",
                               "congestion_surcharge"]))

In [3]:
filenames = ["/Users/ahmedabukar/Downloads/:/data/nyc_taxi_2019-01.csv",
            "/Users/ahmedabukar/Downloads/:/data/nyc_taxi_2019-07.csv"]
all_dfs = [load_taxi_data(filename) for filename in filenames]

In [4]:
df = pd.concat(all_dfs)
df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [5]:
# create a new column pre_tip_amount?
df["pre_tip_amount"] = (
    df[["fare_amount",
   "mta_tax",
   "extra",
   "tolls_amount",
   "improvement_surcharge",
   "congestion_surcharge"]]
    .sum(axis="columns")
)

In [6]:
# create a new column, tip_percentage, showing the percentage pre_tip_amount that the tip was?
df["tip_percentage"] = (df["tip_amount"] / df["pre_tip_amount"])

In [7]:
df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])

In [8]:
# what was the mean tip percentage across all trips in the data set?
df["tip_percentage"].mean()

0.13003974566357937

In [17]:
# how many times did people tip more than the pre_tip amount?
(df["tip_percentage"] == 0).value_counts(normalize=True)

tip_percentage
False    0.67923
True     0.32077
Name: proportion, dtype: float64

In [10]:
# on which day of the week do people tip greatest percentage of the fare, 
# on average?
(
    df
    .groupby(df["tpep_pickup_datetime"].dt.day_name())
    ["tip_percentage"]
    .mean()
    .sort_values(ascending=False)
)

tpep_pickup_datetime
Thursday     0.133970
Wednesday    0.132221
Tuesday      0.131424
Friday       0.129136
Monday       0.128723
Sunday       0.126634
Saturday     0.125801
Name: tip_percentage, dtype: float64

In [11]:
# on which hour do people tip greatest percentage?
(
    df
    .groupby(df["tpep_pickup_datetime"].dt.hour)
    ["tip_percentage"]
    .mean()
    .sort_values(ascending=False)
    * 100
)

tpep_pickup_datetime
22    13.881613
20    13.816004
21    13.768452
8     13.711560
19    13.517362
23    13.497844
18    13.329193
9     13.301671
7     13.213395
0     13.149003
2     13.091431
1     13.070956
17    12.863954
10    12.719951
11    12.502223
16    12.465461
13    12.456654
12    12.437567
14    12.372725
15    12.354692
3     12.105323
6     11.991455
4     11.898723
5     11.202766
Name: tip_percentage, dtype: float64

In [12]:
# do people typically tip more in January or July?
(
    df
    .groupby(df["tpep_pickup_datetime"].dt.month)
    ["tip_percentage"]
    .mean()
    .sort_values(ascending=False)
    * 100
)

tpep_pickup_datetime
5     20.000000
8     15.809872
3     14.804581
9     14.143074
1     13.701105
2     13.222417
7     12.157036
12    10.936669
6     10.735443
10    10.000000
4      7.487685
11     4.602649
Name: tip_percentage, dtype: float64

In [13]:
# what was the 1-day period in our data set when people tipped the greatest percentage?
(
    df
    .groupby(df["tpep_pickup_datetime"].dt.day_name())
    ["tip_percentage"]
    .max()
    .sort_values(ascending=False)
)

tpep_pickup_datetime
Saturday     733.333333
Wednesday    700.000000
Tuesday      466.666667
Sunday       433.333333
Thursday     400.000000
Friday       350.000000
Monday       163.333333
Name: tip_percentage, dtype: float64

In [19]:
# we say 32% of the riders don't tip, of those who do, what percentage do they tip on average?
(
    df.loc[df["tip_percentage"] != 0, "tip_percentage"]
    .mean()
)

0.1914711859863634

In [37]:
# how many of the rides in the data set, supposedly from january and July 2019
# are outside from those dates

len(df) - len(
    pd.concat([
        df.set_index("tpep_pickup_datetime").sort_index()
        .loc["2019-01-01" : "2019-01-31"],
        df.set_index("tpep_pickup_datetime").sort_index()
        .loc["2019-07-01" : "2019-07-31"]
    ]).index
    
) 

816

In [39]:
df = df.set_index("tpep_pickup_datetime")

In [41]:
df = df.sort_index()

In [49]:
# looking only at dates of January and July 2019, 
# in what week did passengers tip the greatest percentage?
(
    pd.concat(
        [
            df.loc["2019-01-01" : "2019-01-31"],
            df.loc["2019-07-01" : "2019-07-31"]
        ]
    )
    .resample("W")
    ["tip_percentage"]
    .mean()
    .dropna()
    .sort_values(ascending=False) * 100
)

tpep_pickup_datetime
2019-02-03    14.197924
2019-01-27    13.892965
2019-01-20    13.853614
2019-01-13    13.790113
2019-01-06    12.698336
2019-08-04    12.491016
2019-07-14    12.345926
2019-07-21    12.334080
2019-07-28    12.303639
2019-07-07    11.295174
Name: tip_percentage, dtype: float64