## Outreachy-Bokeh Microtask for May 2023 cohort by Isaiah Akorita

### Objective of task

The purpose of this task is to perform exploratory data analysis on the New York City taxi trip dataset for a specified time period using the visualisation tools in the Bokeh Library. The source for the data file can be found [here](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 

In [1]:
#Import all necessary libraries
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import FactorRange
from math import pi
from bokeh.palettes import Category20, Viridis256
from bokeh.transform import cumsum, linear_cmap
from bokeh.models import NumeralTickFormatter

In [2]:
#Load output_notebook to generate plots inline
output_notebook()

### Data loading and transformation

In [3]:
#Load data file.
file = r"yellow_tripdata_2022-11.parquet"
df = pd.read_parquet(file)

#Load a random sample of the data file up to 10 rows 
df.sample(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
1079857,2,2022-11-10 07:09:27,2022-11-10 07:22:52,1.0,2.61,1.0,N,236,143,1,11.5,0.0,0.5,1.0,0.0,0.3,15.8,2.5,0.0
1192560,2,2022-11-10 23:48:18,2022-11-10 23:53:09,1.0,0.71,1.0,N,237,229,1,5.0,0.5,0.5,1.76,0.0,0.3,10.56,2.5,0.0
1516655,2,2022-11-13 18:15:14,2022-11-13 18:28:02,1.0,2.85,1.0,N,161,263,1,11.5,0.0,0.5,2.96,0.0,0.3,17.76,2.5,0.0
1567713,2,2022-11-14 12:28:48,2022-11-14 12:33:27,1.0,0.55,1.0,N,68,68,2,5.0,0.0,0.5,0.0,0.0,0.3,8.3,2.5,0.0
310388,2,2022-11-03 17:45:47,2022-11-03 18:02:46,1.0,1.76,1.0,N,163,234,2,11.5,1.0,0.5,0.0,0.0,0.3,15.8,2.5,0.0
2220225,2,2022-11-21 17:30:03,2022-11-21 17:45:40,1.0,1.36,1.0,N,161,163,1,10.5,1.0,0.5,1.0,0.0,0.3,15.8,2.5,0.0
2769179,2,2022-11-27 16:47:14,2022-11-27 16:57:06,1.0,1.43,1.0,N,163,237,1,8.5,0.0,0.5,2.36,0.0,0.3,14.16,2.5,0.0
1161401,2,2022-11-10 19:07:56,2022-11-10 19:36:53,1.0,7.06,1.0,N,161,13,1,26.5,1.0,0.5,4.62,0.0,0.3,35.42,2.5,0.0
2202387,2,2022-11-21 15:30:40,2022-11-21 15:37:41,2.0,1.08,1.0,N,163,100,2,6.5,0.0,0.5,0.0,0.0,0.3,9.8,2.5,0.0
1915805,2,2022-11-18 11:11:08,2022-11-18 11:32:32,1.0,1.53,1.0,N,230,143,1,13.5,0.0,0.5,3.36,0.0,0.3,20.16,2.5,0.0


In [4]:
#Check the columns data types
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [5]:
# Write a function that cleans up the data
def tweak_df(df):
    """ Clean up the dataframe by doing the following:
    
    1. Replace all blank passenger_counts column with zero (0).
    2. Replace all blank RatecodeID columns with five(5) for "Unknown".
    3. Convert the passenger_counts, RatecodeID, VendorID and payment_type 
        column to "uint8" to save memory space.
    4. Add a trip_duration column that is the difference 
        between pickup and dropoff times.
    5. Drop all rows where the trip duration is less than one second.
    
    Args:
        df(pandas dataframe): This is the dataframe from the file used here.
        
    Returns:
        df(pandas dataframe): The cleaned up dataframe.  
    """
    df = df.assign(
        passenger_count=lambda df: df.passenger_count.fillna(0).astype("uint8"),
        payment_type=lambda df: df.payment_type.astype("uint8"),
        VendorID=lambda df: df.VendorID.astype("uint8"),
        RatecodeID=lambda df: df.RatecodeID.fillna(5).astype("uint8"),
        trip_duration=lambda df: (df.tpep_dropoff_datetime - df.tpep_pickup_datetime)
    )
    duration_filtered=df[df["trip_duration"].dt.total_seconds() < 1.0]
    df=df.drop(duration_filtered.index)
    return df

In [6]:
#Check cleaned-up data types
new_df = tweak_df(df)

new_df.dtypes

VendorID                           uint8
tpep_pickup_datetime      datetime64[ns]
tpep_dropoff_datetime     datetime64[ns]
passenger_count                    uint8
trip_distance                    float64
RatecodeID                         uint8
store_and_fwd_flag                object
PULocationID                       int64
DOLocationID                       int64
payment_type                       uint8
fare_amount                      float64
extra                            float64
mta_tax                          float64
tip_amount                       float64
tolls_amount                     float64
improvement_surcharge            float64
total_amount                     float64
congestion_surcharge             float64
airport_fee                      float64
trip_duration            timedelta64[ns]
dtype: object

## Data insights and visualisation

### Question 1: What are the most lucrative days of the week?

In [7]:
#Group by trip days and calculate the total amount made by weekday.
trip_days = new_df.tpep_pickup_datetime.dt.day_name()
day_total = pd.DataFrame(new_df.groupby(trip_days).total_amount.sum())

#Sort the grouped data by weekday
weekday_cat = pd.CategoricalDtype(
    categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
    ordered=True)
day_total.index = day_total.index.astype(weekday_cat)
day_total = day_total.sort_values("tpep_pickup_datetime")
day_total = day_total.reset_index()

day_total

Unnamed: 0,tpep_pickup_datetime,total_amount
0,Monday,9236579.29
1,Tuesday,12344706.81
2,Wednesday,12542277.8
3,Thursday,9547948.45
4,Friday,9428629.68
5,Saturday,9664645.22
6,Sunday,8931626.15


In [8]:
#Plot a bar chart showing the amount made by weekday 
days = day_total["tpep_pickup_datetime"]
amount = day_total["total_amount"]

p = figure(title="Total amount made by weekday",
           height=300,
           sizing_mode="stretch_width",
           x_range=FactorRange(factors=days),
           y_axis_label="Amount (million $)",
           tools="hover",
           tooltips="$@top M",
           toolbar_location=None,
          )

p.vbar(x=days,
       top=(amount/10**6),
       width=0.9)
p.y_range.start = 5

         

show(p)

### Key Observation:
- Tuesdays and Wednesdays were the most lucrative days of the week for the drivers.
- Sundays are the least lucrative days of the week for the drivers

### Question 2: How long is the average trip duration?

In [9]:
#Insights on trip duration
new_df.trip_duration.describe()

count                      3249767
mean     0 days 00:18:08.630485508
std      0 days 00:48:57.664849398
min                0 days 00:00:01
25%                0 days 00:07:51
50%                0 days 00:13:02
75%                0 days 00:21:05
max                3 days 20:55:10
Name: trip_duration, dtype: object

### Key Observation:
- Average trip duration is 18 minutes.
- Longest trip duration is 3 days and ~21 hours
- The median trip duration is 13 minutes.
- The significant difference between the median and the average trip length could be due to huge outliers like the rare long trip duration.


### Question 3: What are the busiest periods of the day?

In [10]:
#Sort the pickup times by hour of the day
pick_up_hour = new_df.tpep_pickup_datetime.dt.hour

# Group the hour values into four general periods of the day
conditions = [pick_up_hour < 5, (pick_up_hour>=5) & (pick_up_hour<12), (pick_up_hour>=12) & (pick_up_hour<=18), pick_up_hour>18]
values = ["Midnight", "Morning", "Afternoon", "Evening"]

result = np.select(conditions, values)

pick_up_period = pd.Series(result)

#Generate a pandas series for the pickup hour by frequency
pick_up_data = pick_up_period.value_counts()
pick_up_data = pick_up_data.reset_index(name="Frequency").rename(columns={"index":"Hour"})

#Add columns for plot angle and color
pick_up_data["angle"] = pick_up_data["Frequency"]/pick_up_data["Frequency"].sum()*2*pi
pick_up_data["color"] = Category20[len(pick_up_data)]

#Plot a pie chart for the pickup data showing the frequency.
p = figure(height=300,
           title="Pickup Frequency by period of day",
           toolbar_location=None,
           tools="hover",
           tooltips="@Hour: @Frequency",
           x_range=(-0.5, 1.0),
          sizing_mode="stretch_width")

p.wedge(x=0,
        y=1,
        radius=0.35,
        start_angle=cumsum("angle", include_zero=True),
        end_angle=cumsum("angle"),
        line_color=None,
        fill_color="color",
        legend_field="Hour",
        source=pick_up_data)

p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None

show(p)

### Key Observation:
- Afternoons and Evenings are the busiest periods of the day.

### Question 4: When is the best time to receive a tip?

In [11]:
#Generate total tip amount data by time of day
tip_data = new_df.groupby(pick_up_period).tip_amount.sum()
tip_data = tip_data.reset_index(name="Amount").rename(columns={"index":"Period"})

#Add columns for plot angle and color
tip_data["angle"] = tip_data["Amount"]/tip_data["Amount"].sum() * 2*pi
tip_data["color"] = Category20[len(tip_data)]

tip_data

Unnamed: 0,Period,Amount,angle,color
0,Afternoon,4068015.19,2.765325,#1f77b4
1,Evening,2480272.17,1.686021,#aec7e8
2,Midnight,601632.35,0.408973,#ff7f0e
3,Morning,2093151.94,1.422867,#ffbb78


In [12]:
#Plot a pie chart showing tip amounts by period of day
p = figure(height=300,
           title="Total tip amount by period of day",
           toolbar_location=None,
           x_range=(-0.5, 1.0),
          sizing_mode="stretch_width")

p.wedge(x=0,
        y=1,
        radius=0.35,
        start_angle=cumsum("angle", include_zero=True),
        end_angle=cumsum("angle"),
        line_color=None,
        fill_color="color",
        legend_field="Period",
        source=tip_data)

p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None

show(p)

### Key Observation:
- The best times for getting tips are afternoons and evenings and this corresponds with the busiest times of the day.

### Question 5: How much does the average passenger tip?

In [13]:
#Insights on tip amount
new_df.tip_amount.describe()

count    3.249767e+06
mean     2.847096e+00
std      3.312256e+00
min     -8.160000e+01
25%      1.000000e+00
50%      2.220000e+00
75%      3.500000e+00
max      3.330000e+02
Name: tip_amount, dtype: float64

### Key Observation:

- The average tip amount is ~$3

- The highest tip amount is $333


### Question 6: What are the rush hour times?

In [14]:
#Insights on congestion times.

surcharge_by_hour = new_df.groupby(pick_up_hour).congestion_surcharge.sum()
x = surcharge_by_hour.index
y = surcharge_by_hour


p = figure(title="Surcharge by hour of day",
           height = 300,
          x_axis_label="Time of day",
          y_axis_label="Total amount",
          tools="hover",
          tooltips="@x.00hrs",
          toolbar_location=None,
          sizing_mode="stretch_width")

p.line(x,y, legend_label="Surcharge", line_width=2)
p.legend.location="top_left"
p.yaxis.formatter = NumeralTickFormatter(format="0,0")


show(p)

### Key Observations:
- The congestion surcharge gradually increases as the day goes by and peaks at around 6 pm, corresponding to rush hour when most people finish work and causing heavy traffic on the roads. It then gradually declines until about 5 am.