In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Folder that is searched for the yellow_tripdata* parquet files when loading the data.
data_folder = 'data'

Load the trip data files and combine them into one dataframe.
Due to storage concerns, we sample a fixed number of rows from each file instead of keeping every row.

In [None]:
# Collect every yellow-taxi trip file in the data folder. Sorting makes the
# concatenation order deterministic, because glob order depends on the filesystem.
files = sorted(Path(data_folder).glob("yellow_tripdata*"))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No files matching 'yellow_tripdata*' found in {data_folder!r}")

num_sample = 1000000  # maximum number of rows kept from each file

dfs = []
for file in files:
    temp = pd.read_parquet(file)
    # A file may hold fewer rows than num_sample; DataFrame.sample raises a
    # ValueError when asked for more rows than exist (sampling without replacement).
    sample_size = min(num_sample, len(temp))
    print(f"{file} has {len(temp)} entries - sampling {sample_size} from them.")
    dfs.append(temp.sample(sample_size, random_state=42))

# ignore_index=True: every sample carries the row labels of its own file, which
# would otherwise repeat across files in the combined frame.
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Print pandas' summary of the combined frame (column names and dtypes).
df.info()

Data cleaning

In [None]:
# Apply all cleaning rules in a single pass. The explicit .copy() yields an
# independent frame, so adding columns to df afterwards does not trigger
# pandas' SettingWithCopyWarning, and no intermediate filtered frames are built.
df = df[
    (df['total_amount'] > 0)         # drop refunds/cancelled rides (total paid <= 0)
    & (df['tip_amount'] >= 0)        # drop other entries with negative tips
    & (df['tip_amount'] < 100)       # drop extreme outliers: tips of $100 or more
    & (df['trip_distance'] < 100)    # drop extreme outliers: trips of 100 miles or more
].copy()

Get the time of day, day of week, and season when the pickup occurred

In [None]:
# Coerce the pickup timestamps to pandas datetimes (required for the .dt accessor)
df['tpep_pickup_datetime'] = df['tpep_pickup_datetime'].pipe(pd.to_datetime)

# Define time of day
def get_time_of_day(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Hours in [5, 11) are "Morning", [11, 17) "Noon/Afternoon" and
    [17, 23) "Evening"; any other value is labelled "Night".
    """
    # (inclusive start, exclusive end, label), checked in order
    day_parts = (
        (5, 11, "Morning"),
        (11, 17, "Noon/Afternoon"),
        (17, 23, "Evening"),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return "Night"

# Label every trip with the part of the day in which the pickup happened
df['time_of_day'] = df['tpep_pickup_datetime'].dt.hour.map(get_time_of_day)

# Name of the weekday on which the pickup happened
df['day_of_week'] = df['tpep_pickup_datetime'].dt.day_name()

# Define seasons
def get_season(month):
    """Map a month number to a season name.

    December-February is "Winter", March-May "Spring", June-August "Summer";
    every other value is labelled "Fall".
    """
    season_months = (
        ("Winter", (12, 1, 2)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
    )
    # First season whose month tuple contains the value; "Fall" when none does
    return next((name for name, months in season_months if month in months), "Fall")

# Label every trip with the season of its pickup month
df['season'] = df['tpep_pickup_datetime'].dt.month.map(get_season)

In [None]:
# Preview the first rows, including the derived time_of_day, day_of_week and season columns.
df.head()

Save data into one parquet file

In [None]:
# Write the cleaned frame, with its derived columns, to a single parquet file
# (relative path, so it is created in the current working directory).
df.to_parquet("tripdata_combined.parquet")

Load data from parquet file

In [None]:
# Read the combined file back, then print a markdown preview and the frame's dimensions
df = pd.read_parquet("tripdata_combined.parquet")
print(df.head(5).to_markdown())
print(f"Shape of data {df.shape}")

Data visualization

In [None]:
# Violin plots for tipping amount with respect to times (with outliers)
# One figure per time grouping: (column, category order, x-axis label, title)
time_groupings = [
    ("time_of_day",
     ["Morning", "Noon/Afternoon", "Evening", "Night"],
     "Time of Day",
     "Tip Amount Variation by Time of Day"),
    ("day_of_week",
     ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
     "Day of Week",
     "Tip Amount Variation by Day of Week"),
    ("season",
     ["Winter", "Spring", "Summer", "Fall"],
     "Season",
     "Tip Amount Variation by Season"),
]

for column, categories, axis_label, title in time_groupings:
    plt.figure(figsize=(8, 5))
    sns.violinplot(x=column, y='tip_amount', data=df, order=categories)
    plt.xlabel(axis_label)
    plt.ylabel("Tip amount ($)")
    plt.title(title)

plt.show()

In [None]:
# Violin plots for tipping amount with respect to times (without outliers) (Caution: needs lots of RAM)

# Keep only the non-outlier entries (tips below 6.51)
temp = df[df['tip_amount'].lt(6.51)]

# One figure per time grouping: (column, category order, x-axis label, title)
time_groupings = [
    ("time_of_day",
     ["Morning", "Noon/Afternoon", "Evening", "Night"],
     "Time of Day",
     "Tip Amount Variation by Time of Day"),
    ("day_of_week",
     ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
     "Day of Week",
     "Tip Amount Variation by Day of Week"),
    ("season",
     ["Winter", "Spring", "Summer", "Fall"],
     "Season",
     "Tip Amount Variation by Season"),
]

for column, categories, axis_label, title in time_groupings:
    plt.figure(figsize=(8, 5))
    sns.violinplot(x=column, y='tip_amount', data=temp, order=categories)
    plt.xlabel(axis_label)
    plt.ylabel("Tip amount ($)")
    plt.title(title)

plt.show()

In [None]:
# Violin plots for tip amounts for credit vs cash payments

# payment_type codes compared here and the labels shown on the x axis
payment_labels = {1: 'Credit Card', 2: 'Cash'}
# Fixed category order: without it seaborn orders categories by first appearance
# in the data, so the two figures could show the payment types in different positions.
payment_order = list(payment_labels.values())

temp = df[df['payment_type'].isin(list(payment_labels))]  # Select entries paid in cash or credit

plt.figure(figsize=(8, 5))
sns.violinplot(x=temp['payment_type'].map(payment_labels), y='tip_amount', data=temp, order=payment_order)
plt.xlabel("Payment Type")
plt.ylabel("Tip amount ($)")
plt.title("Tip Amount for Credit vs Cash")

temp = temp[temp['tip_amount'] < 6.51]  # Non-outlier entries
plt.figure(figsize=(8, 5))
sns.violinplot(x=temp['payment_type'].map(payment_labels), y='tip_amount', data=temp, order=payment_order)
plt.xlabel("Payment Type")
plt.ylabel("Tip amount ($)")
plt.title("Tip Amount for Credit vs Cash (No outliers)")
plt.show()

In [None]:
# Summary statistics of tip_amount for trips with payment_type 1
df.loc[df['payment_type'] == 1, 'tip_amount'].describe()

In [None]:
# Summary statistics of tip_amount for trips with payment_type 2
df.loc[df['payment_type'] == 2, 'tip_amount'].describe()

In [None]:
# Flag trips with a positive tip (1 = tipped, 0 = no tip)
df['tipped'] = (df['tip_amount'] > 0).astype(int)

# Define distance bins: n_bins equal-width bins need n_bins + 1 edges
n_bins = 30
bin_edges = np.linspace(df['trip_distance'].min(), df['trip_distance'].max(), n_bins + 1)
df['distance_bin'] = pd.cut(df['trip_distance'], bins=bin_edges, include_lowest=True)

# Compute percentage of people who tipped in each bin.
# observed=False keeps bins that contain no trips (their mean is NaN, which leaves
# a gap in the line); it is spelled out because pandas is changing the default
# for categorical groupers.
tip_percentage = df.groupby('distance_bin', observed=False)['tipped'].mean() * 100
bin_centers = [interval.mid for interval in tip_percentage.index]  # Get bin centers for plotting

plt.figure(figsize=(8,5))
plt.plot(bin_centers, tip_percentage, marker='o', linestyle='-', color='blue')

plt.xlabel("Trip Distance")
plt.ylabel("Percentage of People Who Tipped (%)")
plt.title("Tipping Rate vs. Trip Distance")
plt.grid(True)
plt.show()

In [None]:
# Summary statistics of the trip_distance column.
df['trip_distance'].describe()