# Blacklane Dispatching Analysis

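This notebook explores Blacklane's pre-purchased shift, incoming booking, and historical auction data to inform dispatching decisions between pre-purchased shifts and the auction marketplace. It examines three hypotheses: demand patterns by day of week and hour of day, the cost of shifts versus auctions, and geographic clustering of pickup locations.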

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plotting style.
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases removed the bare 'seaborn' name, so requesting it unconditionally
# raises OSError on current installs. Use the new name when it is available and
# fall back to the legacy name on older matplotlib versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('viridis')

## 1. Load and Inspect Data

In [None]:
# Load the three raw CSV extracts (paths are relative to the notebook location).
shifts_df = pd.read_csv('../data/raw/disp_pre_purchased_shifts.csv')
bookings_df = pd.read_csv('../data/raw/disp_incoming_bookings.csv')
auction_df = pd.read_csv('../data/raw/disp_historical_auction_data.csv')

# Display basic information.
# DataFrame.info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after every summary.
for label, frame in (
    ("Shifts", shifts_df),
    ("Bookings", bookings_df),
    ("Auction", auction_df),
):
    print(f"\n{label} Data Info:")
    frame.info()

## 2. Data Preprocessing

In [None]:
# Parse the date/time columns that arrive as plain text from the CSV files.
shifts_df['shift_date'] = pd.to_datetime(shifts_df['shift_date'])
booking_start = pd.to_datetime(bookings_df['booked_start_at'])
bookings_df['booked_start_at'] = booking_start

# Derive calendar features from the booking start time (weekday name and hour).
bookings_df['day_of_week'] = booking_start.dt.day_name()
bookings_df['hour_of_day'] = booking_start.dt.hour

# Left-join auction records onto bookings so that every booking row is kept,
# including bookings that have no matching auction record.
merged_df = pd.merge(bookings_df, auction_df, how='left', on='booking_uuid')

## 3. Hypothesis 1: Demand Patterns Analysis

In [None]:
# Bookings per weekday, re-ordered Monday through Sunday
# (value_counts alone would sort by frequency).
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]
day_demand = bookings_df['day_of_week'].value_counts().reindex(weekday_order)

fig_day, ax_day = plt.subplots(figsize=(12, 6))
sns.barplot(x=day_demand.index, y=day_demand.values, ax=ax_day)
ax_day.set_title('Booking Demand by Day of Week')
ax_day.set_xlabel('Day of Week')
ax_day.set_ylabel('Number of Bookings')
ax_day.tick_params(axis='x', labelrotation=45)
fig_day.tight_layout()
plt.show()

# Bookings per hour of day, sorted by hour so the line reads left to right.
hour_demand = bookings_df['hour_of_day'].value_counts().sort_index()

fig_hour, ax_hour = plt.subplots(figsize=(12, 6))
sns.lineplot(x=hour_demand.index, y=hour_demand.values, marker='o', ax=ax_hour)
ax_hour.set_title('Booking Demand by Hour of Day')
ax_hour.set_xlabel('Hour of Day')
ax_hour.set_ylabel('Number of Bookings')
ax_hour.set_xticks(range(0, 24))  # one tick per hour
fig_hour.tight_layout()
plt.show()

## 4. Hypothesis 2: Cost Analysis

In [None]:
# Calculate shift costs: cost of each pre-purchased shift = hours * hourly rate.
shifts_df['total_shift_cost'] = shifts_df['shift_working_hours'] * shifts_df['hourly_rate_eur']
total_shift_cost = shifts_df['total_shift_cost'].sum()
avg_hourly_rate = shifts_df['hourly_rate_eur'].mean()

# Analyze auction prices
avg_auction_price = auction_df['auction_winning_price'].mean()
auction_price_std = auction_df['auction_winning_price'].std()

print(f"Total Pre-Purchased Shift Cost: €{total_shift_cost:.2f}")
print(f"Average Hourly Rate (Shifts): €{avg_hourly_rate:.2f}")
print(f"Average Auction Price: €{avg_auction_price:.2f} (±{auction_price_std:.2f})")

# Compare costs per booking.
# The shifts table is keyed by shift_id, so a chauffeur can appear on several
# rows. Joining bookings to it on chauffeur_uuid alone would repeat each booking
# once per shift of that chauffeur and inflate every downstream count and
# distribution. Collapse to a single row per chauffeur first (mean hourly rate;
# shift_id is kept only as a representative value so the column set is unchanged)
# and let pandas enforce the many-to-one relationship.
chauffeur_rates = (
    shifts_df
    .groupby('chauffeur_uuid', as_index=False)
    .agg(
        shift_id=('shift_id', 'first'),
        hourly_rate_eur=('hourly_rate_eur', 'mean'),
    )
)
bookings_with_costs = merged_df.merge(
    chauffeur_rates[['shift_id', 'chauffeur_uuid', 'hourly_rate_eur']],
    on='chauffeur_uuid',
    how='left',
    validate='many_to_one',
)

# Calculate the cost of serving each booking from a shift.
# NOTE(review): dividing by 60 assumes booked_duration is in minutes - confirm.
bookings_with_costs['shift_cost'] = (
    bookings_with_costs['booked_duration'] / 60 *
    bookings_with_costs['hourly_rate_eur']
)

# Plot cost distribution (rows with a missing cost/price are left out per series).
plt.figure(figsize=(12, 6))
plt.hist(
    [bookings_with_costs['shift_cost'].dropna(),
     bookings_with_costs['auction_winning_price'].dropna()],
    label=['Shift Cost', 'Auction Price'],
    bins=50,
    alpha=0.5
)
plt.title('Distribution of Costs: Shifts vs Auctions')
plt.xlabel('Cost (EUR)')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
plt.show()

## 5. Hypothesis 3: Geographic Analysis

In [None]:
from sklearn.cluster import KMeans

# Prepare location data: keep only rows where all four coordinates are present.
# .copy() makes this an independent frame so adding a column below cannot
# trigger a SettingWithCopyWarning or write through to bookings_with_costs.
locations = bookings_with_costs[[
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude'
]].dropna().copy()

# Cluster pickup locations.
# n_init is pinned because its default changed between scikit-learn releases;
# together with random_state this keeps the labels reproducible across versions.
pickup_coords = locations[['pickup_longitude', 'pickup_latitude']].values
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
locations['pickup_cluster'] = kmeans.fit_predict(pickup_coords)

# Analyze prices by cluster.
# The cluster labels live on `locations`, not on `bookings_with_costs`, so
# grouping the latter by 'pickup_cluster' directly raises KeyError. Select the
# clustered rows by index and attach their labels before aggregating.
clustered_bookings = bookings_with_costs.loc[locations.index].assign(
    pickup_cluster=locations['pickup_cluster']
)
cluster_prices = clustered_bookings.groupby('pickup_cluster').agg({
    'auction_winning_price': ['mean', 'std', 'count']
}).round(2)

print("\nAuction Prices by Location Cluster:")
print(cluster_prices)

# Visualize clusters: one point per booking pickup, coloured by cluster label.
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    locations['pickup_longitude'],
    locations['pickup_latitude'],
    c=locations['pickup_cluster'],
    cmap='viridis',
    alpha=0.6
)
plt.title('Pickup Location Clusters')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.colorbar(scatter)
plt.tight_layout()
plt.show()