# Introduction


[![Run in Colab](https://pathway.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/18XRCD_bokB2IsaC6QJ8HpNwdTl8Rmud6?usp=sharing)

In [1]:
!pip install pathway bokeh --quiet # This cell may take a few seconds to execute.

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.6/77.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.6/777.6 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import datetime
import pathway as pw
import bokeh.plotting
import panel as pn

# Step 1: Importing and Preprocessing the Data

In [3]:
# Dataset supplied with the problem statement:
# https://drive.google.com/file/d/1RqHF3zphAFOtYZgReDJUxEFweOiVAxqP/view?usp=drive_link
# The first CSV column becomes the DataFrame index.
df = pd.read_csv(filepath_or_buffer='/content/dataset.csv', index_col=0)
df.head()

Unnamed: 0_level_0,SystemCodeNumber,Capacity,Latitude,Longitude,Occupancy,VehicleType,TrafficConditionNearby,QueueLength,IsSpecialDay,LastUpdatedDate,LastUpdatedTime
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,BHMBCCMKT01,577,26.144536,91.736172,61,car,low,1,0,04-10-2016,07:59:00
1,BHMBCCMKT01,577,26.144536,91.736172,64,car,low,1,0,04-10-2016,08:25:00
2,BHMBCCMKT01,577,26.144536,91.736172,80,car,low,2,0,04-10-2016,08:59:00
3,BHMBCCMKT01,577,26.144536,91.736172,107,car,low,2,0,04-10-2016,09:32:00
4,BHMBCCMKT01,577,26.144536,91.736172,150,bike,low,2,0,04-10-2016,09:59:00


In [4]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple
df.shape

(18368, 11)

## Even though the dataset has 11 columns, the base model uses only four fields: Timestamp (date and time combined), LocationID (latitude and longitude combined), Occupancy, and Capacity

In [5]:
# Build one datetime column from the separate date and time strings
# (the source uses day-month-year order, e.g. "04-10-2016 07:59:00")
df['Timestamp'] = pd.to_datetime(
    df['LastUpdatedDate'] + ' ' + df['LastUpdatedTime'],
    format='%d-%m-%Y %H:%M:%S',
)

# Build a string identifier per parking location as "<latitude>,<longitude>"
df['LocationID'] = (
    df['Latitude'].astype(str) + ',' + df['Longitude'].astype(str)
)

# Order rows chronologically and renumber the index from 0
df = df.sort_values(by='Timestamp', ignore_index=True)

In [6]:
# Distinct location identifiers; passed as the colour-map factors when plotting prices per location
LOCATIONS = df["LocationID"].unique()

In [7]:
# Write only the fields the streaming pipeline reads; this file is replayed as a stream afterwards
df.to_csv(
    "parking_stream.csv",
    columns=["Timestamp", "Occupancy", "Capacity", "LocationID"],
    index=False,
)


In [8]:
# Schema of one row of the simulated stream read from "parking_stream.csv".
# The column names mirror the header of the CSV written in the export step.

class ParkingSchema(pw.Schema):
    Timestamp: str   # Observation time kept as text; parsed downstream with "%Y-%m-%d %H:%M:%S"
    Occupancy: int   # Number of occupied parking spots
    Capacity: int    # Total parking capacity at the location
    LocationID: str  # "<latitude>,<longitude>" string identifying the parking location

In [9]:
# Replay the exported CSV as a simulated real-time stream.
# input_rate sets how many rows are emitted per second (about 1000 here).

data = pw.demo.replay_csv(
    "parking_stream.csv",
    schema=ParkingSchema,
    input_rate=1000,
)

In [10]:
# strptime pattern for the 'Timestamp' text column (year-month-day hour:minute:second)
fmt = "%Y-%m-%d %H:%M:%S"

# Derive two columns from the raw timestamp text:
# - 't'   : the full parsed datetime
# - 'day' : the same instant formatted as a string with the time set to midnight
# NOTE: the next cell rebinds `data_with_time` and does not carry the 'day' column forward.
data_with_time = data.with_columns(
    t=pw.this.Timestamp.dt.strptime(fmt),
    day=pw.this.Timestamp.dt.strptime(fmt).dt.strftime("%Y-%m-%dT00:00:00"),
)


In [11]:

import datetime  # rebinds `datetime` to the module; needed for datetime.timedelta below

BASE_PRICE = 10   # price when the average occupancy ratio is zero
ALPHA = 2         # weight of the average occupancy ratio in the price

# Per-observation features: parsed event time and occupancy as a fraction of capacity
data_with_time = data.with_columns(
    t=pw.this.Timestamp.dt.strptime(fmt),
    occ_ratio=pw.this.Occupancy / pw.this.Capacity,
)

# Group observations into non-overlapping one-day windows, separately per LocationID
_daily_windows = data_with_time.windowby(
    pw.this.t,  # event time used to assign rows to windows
    window=pw.temporal.tumbling(datetime.timedelta(days=1)),
    instance=pw.this.LocationID,
    behavior=pw.temporal.exactly_once_behavior(),
)

# Collapse every (location, day) window into a single summary row
_daily_stats = _daily_windows.reduce(
    t=pw.this._pw_window_end,                  # end of the window
    LocationID=pw.this._pw_instance,           # location the window belongs to
    window_start=pw.reducers.min(pw.this.t),   # earliest observation time seen in the window
    avg_occ_ratio=pw.reducers.avg(pw.this.occ_ratio),
)

# Model 1: the daily price grows linearly with the window's average occupancy ratio
daily_price = _daily_stats.with_columns(
    price=BASE_PRICE + ALPHA * pw.this.avg_occ_ratio,
)

# Model 1
# Daily demand-responsive pricing
# price = BASE_PRICE + ALPHA × (average occupancy ratio over the day)
# Because the price depends on the averaged, normalised occupancy of the day's window, it changes smoothly as occupancy changes.



 ## This model assumes that occupancy directly reflects real demand, so prices adjust daily according to how full each location is. The relationship is linear: a higher average occupancy ratio means a higher price. It does not factor in competitor prices, only the location's own usage, and it has no explicit caps or floors beyond the base price. With BASE_PRICE = 10 and ALPHA = 2, the price stays between 10 and 12 as long as occupancy does not exceed capacity, so it never goes beyond 2x the base price.


In [None]:
import panel as pn
import bokeh.plotting
from bokeh.palettes import Category20
from bokeh.transform import factor_cmap

pn.extension()


def price_plotter(source):
    """Build the Bokeh figure of daily price over time, one colour per location.

    Args:
        source: Data source handed to ``scatter``; it must expose the
            ``t``, ``price`` and ``LocationID`` columns referenced below.

    Returns:
        The configured Bokeh figure.
    """
    # Map every LocationID value to a colour from the 20-colour Category20 palette
    location_colors = factor_cmap(
        "LocationID",
        palette=Category20[20],
        factors=LOCATIONS,
    )

    plot = bokeh.plotting.figure(
        title="Pathway: Daily Parking Price by Location",
        x_axis_type="datetime",
        width=800,
        height=500,
        tools="pan,wheel_zoom,box_zoom,reset,hover",
    )

    # Scatter of price against time; the legend is grouped by the LocationID column
    plot.scatter(
        "t",
        "price",
        source=source,
        size=6,
        color=location_colors,
        legend_field="LocationID",
    )

    plot.xaxis.axis_label = "Timestamp"
    plot.yaxis.axis_label = "Price"

    plot.legend.location = "top_left"
    plot.legend.title = "LocationID"

    return plot

# viz = daily_price.plot(price_plotter, sorting_col="t")  # uncomment this line before running the cell

# pn.Column(viz).servable()  # uncomment this line before running the cell


In [13]:
# Start the Pathway computation so the streaming pipeline defined in the earlier cells executes
pw.run()