# Retrieve the feature set for batch inference

In [None]:
%run feature_set_retrieval

## Prepare January 2023 NYC yellow taxi trip and weather data for inference

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Weather feature columns the scaler is fitted on.
weather_feature_columns = [
    "temperature_2m_c",
    "windspeed_10m_km_per_hour",
    "precipitation_mm",
    "cloudcover_percentage",
]

# Prepare the weather data scaler from the features retrieved from the feature store.
# The conversion is guarded so that re-running this cell (when nycweather_df has already
# been replaced by its pandas version) does not fail on a missing `toPandas` attribute.
# NOTE(review): nycweather_df is not defined in this notebook; presumably it is the Spark
# DataFrame produced by the feature set retrieval step - confirm.
if hasattr(nycweather_df, "toPandas"):
    nycweather_df = nycweather_df.toPandas()

# Keep only the 2022 rows and fit the min-max scaler on their weather features.
nycweather_df = nycweather_df[nycweather_df["year"] == 2022]
nycweather_scaler = MinMaxScaler().fit(nycweather_df[weather_feature_columns])


In [None]:
import pandas as pd
import numpy as np

# Raw weather columns fed to the scaler, and the names given to their scaled counterparts
# (both lists are in the same order).
weather_input_columns = [
    "temperature_2m_c",
    "windspeed_10m_km_per_hour",
    "precipitation_mm",
    "cloudcover_percentage",
]
weather_scaled_columns = [
    "scaled_temperature",
    "scaled_windspeed",
    "scaled_precipitation",
    "scaled_cloudcover",
]

# Load the January 2023 NYC weather file from the landing zone
weather_2023_jan_df = pd.read_csv("/lakehouse/default/Files/01_landing/nyc_weather_2023-01.csv")

# Parse the time column once, then derive the month, day and hour columns from it
weather_time = pd.to_datetime(weather_2023_jan_df["time"])
weather_2023_jan_df["month"] = weather_time.dt.month.astype(np.uint8)
weather_2023_jan_df["day"] = weather_time.dt.day.astype(np.uint8)
weather_2023_jan_df["hour"] = weather_time.dt.hour.astype(np.uint8)

# Scale the weather columns with the scaler fitted in the previous cell, so that the values
# are transformed with the same min/max as the data the scaler was fitted on
weather_2023_jan_df[weather_scaled_columns] = nycweather_scaler.transform(
    weather_2023_jan_df[weather_input_columns])

weather_2023_jan_df.head()


In [None]:
# Load the January 2023 NYC yellow taxi trip data from the landing zone
taxi_2023_jan_df = pd.read_parquet("/lakehouse/default/Files/01_landing/yellow_tripdata_2023-01.parquet")

# Convert the pickup timestamp once, then derive the month_pickup, weekday_pickup,
# day_pickup and hour_pickup columns from it
# NOTE(review): rows are not filtered by pickup year or month here; confirm the file only
# contains January 2023 pickups, otherwise stray dates end up in the derived columns.
pickup_time = pd.to_datetime(taxi_2023_jan_df["tpep_pickup_datetime"])
taxi_2023_jan_df["month_pickup"] = pickup_time.dt.month.astype(np.uint8)
taxi_2023_jan_df["weekday_pickup"] = pickup_time.dt.weekday.astype(np.uint8)
taxi_2023_jan_df["day_pickup"] = pickup_time.dt.day.astype(np.uint8)
taxi_2023_jan_df["hour_pickup"] = pickup_time.dt.hour.astype(np.uint8)

taxi_2023_jan_df.head()


In [None]:
# Load location zones data from the standardization zone
zones_df = pd.read_parquet("/lakehouse/default/Files/03_standard/nyc_zones.parquet")

# Encode the Borough column: each borough gets the position of its first appearance in zones_df
# NOTE(review): these ids depend on the row order of nyc_zones.parquet; confirm they match
# the encoding used when the model was trained.
borough_array = zones_df["Borough"].unique()
borough_to_id = {borough: position for position, borough in enumerate(borough_array)}
zones_df["borough_id"] = [borough_to_id[borough] for borough in zones_df["Borough"]]

# Join location zones data with taxi trip data on the pickup location.
# Zone columns left over from a previous run of this cell are dropped first, so re-running
# the cell does not fail with an "columns overlap" error from DataFrame.join.
zone_lookup_df = zones_df.set_index("LocationID")
taxi_2023_jan_df = (
    taxi_2023_jan_df
    .drop(columns=zone_lookup_df.columns, errors="ignore")
    .join(zone_lookup_df, on="PULocationID")
)

# Calculate the taxi pickup demand (number of trips) per hour_pickup, day_pickup,
# weekday_pickup, month_pickup and borough_id combination.
# Trips whose pickup location has no matching zone get a missing borough_id and are left out
# by groupby's default handling of missing keys.
aggregated_df = taxi_2023_jan_df.groupby(["hour_pickup", "day_pickup", "weekday_pickup", "month_pickup", "borough_id"]).agg(
    demand=('hour_pickup', 'size')
).reset_index()

aggregated_df.head()


In [None]:
# Attach the hourly weather record to each aggregated demand row; the inner join keeps only
# demand rows whose (month, day, hour) has a matching weather row
weather_by_hour_df = weather_2023_jan_df.set_index(["month", "day", "hour"])
test_df = aggregated_df.join(
    weather_by_hour_df,
    on=["month_pickup", "day_pickup", "hour_pickup"],
    how="inner",
)

# Columns kept for inference, in their final order, followed by the actual demand
# NOTE(review): presumably this is the column order the model expects - confirm.
test_columns = [
    'borough_id',
    'hour_pickup',
    'day_pickup',
    'weekday_pickup',
    'month_pickup',
    'scaled_temperature',
    'scaled_windspeed',
    'scaled_precipitation',
    'scaled_cloudcover',
    'demand',
]
sort_keys = ["month_pickup", "day_pickup", "weekday_pickup", "hour_pickup", "borough_id"]

# Sort in descending order and keep the top 30 rows, i.e. the rows with the largest
# month/day/weekday/hour/borough_id keys
test_df = (
    test_df[test_columns]
    .sort_values(by=sort_keys, ascending=False)
    .head(30)
)
test_df.columns


# Batch inference

In [None]:
import mlflow
from synapse.ml.predict import MLFlowTransformer

# Version of the registered model to score with; pick one whose evaluation metrics are satisfactory
model_version = 1

# Convert the pandas test set to a Spark DataFrame and remove the actual demand,
# leaving only the feature columns
spark_test_df = spark.createDataFrame(test_df).drop("demand")

# Wrap the registered model as a transformer that reads every remaining column as input
# and writes its prediction to a `demand` column
model = MLFlowTransformer(
    modelName='demand_prediction_model',
    modelVersion=model_version,
    inputCols=spark_test_df.columns,
    outputCol='demand',
)
batch_predictions = model.transform(spark_test_df)
display(batch_predictions)


In [None]:
import matplotlib.pyplot as plt

# Bring the predicted column back as a 1-D pandas Series instead of handing matplotlib a
# list of Spark Row objects.
# NOTE(review): the comparison assumes batch_predictions returns its rows in the same order
# as test_df; confirm, or join on the key columns before plotting.
predicted_demand = batch_predictions.select("demand").toPandas()["demand"]

# Plot line chart to compare actual and predicted values
fig, ax = plt.subplots()
ax.plot(test_df["demand"].values, label='actual')
ax.plot(predicted_demand.values, label='predicted')
ax.set_title("actual vs. predicted")
ax.set_xlabel("test row")
ax.set_ylabel("demand")
ax.legend(loc='upper right')
plt.show()
