In [1]:
import polars as pl

In [27]:
# Eagerly load the cab-trip CSV, reading no more than the first 10 million rows.
eager_df = pl.read_csv(
    source="../nyc_cab_data.csv",
    n_rows=10_000_000,
)

In [28]:
# Data manipulation, applied in this order:
#   1. Cast every column whose name contains "_datetime" to Datetime
#      (a selector picks the columns by name instead of listing them).
#   2. Drop rows that contain null values.
#   3. Sort the remaining rows by "request_datetime".
# NOTE(review): drop_nulls() is called without a subset, so a null in *any*
# column removes the row -- confirm that is the intended behaviour.
from polars import datatypes as dt

eager_df = (
    eager_df
    .with_columns(pl.selectors.contains("_datetime").cast(dt.Datetime))
    .drop_nulls()
    .sort("request_datetime")
)

In [31]:
# Insights, computed per "hvfhs_license_num" group:
#   1. Average trip time
#   2. Average base passenger fare and average tips
#
# group_by() does not guarantee the order in which groups are returned, so the
# result is sorted by the grouping key; this keeps the summary table identical
# across kernel restarts and re-runs.
# NOTE(review): the output column names mix styles ("avg_time" vs "avg bpf" /
# "avg tips"); they are left unchanged here because later cells may refer to them.
analysis = (
    eager_df.group_by("hvfhs_license_num")
    .agg(
        pl.col("trip_time").mean().alias("avg_time"),
        pl.col("base_passenger_fare").mean().alias("avg bpf"),
        pl.col("tips").mean().alias("avg tips"),
    )
    .sort("hvfhs_license_num")
)


In [36]:
# 3. The five days with the highest number of trips.
#    Rows are grouped by the date part of "request_datetime"; "trip_counts"
#    holds the number of non-null request timestamps in each group.
date_filter = (
    eager_df
    .group_by(pl.col("request_datetime").cast(dt.Date))
    .agg(pl.col("request_datetime").count().alias("trip_counts"))
    .sort("trip_counts", descending=True)
    .head(5)
)