In [1]:
import polars as pl
from great_tables import GT, md, html, nanoplot_options
from colorzero import *
from IPython.display import display, HTML

In [2]:
# CSV with the taxi trip records, addressed relative to the notebook's working directory.
_file_location = './limited-memory-example/yellow_tripdata_2015-01.csv'
# Raw trips frame; bw, bw2 and bh are all derived from it.
df = pl.read_csv(source=_file_location)

In [3]:
# Weekly roll-up: mean passenger count, mean trip distance and the mean of
# every fare component, one row per 1-week window of pickup time.
bw = (
    df
    # Order rows by the raw pickup timestamp strings before windowing.
    .sort('tpep_pickup_datetime')
    .group_by_dynamic(
        # The pickup column is text in the CSV; parse it to build the window key.
        pl.col('tpep_pickup_datetime').str.to_datetime('%Y-%m-%d %H:%M:%S'),
        every='1w',
    )
    .agg(
        pl.col('passenger_count').mean().alias('average_passenger_count'),
        pl.col('trip_distance').mean().alias('average_trip_distance'),
        # Fare components keep their original column names.
        pl.col(
            'fare_amount',
            'extra',
            'mta_tax',
            'tip_amount',
            'tolls_amount',
            'total_amount',
        ).mean(),
    )
    # The window key becomes the 'week' label used as join key and row stub later.
    .rename({'tpep_pickup_datetime': 'week'})
)

In [4]:
# Daily passenger totals, re-bucketed so that each weekly row carries the list
# of that week's daily totals (the series plotted in gt1's nanoplot column).
bw2 = (
    df
    # Order rows by the raw pickup timestamp strings before windowing.
    .sort(pl.col('tpep_pickup_datetime'), descending=False)
    # Stage 1: total passengers per 1-day window.
    .group_by_dynamic(pl.col('tpep_pickup_datetime').str.to_datetime('%Y-%m-%d %H:%M:%S'), every="1d")
    .agg(pl.col('passenger_count').sum().alias('passenger_count_daily'))
    .rename(mapping={'tpep_pickup_datetime':'day'})
    .sort(pl.col('day'), descending=False)
    # Stage 2: gather the daily totals of each 1-week window.
    .group_by_dynamic(pl.col('day'), every="1w")
    # A bare column inside agg() already yields one list per group, so no
    # implode() + list.get(0) wrap/unwrap round-trip is needed.
    .agg(pl.col('passenger_count_daily'))
    # Same key name as bw so the two frames can be joined on 'week'.
    .rename(mapping={'day':'week'})
    )

In [5]:
# Presentation table: the weekly averages (bw) joined with the per-week lists
# of daily passenger totals (bw2), formatted with Great Tables.
gt1 = (
    bw.join(bw2, how='inner', on='week').style
    .tab_header(
        title='NYC Taxi dataset',
        subtitle="Weekly statistics between Dec 29, 2014 to Jan 26, 2015",
    )
    # The week start labels the rows instead of being a regular data column.
    .tab_stub(rowname_col='week')
    .tab_source_note(
        source_note='Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration'
    )
    # Column groups shown above the individual column labels.
    .tab_spanner(
        label='Averages',
        columns=['average_passenger_count', 'average_trip_distance'],
    )
    .tab_spanner(
        label='Avg. Costs',
        columns=['fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount'],
    )
    # Value formatting: currency for fare components, date for the stub,
    # compact numbers for the two averages, nanoplot for the daily series.
    .fmt_currency(
        columns=['fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount']
    )
    .fmt_date(columns=['week'], date_style='m_day_year')
    .fmt_number(
        columns=['average_passenger_count', 'average_trip_distance'],
        compact=True,
    )
    .fmt_nanoplot(columns='passenger_count_daily', autoscale=True)
    # Human-readable headers; html() is used where a <br> line break is wanted.
    .cols_label(
        average_passenger_count=html('Passenger<br>count'),
        average_trip_distance=html('Trip<br>distance'),
        fare_amount=html('Fare'),
        extra='Extra',
        mta_tax='MTA tax',
        tip_amount='Tip',
        tolls_amount=html('Tolls'),
        total_amount=html('Total'),
        passenger_count_daily=html('Passenger count<br>(daily)'),
    )
)

In [6]:
# Display the formatted weekly table (no colour scales yet) as the cell output.
gt1

NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset
"Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015"
Unnamed: 0_level_2,Averages,Averages,Avg. Costs,Avg. Costs,Avg. Costs,Avg. Costs,Avg. Costs,Avg. Costs,Passenger count (daily)
Unnamed: 0_level_3,Passenger count,Trip distance,Fare,Extra,MTA tax,Tip,Tolls,Total,Passenger count (daily)
"Dec 29, 2014",1.77,19.39,$12.26,$0.26,$0.50,$1.33,$0.26,$14.91,898K232K690K607K722K566K
"Jan 5, 2015",1.68,22.71,$11.77,$0.32,$0.50,$1.51,$0.24,$14.64,898K232K601K632K707K746K747K893K719K
"Jan 12, 2015",1.67,2.76,$11.89,$0.32,$0.50,$2.84,$0.25,$16.09,898K232K648K735K725K740K795K829K743K
"Jan 19, 2015",1.67,18.76,$11.92,$0.30,$0.50,$1.61,$0.25,$14.89,898K232K576K661K704K740K783K799K694K
"Jan 26, 2015",1.67,4.78,$11.86,$0.32,$0.50,$1.59,$0.21,$14.78,898K232K376K232K622K705K801K898K
"Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration"


In [7]:
# Palette endpoints passed to data_color when gt2 is built: named colours with
# their saturation lowered. Color and Saturation come from the colorzero
# wildcard import at the top of the notebook.
low_col = Color('lime') - Saturation(0.4)
high_col = Color('yellow') - Saturation(0.2)

In [8]:
# gt1 plus two colour scales: the lime-to-yellow palette on every numeric bw
# column except the two averages, and white-to-blue on the two averages.
gt2 = (
    gt1
    .data_color(
        columns=[
            name
            for name, dtype in bw.schema.items()
            if dtype.is_numeric()
            and name not in ('average_passenger_count', 'average_trip_distance')
        ],
        palette=[low_col.html, high_col.html],
    )
    .data_color(
        columns=['average_passenger_count', 'average_trip_distance'],
        palette=["white", "blue"],
    )
)

In [9]:
# Display the colour-scaled version of the weekly table as the cell output.
gt2

NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset,NYC Taxi dataset
"Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015","Weekly statistics between Dec 29, 2014 to Jan 26, 2015"
Unnamed: 0_level_2,Averages,Averages,Avg. Costs,Avg. Costs,Avg. Costs,Avg. Costs,Avg. Costs,Avg. Costs,Passenger count (daily)
Unnamed: 0_level_3,Passenger count,Trip distance,Fare,Extra,MTA tax,Tip,Tolls,Total,Passenger count (daily)
"Dec 29, 2014",1.77,19.39,$12.26,$0.26,$0.50,$1.33,$0.26,$14.91,898K232K690K607K722K566K
"Jan 5, 2015",1.68,22.71,$11.77,$0.32,$0.50,$1.51,$0.24,$14.64,898K232K601K632K707K746K747K893K719K
"Jan 12, 2015",1.67,2.76,$11.89,$0.32,$0.50,$2.84,$0.25,$16.09,898K232K648K735K725K740K795K829K743K
"Jan 19, 2015",1.67,18.76,$11.92,$0.30,$0.50,$1.61,$0.25,$14.89,898K232K576K661K704K740K783K799K694K
"Jan 26, 2015",1.67,4.78,$11.86,$0.32,$0.50,$1.59,$0.21,$14.78,898K232K376K232K622K705K801K898K
"Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration","Kaggle, July 2017. https://www.kaggle.com/competitions/nyc-taxi-trip-duration"


In [16]:
# One row per vendor holding the list of that vendor's daily mean total_amount
# values, ready to be rendered as a nanoplot.
bh = (
    df
    # Order rows by the raw pickup timestamp strings before windowing.
    .sort(pl.col('tpep_pickup_datetime'), descending=False)
    # Mean total_amount per 1-day window, computed separately for each VendorID.
    .group_by_dynamic(
        pl.col('tpep_pickup_datetime').str.to_datetime('%Y-%m-%d %H:%M:%S'), 
        every="1d", 
        group_by=['VendorID'])
    .agg(pl.col('total_amount').mean().alias('total_amount'))
    # A bare column inside agg() already yields one list per group, so no
    # implode() + list.get(0) wrap/unwrap round-trip is needed.
    .group_by('VendorID').agg(pl.col('total_amount'))
    # Plain group_by is called without maintain_order, so order the rows explicitly.
    .sort('VendorID', descending=False)
)

In [17]:
# Per-vendor nanoplot of the daily mean totals, with each row's mean drawn as a reference line.
(
    bh.style
    .fmt_nanoplot(columns='total_amount', reference_line='mean')
)

VendorID,total_amount
1,14.615.711.915.214.714.015.315.014.614.214.514.913.914.714.914.815.015.315.113.914.114.915.014.815.115.113.714.814.611.915.015.715.113.9
2,15.431.912.015.414.914.215.715.414.914.514.715.114.014.915.215.115.215.515.314.031.915.215.415.115.415.413.715.014.812.015.415.915.314.0


In [18]:
# Same nanoplot with autoscale=True, so both vendor rows are drawn against common bounds.
(
    bh.style
    .fmt_nanoplot(columns='total_amount', autoscale=True)
)

VendorID,total_amount
1,31.911.915.214.714.015.315.014.614.214.514.913.914.714.914.815.015.315.113.914.114.915.014.815.115.113.714.814.611.915.015.715.113.9
2,31.911.915.414.914.215.715.414.914.514.715.114.014.915.215.115.215.515.314.031.915.215.415.115.415.413.715.014.812.015.415.915.314.0


In [19]:
# Autoscaled nanoplot again, rendered as bars instead of the default plot type.
(
    bh.style
    .fmt_nanoplot(columns='total_amount', autoscale=True, plot_type='bar')
)

VendorID,total_amount
1,31.9015.214.714.015.315.014.614.214.514.913.914.714.914.815.015.315.113.914.114.915.014.815.115.113.714.814.611.915.015.715.113.9
2,31.9015.414.914.215.715.414.914.514.715.114.014.915.215.115.215.515.314.031.915.215.415.115.415.413.715.014.812.015.415.915.314.0
