In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from itertools import combinations

from pdpilot import partial_dependence, PDPilotWidget

In [2]:
# Location of the hourly bike-sharing CSV, hosted as a raw GitHub gist file.
# NOTE(review): the long hex segment after /raw/ looks like a pinned revision,
# which would keep the download stable between runs — confirm.
dataset_url = (
    'https://gist.githubusercontent.com/DanielKerrigan/'
    'f324b392dc9a58d8bd8f8d79e1101a12/raw/'
    'c3b4760c9facfac26bcab2cd7465c4cab88ef304/bike-hour.csv'
)

In [3]:
# Read the CSV straight from the URL and discard the 'yr' column before any
# further processing.
# NOTE(review): 'yr' is presumably redundant with 'days_since_2011' — confirm.
df_original = (
    pd.read_csv(dataset_url)
    .drop(['yr'], axis='columns')
)

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
df_original.head(n=5)

Unnamed: 0,days_since_2011,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,0,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,0,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,0,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,0,1,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,0,1,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


In [5]:
# One-hot encode the categorical 'weathersit' column; every other column is
# passed through unchanged. The resulting dummy columns are the
# 'weathersit_<value>' names referenced later in `one_hot_features`.
df_one_hot = pd.get_dummies(data=df_original, columns=['weathersit'])

In [6]:
# Model inputs: the one-hot encoded frame without the target column 'cnt'.
df_X = df_one_hot.drop(['cnt'], axis='columns')

In [7]:
# Regression target: the 'cnt' column as a plain NumPy array.
y = df_original.loc[:, 'cnt'].to_numpy()

In [8]:
# Fit a small random forest on the one-hot encoded features.
# `random_state` is fixed so that Restart & Run All yields the same trees, and
# therefore the same partial dependence curves, on every run.
regr = RandomForestRegressor(n_estimators=20, random_state=0)
# `fit` returns the estimator, so leaving it as the last expression also
# displays the fitted model's repr as the cell output.
regr.fit(df_X, y)

In [9]:
# Features to analyse: every column of the *pre-encoding* frame except the
# target. Using df_original (not df_X) keeps 'weathersit' as one logical
# feature; its dummy columns are described separately in `one_hot_features`.
features = list(filter(lambda name: name != 'cnt', df_original.columns))

In [10]:
# For each one-hot encoded feature, list its (dummy column name, display label)
# pairs. The column names follow the 'weathersit_<code>' pattern with codes
# starting at 1, in the order the labels are given here.
one_hot_features = {
    'weathersit': [
        (f'weathersit_{code}', label)
        for code, label in enumerate(('clear', 'mist', 'rain', 'storm'), start=1)
    ]
}

In [11]:
# Human-readable labels for integer-coded features, keyed by feature name and
# then by the encoded value.
feature_value_mappings = {
    # The first data row has mnth == 1 together with season == 1, which is
    # consistent with code 1 meaning winter.
    'season': {
        1: 'winter',
        2: 'spring',
        3: 'summer',
        4: 'fall'
    },
    # Two-letter abbreviations keep every label unique; single letters made
    # Sunday (0) and Saturday (6) indistinguishable.
    # NOTE(review): assumes 0 = Sunday ... 6 = Saturday — confirm against the
    # data set's documentation.
    'weekday': {
        0: 'Su',
        1: 'Mo',
        2: 'Tu',
        3: 'We',
        4: 'Th',
        5: 'Fr',
        6: 'Sa'
    }
}

In [12]:
# Draw up to 1000 rows for the partial dependence computation.
# - `random_state` makes the sample (and everything derived from it)
#   reproducible across kernel restarts.
# - `min(...)` avoids the ValueError that `sample` raises when asked for more
#   rows than the frame contains.
subset = df_X.sample(n=min(1000, len(df_X)), random_state=0)

In [13]:
# Compute the partial dependence data that the widget will visualise. The
# progress output of this cell reports both one-way and two-way PDPs.
pd_data = partial_dependence(
    # Model and the rows it is evaluated on.
    predict=regr.predict,
    df=subset,
    # Logical features, plus how encoded columns and integer codes map back to
    # readable names.
    features=features,
    one_hot_features=one_hot_features,
    feature_value_mappings=feature_value_mappings,
    # Tuning knobs.
    # NOTE(review): `resolution` presumably sets how many grid points are used
    # per quantitative feature and `n_jobs` the number of parallel workers —
    # confirm against the PDPilot documentation.
    resolution=20,
    n_jobs=4,
)

Calculating 12 one-way PDPs


100%|██████████████████████████████████████████| 12/12 [00:04<00:00,  2.74PDP/s]


Calculating 23 two-way PDPs


100%|██████████████████████████████████████████| 23/23 [00:07<00:00,  3.03PDP/s]


In [14]:
# Build the interactive widget from the precomputed PDP data. It receives the
# same prediction function and sampled rows that produced `pd_data`.
w = PDPilotWidget(
    pd_data=pd_data,
    predict=regr.predict,
    df=subset,
    height=650,
)

# A bare trailing expression makes Jupyter render the widget as cell output.
w

PDPilotWidget(dataset={'days_since_2011': [33, 175, 424, 687, 498, 475, 105, 57, 337, 395, 502, 633, 565, 497,…