# Performance

## Imports

In [None]:
import pdpexplorer
from pdpexplorer.pdp import get_single_pdps, get_double_pdps
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import altair as alt

from itertools import combinations
import timeit
import cProfile
import pstats
from pstats import SortKey

## Preparing Data and Model

In [None]:
# Load the data set from CSV and preview its first rows.
df_original = pd.read_csv('bike-sharing.csv')
df_original.head()

In [None]:
# Columns treated as categorical; each is replaced by one-hot columns.
categories = ['season', 'weathersit']
df_one_hot = pd.get_dummies(df_original, columns=categories)

In [None]:
# Map each original categorical feature to its one-hot columns, as a list of
# (one_hot_column, category_value) pairs.
#
# pd.get_dummies names the generated columns '<feature>_<value>' (default
# prefix_sep='_'), so match on the full '<feature>_' prefix: a bare
# startswith(feature) would also pick up unrelated columns that merely share
# the leading characters, and split('_')[-1] would truncate category values
# that themselves contain an underscore. The value is everything after the
# prefix and is kept as a string.
feature_to_one_hot = {
    original_feat: [
        (feat, feat[len(original_feat) + 1:])
        for feat in df_one_hot.columns
        if feat.startswith(original_feat + '_')
    ]
    for original_feat in categories
}
feature_to_one_hot

In [None]:
# Feature frame: the one-hot encoded data without the 'label' and
# 'prediction' columns ('label' is used as the target below), plus the same
# data as a NumPy array for scikit-learn.
df_X = df_one_hot.drop(columns=['label', 'prediction'])
X = df_X.to_numpy()
X

In [None]:
# Target vector: the 'label' column of the original data.
y = df_original['label'].to_numpy()
y

In [None]:
# Fit a 20-tree random forest on the full data set.
# NOTE(review): no random_state is passed, so the fitted forest (and hence the
# timings measured below) can differ from run to run.
regr = RandomForestRegressor(n_estimators=20)
regr.fit(X, y)
# Display the model's predictions on the training data.
regr.predict(X)

In [None]:
# Reverse lookup: (original feature, category value) -> one-hot column name.
value_to_one_hot = {
    (original, level): column
    for original, encoded in feature_to_one_hot.items()
    for column, level in encoded
}
value_to_one_hot

In [None]:
# Set of every one-hot column name, across all categorical features.
one_hot_features = {
    column
    for encoded in feature_to_one_hot.values()
    for column, _level in encoded
}
one_hot_features

In [None]:
# Columns of df_X that are not one-hot columns, in their original order.
normal_features = [
    column for column in df_X.columns if column not in one_hot_features
]
normal_features

In [None]:
# Sorted distinct values for every feature.
# Regular columns: the distinct values observed in df_X.
unique_feature_vals = {
    col: sorted(df_X[col].unique())
    for col in normal_features
}

# One-hot encoded features: the category values recorded in
# feature_to_one_hot (these are strings, so they sort lexicographically).
for feature, one_hot_info in feature_to_one_hot.items():
    unique_feature_vals[feature] = sorted(value for _, value in one_hot_info)

In [None]:
# Numeric, non-one-hot columns with more than 12 distinct values.
# The one-hot check must come first: one-hot columns have no entry in
# unique_feature_vals, so the short-circuit avoids a KeyError.
quantitative_features = {
    col
    for col in df_X.select_dtypes(include='number').columns
    if col not in one_hot_features and len(unique_feature_vals[col]) > 12
}

In [None]:
# All feature names, sorted: the regular columns plus the original (pre
# one-hot) names of the categorical features.
features = sorted([*normal_features, *feature_to_one_hot])

In [None]:
# Every unordered pair of features, used as input for the two-feature PDPs.
pairs = list(combinations(features, 2))

## Profiling

### Single feature

In [None]:
# Source of the call to profile. cProfile.run takes the statement as a string.
# NOTE(review): the literal 20 is presumably the PDP resolution (the timing
# helpers below pass their `res` value in the same position) - confirm against
# get_single_pdps.
statement = '''\
get_single_pdps(
    regr,
    df_X,
    features,
    20,
    feature_to_one_hot,
    value_to_one_hot,
    quantitative_features,
    unique_feature_vals
)
'''

# Profile the statement, saving the raw stats to the file 'restats1d', then
# load that file for reporting.
cProfile.run(statement, 'restats1d')
p = pstats.Stats('restats1d')

In [None]:
# Report sorted by internal time (time spent in each function itself).
p.strip_dirs().sort_stats(SortKey.TIME).print_stats()

In [None]:
# Report sorted by cumulative time (each function including its callees).
p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats()

### Double feature

In [None]:
# Source of the two-feature call to profile, run over all feature pairs.
# cProfile.run takes the statement as a string.
statement = '''\
get_double_pdps(
    regr,
    df_X,
    pairs,
    20,
    feature_to_one_hot,
    value_to_one_hot,
    quantitative_features,
    unique_feature_vals
)
'''

# Profile the statement, saving the raw stats to the file 'restats2d', then
# load that file for reporting (this rebinds `p` from the single-feature run).
cProfile.run(statement, 'restats2d')
p = pstats.Stats('restats2d')

In [None]:
# Report sorted by internal time (time spent in each function itself).
p.strip_dirs().sort_stats(SortKey.TIME).print_stats()

In [None]:
# Report sorted by cumulative time (each function including its callees).
p.strip_dirs().sort_stats(SortKey.CUMULATIVE).print_stats()

## Timing

In [None]:
%%time

# Single-feature PDPs for every feature; the enclosing %%time cell magic
# reports how long this one call takes.
single_pdps = get_single_pdps(
    regr,
    df_X,
    features,
    20,
    feature_to_one_hot,
    value_to_one_hot,
    quantitative_features,
    unique_feature_vals
)

In [None]:
%%time

# Two-feature PDPs for every feature pair; the enclosing %%time cell magic
# reports how long this one call takes.
double_pdps = get_double_pdps(
    regr,
    df_X,
    pairs,
    20,
    feature_to_one_hot,
    value_to_one_hot,
    quantitative_features,
    unique_feature_vals
)

In [None]:
def time_resolution_1d():
    """Time get_single_pdps on 'hum' across a range of resolutions.

    Resolutions run from 5 up to (but not including) the number of distinct
    'hum' values, in steps of 5.

    Returns:
        DataFrame with columns 'resolution' and 'seconds', where 'seconds'
        is the total reported by timeit.timeit for five runs at that
        resolution (not the mean per run).
    """
    def elapsed(resolution):
        # Total time of five PDP computations at the given resolution.
        def compute():
            get_single_pdps(
                regr,
                df_X,
                ['hum'],
                resolution,
                feature_to_one_hot,
                value_to_one_hot,
                quantitative_features,
                unique_feature_vals
            )
        return timeit.timeit(compute, number=5)

    upper = len(unique_feature_vals['hum'])
    resolutions = list(range(5, upper, 5))
    seconds = [elapsed(resolution) for resolution in resolutions]

    return pd.DataFrame({
        'resolution': resolutions,
        'seconds': seconds
    })

In [None]:
def time_resolution_2d():
    """Time get_double_pdps on the ('hum', 'atemp') pair across resolutions.

    Resolutions run from 5 up to (but not including) the smaller of the two
    features' distinct-value counts, in steps of 5.

    Returns:
        DataFrame with columns 'resolution' and 'seconds', where 'seconds'
        is the total reported by timeit.timeit for two runs at that
        resolution (not the mean per run).
    """
    def elapsed(resolution):
        # Total time of two PDP computations at the given resolution.
        def compute():
            get_double_pdps(
                regr,
                df_X,
                [('hum', 'atemp')],
                resolution,
                feature_to_one_hot,
                value_to_one_hot,
                quantitative_features,
                unique_feature_vals
            )
        return timeit.timeit(compute, number=2)

    upper = min(len(unique_feature_vals[name]) for name in ('hum', 'atemp'))
    resolutions = list(range(5, upper, 5))
    seconds = [elapsed(resolution) for resolution in resolutions]

    return pd.DataFrame({
        'resolution': resolutions,
        'seconds': seconds
    })

In [None]:
def plot_resolution(df):
    """Return an Altair line chart of 'seconds' against 'resolution'.

    Args:
        df: DataFrame with 'resolution' and 'seconds' columns.
    """
    chart = alt.Chart(df)
    line = chart.mark_line()
    return line.encode(x='resolution', y='seconds')

In [None]:
# Run the single-feature resolution benchmark.
df_res_1d = time_resolution_1d()

In [None]:
# Plot time against resolution for the single-feature benchmark.
plot_resolution(df_res_1d)

In [None]:
# Run the two-feature resolution benchmark.
df_res_2d = time_resolution_2d()

In [None]:
# Plot time against resolution for the two-feature benchmark.
plot_resolution(df_res_2d)

In [None]:
def time_instances_1d():
    """Time get_single_pdps on 'hum' for growing numbers of instances.

    Subset sizes run from 50 up to (but not including) the number of rows in
    df_X, in steps of 50. For each size, one random subset of rows is drawn
    and the PDP computation on that subset is run five times.

    Returns:
        DataFrame with columns 'instances' (subset size) and 'seconds'
        (the total reported by timeit.timeit for the five runs, not the
        mean per run).
    """
    instances = list(range(50, df_X.shape[0], 50))
    times = []
    for i in instances:
        # Draw the subset before timing starts, so the cost of
        # DataFrame.sample is not counted as PDP time and all five
        # repetitions measure the same input.
        df_sample = df_X.sample(i)

        # Bind the sample as a default argument so the callable does not
        # depend on the loop variable's later values.
        def func(df_sample=df_sample):
            get_single_pdps(
                regr,
                df_sample,
                ['hum'],
                20,
                feature_to_one_hot,
                value_to_one_hot,
                quantitative_features,
                unique_feature_vals
            )
        t = timeit.timeit(func, number=5)
        times.append(t)

    df = pd.DataFrame({
        'instances': instances,
        'seconds': times
    })

    return df

In [None]:
def plot_instances(df):
    """Return an Altair line chart of 'seconds' against 'instances'.

    Args:
        df: DataFrame with 'instances' and 'seconds' columns.
    """
    chart = alt.Chart(df)
    line = chart.mark_line()
    return line.encode(x='instances', y='seconds')

In [None]:
# Run the single-feature benchmark over the number of instances.
df_instances_1d = time_instances_1d()

In [None]:
# Plot time against number of instances.
plot_instances(df_instances_1d)