In [35]:
import pandas as pd
import numpy as np
import math
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
import statistics as sts
from sliceline.slicefinder import Slicefinder
import optbinning

**Reproducibility Notes**

The PyPi `sliceline` package requires Python 3.7~3.10.0, which is not the most up-to-date python version. Creating a virtual environment with python 3.9 should work. 

In [36]:
def split_xy(df):
    train_y = df['median_house_value']
    train_x = df.drop('median_house_value', axis=1)
    return train_x, train_y

In [37]:
df = pd.read_csv('data/housing0.csv')
train_x, train_y = split_xy(df)

In [38]:
train_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-118.30,33.90,19.0,2421.0,689.0,1726.0,660.0,3.2870
1,-118.30,33.99,44.0,1458.0,326.0,1159.0,283.0,1.1645
2,-122.07,37.06,31.0,1634.0,370.0,939.0,332.0,3.8625
3,-122.02,37.60,31.0,2155.0,522.0,1858.0,437.0,2.6520
4,-120.87,41.54,21.0,1091.0,208.0,660.0,188.0,2.2321
...,...,...,...,...,...,...,...,...
4123,-117.98,33.68,14.0,3396.0,477.0,1542.0,472.0,7.3982
4124,-119.45,35.07,45.0,973.0,183.0,500.0,177.0,2.6389
4125,-118.23,33.89,36.0,2598.0,514.0,1872.0,514.0,3.1667
4126,-121.48,38.52,34.0,2561.0,497.0,1583.0,530.0,3.1583


In [39]:
train_y

0       181400.0
1        98200.0
2       232300.0
3       159800.0
4        34600.0
          ...   
4123    369100.0
4124     30000.0
4125    117700.0
4126     95800.0
4127    146200.0
Name: median_house_value, Length: 4128, dtype: float64

In [40]:
test_x, test_y = split_xy(pd.read_csv('data/housing1.csv'))

In [41]:
def train_model(train_x, train_y):
    model = HistGradientBoostingRegressor(random_state=42)
    model.fit(train_x, train_y)
    return model

In [42]:
model = train_model(train_x, train_y)

In [43]:
def get_errors(model, x, y):
    preds = model.predict(x)
    training_errors = (y - preds)**2
    return training_errors

In [44]:
test_errors = get_errors(model, test_x, test_y)
test_errors

0       1.029576e+09
1       7.331564e+07
2       7.957983e+07
3       1.731452e+09
4       1.588708e+09
            ...     
4123    2.563728e+09
4124    3.216976e+06
4125    5.189770e+03
4126    2.787749e+09
4127    2.836143e+09
Name: median_house_value, Length: 4128, dtype: float64

In [45]:
train_errors = get_errors(model, train_x, train_y)

In [46]:
def get_rms_error(errors):
    means = sts.mean(errors)
    rms_error = math.sqrt(means)
    return rms_error

In [47]:
get_rms_error(test_errors)

52174.79239835969

In [48]:
optimal_binner = optbinning.ContinuousOptimalBinning(max_n_bins=5)

train_x_binned = pd.DataFrame(np.array(
    [
        optimal_binner.fit_transform(train_x[col], train_errors, metric="bins") for col in train_x.columns
    ]
).T, columns=train_x.columns)

In [49]:
train_x_binned

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,"[-122.37, -118.28)","[33.89, 37.95)","(-inf, 26.50)","[2350.50, inf)","[576.50, 714.50)","[1267.50, inf)","[628.50, inf)","[2.60, inf)"
1,"[-122.37, -118.28)","[33.89, 37.95)","[42.50, 51.50)","[611.50, 2003.00)","[135.50, 514.50)","[1081.50, 1180.50)","[122.50, 445.50)","(-inf, 1.83)"
2,"[-122.37, -118.28)","[33.89, 37.95)","[26.50, 39.50)","[611.50, 2003.00)","[135.50, 514.50)","[351.50, 1081.50)","[122.50, 445.50)","[2.60, inf)"
3,"[-122.37, -118.28)","[33.89, 37.95)","[26.50, 39.50)","[2143.50, 2350.50)","[514.50, 576.50)","[1267.50, inf)","[122.50, 445.50)","[2.60, inf)"
4,"[-122.37, -118.28)","[38.98, inf)","(-inf, 26.50)","[611.50, 2003.00)","[135.50, 514.50)","[351.50, 1081.50)","[122.50, 445.50)","[2.11, 2.35)"
...,...,...,...,...,...,...,...,...
4123,"[-118.08, inf)","(-inf, 33.73)","(-inf, 26.50)","[2350.50, inf)","[135.50, 514.50)","[1267.50, inf)","[445.50, 569.50)","[2.60, inf)"
4124,"[-122.37, -118.28)","[33.89, 37.95)","[42.50, 51.50)","[611.50, 2003.00)","[135.50, 514.50)","[351.50, 1081.50)","[122.50, 445.50)","[2.60, inf)"
4125,"[-118.28, -118.19)","[33.89, 37.95)","[26.50, 39.50)","[2350.50, inf)","[135.50, 514.50)","[1267.50, inf)","[445.50, 569.50)","[2.60, inf)"
4126,"[-122.37, -118.28)","[37.95, 38.98)","[26.50, 39.50)","[2350.50, inf)","[135.50, 514.50)","[1267.50, inf)","[445.50, 569.50)","[2.60, inf)"


In [50]:
# fitting sliceline
sf = Slicefinder(
    alpha = 0.9,
    k = 1,
    max_l = 3,
    min_sup = 0,
    verbose = True
)
sf.fit(train_x_binned, train_errors)

  and array.dtypes.apply(is_sparse).any()
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
DEBUG:sliceline.slicefinder:Dropping 0/41 features below min_sup = 0.
DEBUG:sliceline.slicefinder:Initial top-K: count=1, max=0.021801, min=0.021801
  (
DEBUG:sliceline.slicefinder:Level 2:
DEBUG:sliceline.slicefinder: -- generated paired slice candidates: 41 -> 700
  (slice_errors / slice_sizes) / self.average_error_ - 1
  ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)
DEBUG:sliceline.slicefinder: -- valid slices after eval: 665/700
DEBUG:sliceline.slicefinder: -- top-K: count=1, max=0.021801, min=0.021801
DEBUG:sliceline.slicefinder:Level 3:
DEBUG:sliceline.slicefinder: -- generated paired slice candidates: 665 -> 4359
DEBUG:sliceline.slicefinder: -- valid slices after eval: 4022/4359
DEBUG:sliceline.slicefinder: -- top-K: count=1, max=0.021801, min=0.021801
DEBUG:sliceline.slicefinder:Terminated at level 3.


In [51]:
sf.top_slices_

array([[None, None, None, None, None, None, None, '[2.60, inf)']],
      dtype=object)

In [52]:
pd.DataFrame(sf.top_slices_, columns=sf.feature_names_in_, index=sf.get_feature_names_out())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
slice_0,,,,,,,,"[2.60, inf)"


In [63]:
sources = [
    'data/housing2.csv',
    'data/housing3.csv',
    'data/housing4.csv'
]
costs = [1.0, 1.0, 1.0]
# We will consider only one slice
# median income in [2.60, inf)
slices = [
    [
        '(-inf, inf)',
        '(-inf, inf)',
        '(-inf, inf)',
        '(-inf, inf)',
        '(-inf, inf)',
        '(-inf, inf)',
        '(-inf, inf)',
        '(-inf, inf)',
        '(2.60, inf)',
    ]
]
query_counts = [ 50 ]

In [64]:
from dt import *
dt = DT(sources, costs, slices, None, batch=100)
print(dt)

n: 3
sources: ['data/housing2.csv', 'data/housing3.csv', 'data/housing4.csv']
costs: [1. 1. 1.]
slices: [[(-inf, inf), (-inf, inf), (-inf, inf), (-inf, inf), (-inf, inf), (-inf, inf), (-inf, inf), (-inf, inf), (2.6, inf)]]
stats: [[0.]
 [0.]
 [0.]]
batch: 100



In [65]:
additional_data = dt.run(query_counts)

In [66]:
add_df = pd.DataFrame(additional_data, columns=df.columns)
add_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.21,37.78,46.0,2239.0,508.0,1390.0,569.0,2.7352,137300.0
1,-118.16,34.09,52.0,1722.0,448.0,1122.0,425.0,3.1204,224000.0
2,-120.41,36.77,24.0,1335.0,312.0,1180.0,267.0,1.9470,68900.0
3,-124.09,40.44,38.0,2220.0,426.0,1041.0,401.0,2.3947,70500.0
4,-120.95,37.61,17.0,4054.0,654.0,2034.0,667.0,4.6833,142200.0
...,...,...,...,...,...,...,...,...,...
95,-122.23,38.17,45.0,350.0,,225.0,72.0,1.8942,216700.0
96,-118.41,34.25,36.0,1146.0,259.0,1173.0,272.0,3.6016,153800.0
97,-118.45,34.30,35.0,4085.0,919.0,3988.0,906.0,3.4812,160200.0
98,-117.01,32.81,26.0,4499.0,645.0,1866.0,626.0,5.5160,185100.0


In [67]:
aug_df = pd.concat([df, add_df], ignore_index=True)
aug_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-118.30,33.90,19.0,2421.0,689.0,1726.0,660.0,3.2870,181400.0
1,-118.30,33.99,44.0,1458.0,326.0,1159.0,283.0,1.1645,98200.0
2,-122.07,37.06,31.0,1634.0,370.0,939.0,332.0,3.8625,232300.0
3,-122.02,37.60,31.0,2155.0,522.0,1858.0,437.0,2.6520,159800.0
4,-120.87,41.54,21.0,1091.0,208.0,660.0,188.0,2.2321,34600.0
...,...,...,...,...,...,...,...,...,...
4223,-122.23,38.17,45.0,350.0,,225.0,72.0,1.8942,216700.0
4224,-118.41,34.25,36.0,1146.0,259.0,1173.0,272.0,3.6016,153800.0
4225,-118.45,34.30,35.0,4085.0,919.0,3988.0,906.0,3.4812,160200.0
4226,-117.01,32.81,26.0,4499.0,645.0,1866.0,626.0,5.5160,185100.0


In [68]:
aug_x, aug_y = split_xy(aug_df)

In [69]:
aug_model = train_model(aug_x, aug_y)

In [70]:
aug_test_errors = get_errors(aug_model, test_x, test_y)
aug_test_errors

0       9.803068e+08
1       3.489624e+05
2       7.428097e+03
3       2.189351e+09
4       1.720292e+09
            ...     
4123    4.052988e+09
4124    6.047012e+07
4125    6.926057e+07
4126    2.727649e+09
4127    1.584252e+09
Name: median_house_value, Length: 4128, dtype: float64

In [71]:
get_rms_error(aug_test_errors)

52390.95011486408

In [72]:
train_errors = get_errors(model, aug_x, aug_y)