In [51]:
import pandas as pd
import numpy as np
import math
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
import statistics as sts
from sliceline.slicefinder import Slicefinder
import optbinning
from dt import *

**Reproducibility Notes**

The PyPI `sliceline` package requires Python 3.7–3.10.0, which excludes the most recent Python versions. Creating a virtual environment with Python 3.9 should work.

## Defining Reusable Methods

### Data Manipulation

In [60]:
def parse_file(filename):
    """Load a CSV file and split it into features and target.

    Args:
        filename: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        tuple: ``(x, y, frame)`` where ``x`` and ``y`` are the two values
        returned by ``split_xy(frame)`` and ``frame`` is the full DataFrame
        as read from disk.
    """
    frame = pd.read_csv(filename)
    # split_xy returns a (features, target) pair; append the full frame to it.
    return (*split_xy(frame), frame)
def split_xy(df, target='median_house_value'):
    """Split a DataFrame into its feature columns and its target column.

    Args:
        df: DataFrame that contains the ``target`` column. It is not modified.
        target: Name of the column to use as the target. Defaults to
            ``'median_house_value'``, the column this notebook predicts, so
            existing one-argument calls behave as before.

    Returns:
        tuple: ``(features, target_values)`` where ``features`` is ``df``
        without the target column and ``target_values`` is that column.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    target_values = df[target]
    # drop() returns a new frame; the caller's DataFrame keeps all columns.
    features = df.drop(target, axis=1)
    return features, target_values

### Model Training

In [10]:
def train_model(train_x, train_y):
    """Fit a gradient-boosting regressor on the given training data.

    Args:
        train_x: Training features, passed to ``fit`` unchanged.
        train_y: Training targets, passed to ``fit`` unchanged.

    Returns:
        The fitted ``HistGradientBoostingRegressor``.
    """
    # Fixed random_state so repeated runs build the model from the same seed.
    regressor = HistGradientBoostingRegressor(random_state=42)
    # fit() hands back the estimator itself, so it can be returned directly.
    return regressor.fit(train_x, train_y)

### Error Analysis

In [11]:
def get_errors(model, x, y):
    """Return the element-wise squared prediction error of ``model`` on ``x``.

    Args:
        model: Object exposing ``predict(x)``.
        x: Inputs handed to ``model.predict``.
        y: True target values, aligned with the predictions.

    Returns:
        ``(y - model.predict(x)) ** 2``, one squared error per sample.
    """
    residuals = y - model.predict(x)
    return residuals ** 2

In [13]:
def get_rms(arr):
    """Return the square root of the arithmetic mean of ``arr``.

    Callers in this notebook pass squared errors, which makes the result a
    root-mean-square error.

    Args:
        arr: Iterable of numbers to average.

    Returns:
        float: ``sqrt(mean(arr))``.
    """
    return math.sqrt(sts.mean(arr))

In [14]:
def get_rms_error(model, x, y):
    """Return the root-mean-square prediction error of ``model`` on ``(x, y)``.

    Args:
        model: Object exposing ``predict(x)``.
        x: Inputs to predict on.
        y: True target values.

    Returns:
        float: ``get_rms`` applied to the squared errors from ``get_errors``.
    """
    return get_rms(get_errors(model, x, y))

### Binning

In [20]:
def bin_xs(train_x, train_errors):
    """Bin every feature column of ``train_x`` against the training errors.

    Each column is binned on its own with
    ``optbinning.ContinuousOptimalBinning(max_n_bins=5)``, using
    ``train_errors`` as the target, and transformed with ``metric="bins"``.

    Args:
        train_x: DataFrame of features to bin.
        train_errors: Per-sample errors used as the binning target.

    Returns:
        pd.DataFrame: One binned column per column of ``train_x``, with the
        same column names. It is built from a NumPy array, so it carries a
        fresh default index rather than ``train_x``'s index.
    """
    binner = optbinning.ContinuousOptimalBinning(max_n_bins=5)

    # One transformed array per feature, collected in column order.
    binned_columns = []
    for col in train_x.columns:
        binned_columns.append(
            binner.fit_transform(train_x[col], train_errors, metric="bins")
        )

    # The list is feature-major; transpose so that rows are samples.
    bin_matrix = np.array(binned_columns).T
    return pd.DataFrame(bin_matrix, columns=train_x.columns)

### Sliceliner

In [24]:
def get_slices(train_x_binned, train_errors, alpha=0.9, k=1, max_l=3, min_sup=0, verbose=False):
    """Run sliceline's ``Slicefinder`` and return its top slices as a DataFrame.

    Args:
        train_x_binned: Binned feature DataFrame to fit on.
        train_errors: Per-sample errors to fit on.
        alpha, k, max_l, min_sup, verbose: Forwarded unchanged to the
            ``Slicefinder`` constructor.

    Returns:
        pd.DataFrame: ``top_slices_`` of the fitted finder, with columns named
        by ``feature_names_in_`` and rows indexed by
        ``get_feature_names_out()``.
    """
    finder = Slicefinder(alpha=alpha, k=k, max_l=max_l, min_sup=min_sup, verbose=verbose)
    finder.fit(train_x_binned, train_errors)
    return pd.DataFrame(
        finder.top_slices_,
        index=finder.get_feature_names_out(),
        columns=finder.feature_names_in_,
    )

In [28]:
def reformat_slices(slice_df):
    """Reformat the slice DataFrame from ``get_slices`` into a matrix of strings.

    Missing entries are replaced with the string ``'(-inf, inf)'``.

    Args:
        slice_df: DataFrame of slices as built by ``get_slices`` (one row per
            returned slice, one column per input feature). It is not modified.

    Returns:
        list[list]: One inner list per row of ``slice_df``, in column order.
    """
    # fillna without inplace=True returns a new frame, so the caller's
    # DataFrame is not mutated as a side effect of reformatting.
    return slice_df.fillna('(-inf, inf)').values.tolist()

### Putting the Pipeline Together

In [45]:
def pipeline_train(train_x, train_y, test_x, test_y):
    """Train a model, report its errors, and find the underperforming slices.

    Prints the RMS error on the training data and on the test data.

    Args:
        train_x: Training features.
        train_y: Training targets.
        test_x: Test features, used only for the printed test error.
        test_y: Test targets, used only for the printed test error.

    Returns:
        tuple: ``(model, slices)`` where ``model`` is the fitted regressor and
        ``slices`` is the list-of-lists produced by ``reformat_slices``.
    """
    # Train model
    fitted = train_model(train_x, train_y)

    # Error analysis: squared training errors are kept because binning and
    # slice finding below are both driven by them.
    sq_err_train = get_errors(fitted, train_x, train_y)
    print("Train RMS error:", get_rms(sq_err_train))
    print("Test RMS error:", get_rms_error(fitted, test_x, test_y))

    # Binning
    binned = bin_xs(train_x, sq_err_train)

    # Sliceliner
    found = get_slices(binned, sq_err_train, alpha=0.9, max_l=3, min_sup=0, verbose=False)
    return fitted, reformat_slices(found)

In [36]:
def pipeline_dt(sources, costs, slices, query_counts):
    """Obtain additional data by running ``DT`` over the given sources.

    Builds ``DT(sources, costs, slices, None, batch=100)`` and returns the
    result of its ``run(query_counts)`` call.

    Args:
        sources: Passed to ``DT`` as its first argument (CSV paths in this
            notebook).
        costs: Passed to ``DT`` as its second argument.
        slices: Slice matrix as returned by ``pipeline_train``.
        query_counts: Passed to ``DT.run``.

    Returns:
        Whatever ``DT.run`` returns.
    """
    # NOTE(review): the meaning of the fourth positional argument (None) and
    # of batch=100 is defined by DT, which is not visible here; confirm there.
    acquirer = DT(sources, costs, slices, None, batch=100)
    return acquirer.run(query_counts)

In [68]:
def pipeline_augment(train_x, train_y, additional_data, features):
    """Combine an existing dataset with additional data.

    Args:
        train_x: Existing features.
        train_y: Existing targets.
        additional_data: New rows, loaded with ``pd.DataFrame``.
        features: Column labels for ``additional_data``. They must include the
            target column, because the new frame is split with ``split_xy``.

    Returns:
        tuple: ``(aug_x, aug_y)``, the existing data followed by the new rows,
        each re-indexed from 0 (``ignore_index=True``).
    """
    extra_x, extra_y = split_xy(pd.DataFrame(additional_data, columns=features))
    combined_x = pd.concat([train_x, extra_x], ignore_index=True)
    combined_y = pd.concat([train_y, extra_y], ignore_index=True)
    return combined_x, combined_y

## Housing Data Example

In [71]:
# Load the initial training file and the file used for testing. Each call
# returns (features, target, full DataFrame including the target column).
train_x, train_y, train = parse_file('data/housing0.csv')
test_x, test_y, test = parse_file('data/housing1.csv')
# Show the training features as the cell output.
train_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-118.30,33.90,19.0,2421.0,689.0,1726.0,660.0,3.2870
1,-118.30,33.99,44.0,1458.0,326.0,1159.0,283.0,1.1645
2,-122.07,37.06,31.0,1634.0,370.0,939.0,332.0,3.8625
3,-122.02,37.60,31.0,2155.0,522.0,1858.0,437.0,2.6520
4,-120.87,41.54,21.0,1091.0,208.0,660.0,188.0,2.2321
...,...,...,...,...,...,...,...,...
4123,-117.98,33.68,14.0,3396.0,477.0,1542.0,472.0,7.3982
4124,-119.45,35.07,45.0,973.0,183.0,500.0,177.0,2.6389
4125,-118.23,33.89,36.0,2598.0,514.0,1872.0,514.0,3.1667
4126,-121.48,38.52,34.0,2561.0,497.0,1583.0,530.0,3.1583


In [62]:
# Baseline run: fit on the initial training data, print train/test RMS error,
# and display the slice(s) found on the training errors.
model, slices = pipeline_train(train_x, train_y, test_x, test_y)
slices

Train RMS error: 35085.96794454075
Test RMS error: 52174.79239835969


  and array.dtypes.apply(is_sparse).any()
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  (
  (slice_errors / slice_sizes) / self.average_error_ - 1
  ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)


[['(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '[2.60, inf)']]

In [63]:
# CSV files handed to DT as data sources, with one cost entry per source
# (all equal here). How costs are used is defined by DT, not in this notebook.
sources = [
    'data/housing2.csv',
    'data/housing3.csv',
    'data/housing4.csv'
]
costs = [1.0, 1.0, 1.0]

In [64]:
# We will consider only one slice:
# median income in [2.60, inf)
# NOTE(review): presumably one entry per slice in `slices`, giving how many
# rows to acquire for it; confirm against DT.run.
query_counts = [ 50 ]

In [65]:
# First acquisition round: run DT with the slice(s) from the baseline run.
additional_data = pipeline_dt(sources, costs, slices, query_counts)

In [69]:
# Append the acquired rows to the original training data. `train.columns`
# (which includes the target column) labels the columns of the new rows.
aug_x, aug_y = pipeline_augment(train_x, train_y, additional_data, train.columns)

In [73]:
# Retrain on the augmented data and find slices again. This rebinds `model`
# and `slices`, replacing the values from the baseline run.
model, slices = pipeline_train(aug_x, aug_y, test_x, test_y)
slices

Train RMS error: 35359.47987203146
Test RMS error: 52390.95011486408


  and array.dtypes.apply(is_sparse).any()
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  (
  (slice_errors / slice_sizes) / self.average_error_ - 1
  ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)


[['[-122.37, -118.28)',
  '[33.89, 37.95)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)']]

In [77]:
# Second acquisition round: run DT again with the current `slices`
# (the ones found after the first augmentation when cells run in order).
additional_data = pipeline_dt(sources, costs, slices, query_counts)

In [78]:
# Append the second batch to the already-augmented data.
# NOTE: this cell reads and rebinds aug_x/aug_y, so re-running it on its own
# appends the same batch again.
aug_x, aug_y = pipeline_augment(aug_x, aug_y, additional_data, train.columns)
aug_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-118.30,33.90,19.0,2421.0,689.0,1726.0,660.0,3.2870
1,-118.30,33.99,44.0,1458.0,326.0,1159.0,283.0,1.1645
2,-122.07,37.06,31.0,1634.0,370.0,939.0,332.0,3.8625
3,-122.02,37.60,31.0,2155.0,522.0,1858.0,437.0,2.6520
4,-120.87,41.54,21.0,1091.0,208.0,660.0,188.0,2.2321
...,...,...,...,...,...,...,...,...
4423,-117.21,34.11,27.0,1245.0,229.0,692.0,234.0,3.2176
4424,-118.02,33.85,31.0,1922.0,329.0,1030.0,353.0,5.3416
4425,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801
4426,-122.16,37.70,36.0,1097.0,208.0,568.0,225.0,2.9917


In [79]:
# Retrain on the twice-augmented data; prints the RMS errors and displays the
# newly found slice(s). Rebinds `model` and `slices` once more.
model, slices = pipeline_train(aug_x, aug_y, test_x, test_y)
slices

Train RMS error: 35410.66330555648
Test RMS error: 52201.81100194593


  and array.dtypes.apply(is_sparse).any()
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  (
  (slice_errors / slice_sizes) / self.average_error_ - 1
  ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)


[['(-inf, inf)',
  '(-inf, inf)',
  '(-inf, 26.50)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)',
  '(-inf, inf)']]