In [27]:
import pandas as pd
import numpy as np
import math
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
import statistics as sts
from sliceline.slicefinder import Slicefinder
import optbinning
from dt import *

**Reproducibility Notes**

The PyPi `sliceline` package requires Python 3.7~3.10.0, which is not the most up-to-date python version. Creating a virtual environment with python 3.9 should work. 

## Defining Reusable Methods

### Data Manipulation

In [28]:
def parse_file(filename):
    df = pd.read_csv(filename)
    train_x, train_y = dt.split_xy(df)
    return train_x, train_y, df

### Model Training

In [29]:
def train_model(train_x, train_y):
    model = HistGradientBoostingRegressor(random_state=42)
    model.fit(train_x, train_y)
    return model

### Error Analysis

In [30]:
def get_errors(model, x, y):
    preds = model.predict(x)
    training_errors = (y - preds)**2
    return training_errors

In [31]:
def get_rms(arr):
    means = sts.mean(arr)
    rms = math.sqrt(means)
    return rms

In [32]:
def get_rms_error(model, x, y):
    errors = get_errors(model, x, y)
    return get_rms(errors)

### Binning

In [33]:
def bin_xs(train_x, train_errors):
    optimal_binner = optbinning.ContinuousOptimalBinning(max_n_bins=5)
    train_x_binned = pd.DataFrame(np.array(
        [
            optimal_binner.fit_transform(train_x[col], train_errors, metric="bins") for col in train_x.columns
        ]
    ).T, columns=train_x.columns)
    return train_x_binned

### Sliceliner

In [34]:
def get_slices(train_x_binned, train_errors, alpha = 0.9, k=1, max_l = 3, min_sup = 0, verbose = False):
    sf = Slicefinder(alpha = alpha, k = k, max_l = max_l, min_sup = min_sup, verbose = verbose)
    sf.fit(train_x_binned, train_errors)
    df = pd.DataFrame(sf.top_slices_, columns=sf.feature_names_in_, index=sf.get_feature_names_out())
    return df

In [35]:
# Reformat slices returned from sliceliner as dataframe into a matrix of strings
def reformat_slices(slice_df):
    slice_df.fillna('(-inf, inf)', inplace=True)
    slice_list = slice_df.values.tolist()
    slice_parsed = dt.parse_slices(slice_list)
    return slice_parsed

In [36]:
# Get the number of times each slice already exists in xs
def get_slice_cnts(xs, slices):
    cnts = []
    for slice_ in slices:
        cnt = 0
        for x in xs.values.tolist():
            if dt.belongs_to_slice(slice_, x):
                cnt += 1 
        cnts.append(cnt)
    return cnts

### Putting the Pipeline Together

In [37]:
# Train a model, report errors, and return model & binned train set
def pipeline_train(train_x, train_y, test_x, test_y):
    # Train model
    model = train_model(train_x, train_y)
    # Error analysis
    train_errors = get_errors(model, train_x, train_y)
    print("Train RMS error:", get_rms(train_errors))
    print("Test RMS error:", get_rms(get_errors(model, test_x, test_y)))
    # Binning
    train_x_binned = bin_xs(train_x, train_errors)
    return model, train_x_binned, train_errors

In [38]:
def pipeline_sliceline(train_x, train_x_binned, train_errors, alpha = 0.9, max_l = 3, min_sup = 0, k = 1):
    # Sliceliner
    slices_df = get_slices(train_x_binned, train_errors, alpha = 0.9, max_l = 3, min_sup = 0, verbose = False, k=k)
    slices = reformat_slices(slices_df)
    existing_cnts = get_slice_cnts(train_x, slices)
    print("Slices:")
    print(slices_df)
    print("Existing counts:", existing_cnts)
    return slices

In [39]:
# Obtain additional data
def pipeline_dt(sources, costs, slices, query_counts):
    dt = DT(sources, costs, slices, None, batch=100)
    additional_data = dt.run(query_counts)
    return additional_data

In [40]:
# Combine existing dataset with additional data
# Additional data is shuffled in
def pipeline_augment(train_x, train_y, additional_data, features):
    add_df = pd.DataFrame(additional_data, columns=features)
    add_x, add_y = split_xy(add_df)
    aug_x = pd.concat([train_x, add_x], ignore_index=True)
    aug_x = aug_x.sample(frac=1, random_state=12345)
    aug_y = pd.concat([train_y, add_y], ignore_index=True)
    aug_y = aug_y.sample(frac=1, random_state=12345)
    return aug_x, aug_y

## Housing Data Example

In [41]:
train_x, train_y, train = parse_file('data/housing0.csv')
test_x, test_y, test = parse_file('data/housing1.csv')
train_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-118.30,33.90,19.0,2421.0,689.0,1726.0,660.0,3.2870
1,-118.30,33.99,44.0,1458.0,326.0,1159.0,283.0,1.1645
2,-122.07,37.06,31.0,1634.0,370.0,939.0,332.0,3.8625
3,-122.02,37.60,31.0,2155.0,522.0,1858.0,437.0,2.6520
4,-120.87,41.54,21.0,1091.0,208.0,660.0,188.0,2.2321
...,...,...,...,...,...,...,...,...
4123,-117.98,33.68,14.0,3396.0,477.0,1542.0,472.0,7.3982
4124,-119.45,35.07,45.0,973.0,183.0,500.0,177.0,2.6389
4125,-118.23,33.89,36.0,2598.0,514.0,1872.0,514.0,3.1667
4126,-121.48,38.52,34.0,2561.0,497.0,1583.0,530.0,3.1583


In [42]:
model, train_x_binned, train_errors = pipeline_train(train_x, train_y, test_x, test_y)

Train RMS error: 35085.96794454075
Test RMS error: 52174.79239835969


In [43]:
slices = pipeline_sliceline(train_x, train_x_binned, train_errors, alpha = 0.5, max_l = 3, min_sup = 0, k = 3)

  and array.dtypes.apply(is_sparse).any()
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  (
  (slice_errors / slice_sizes) / self.average_error_ - 1
  ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)


Slices:
           longitude     latitude housing_median_age  total_rooms  \
slice_0  (-inf, inf)  (-inf, inf)        (-inf, inf)  (-inf, inf)   
slice_1  (-inf, inf)  (-inf, inf)      (-inf, 26.50)  (-inf, inf)   

        total_bedrooms   population   households median_income  
slice_0    (-inf, inf)  (-inf, inf)  (-inf, inf)   [2.60, inf)  
slice_1    (-inf, inf)  (-inf, inf)  (-inf, inf)   (-inf, inf)  
Existing counts: [3063, 1847]


In [44]:
sources = [
    'data/housing2.csv',
    'data/housing3.csv',
    'data/housing4.csv'
]
costs = [1.0, 1.0, 1.0]

In [45]:
# We will consider only one slice
# median income in [2.60, inf)
query_counts = [ 300, 200 ]

In [46]:
additional_data = pipeline_dt(sources, costs, slices, query_counts)
additional_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.21,37.78,46.0,2239.0,508.0,1390.0,569.0,2.7352,137300.0
1,-118.16,34.09,52.0,1722.0,448.0,1122.0,425.0,3.1204,224000.0
2,-120.41,36.77,24.0,1335.0,312.0,1180.0,267.0,1.9470,68900.0
3,-124.09,40.44,38.0,2220.0,426.0,1041.0,401.0,2.3947,70500.0
4,-120.95,37.61,17.0,4054.0,654.0,2034.0,667.0,4.6833,142200.0
...,...,...,...,...,...,...,...,...,...
448,-117.05,32.96,18.0,3593.0,661.0,1992.0,626.0,4.8295,165800.0
449,-117.97,33.66,14.0,6090.0,1338.0,1974.0,1248.0,2.8061,180300.0
450,-118.47,34.00,42.0,1271.0,301.0,574.0,312.0,3.1304,340500.0
451,-121.44,38.54,39.0,2855.0,,1217.0,562.0,3.2404,93600.0


In [47]:
aug_x, aug_y = pipeline_augment(train_x, train_y, additional_data, train.columns)
aug_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
2475,-118.20,33.96,43.0,1233.0,306.0,1190.0,282.0,2.8371
278,-118.34,34.04,42.0,1681.0,360.0,987.0,337.0,2.6000
2201,-118.43,33.96,38.0,1104.0,216.0,415.0,163.0,6.1985
400,-122.47,37.75,49.0,2747.0,472.0,1281.0,448.0,5.4820
3834,-120.61,35.10,14.0,2919.0,691.0,1896.0,577.0,2.4003
...,...,...,...,...,...,...,...,...
3497,-117.82,35.03,30.0,2555.0,510.0,1347.0,467.0,3.3693
3492,-122.24,37.76,52.0,1846.0,471.0,827.0,446.0,2.6833
2177,-120.97,37.61,16.0,1326.0,375.0,884.0,375.0,1.8710
3557,-117.94,33.77,33.0,2964.0,747.0,2235.0,718.0,3.2591


In [48]:
model, aug_x_binned, train_errors = pipeline_train(aug_x, aug_y, test_x, test_y)

Train RMS error: 36106.925589106075
Test RMS error: 51843.32278558209


In [49]:
slices = pipeline_sliceline(aug_x, aug_x_binned, train_errors, alpha = 0.5, max_l = 3, min_sup = 0, k = 3)

  and array.dtypes.apply(is_sparse).any()
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  (
  (slice_errors / slice_sizes) / self.average_error_ - 1
  ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)


Slices:
           longitude     latitude housing_median_age  total_rooms  \
slice_0  (-inf, inf)  (-inf, inf)        (-inf, inf)  (-inf, inf)   

        total_bedrooms   population   households median_income  
slice_0    (-inf, inf)  (-inf, inf)  (-inf, inf)   [3.01, inf)  
Existing counts: [2965]


In [25]:
additional_data = pipeline_dt(sources, costs, slices, [290])

In [26]:
additional_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.21,37.78,46.0,2239.0,508.0,1390.0,569.0,2.7352,137300.0
1,-118.16,34.09,52.0,1722.0,448.0,1122.0,425.0,3.1204,224000.0
2,-120.41,36.77,24.0,1335.0,312.0,1180.0,267.0,1.9470,68900.0
3,-124.09,40.44,38.0,2220.0,426.0,1041.0,401.0,2.3947,70500.0
4,-120.95,37.61,17.0,4054.0,654.0,2034.0,667.0,4.6833,142200.0
...,...,...,...,...,...,...,...,...,...
415,-121.23,36.33,23.0,2095.0,536.0,1858.0,457.0,3.0543,92400.0
416,-120.02,36.80,25.0,1270.0,255.0,1050.0,245.0,2.1618,55300.0
417,-116.94,32.81,8.0,2517.0,632.0,1686.0,613.0,2.1360,143500.0
418,-119.01,35.34,36.0,973.0,219.0,613.0,187.0,1.5625,46700.0


In [50]:
aug_x, aug_y = pipeline_augment(aug_x, aug_y, additional_data, train.columns)
aug_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
4457,-122.22,38.08,37.0,4590.0,857.0,2920.0,832.0,3.4360
861,-122.05,37.95,20.0,563.0,107.0,246.0,123.0,5.4482
2278,-117.25,32.79,27.0,848.0,300.0,455.0,298.0,3.0774
2020,-122.29,37.98,27.0,2133.0,347.0,850.0,350.0,5.1046
29,-121.35,38.66,8.0,3322.0,805.0,1694.0,774.0,2.7011
...,...,...,...,...,...,...,...,...
3497,-118.60,34.16,32.0,3999.0,667.0,1628.0,631.0,6.0794
3492,-117.19,33.69,3.0,6484.0,1037.0,3295.0,1074.0,4.5881
2177,-117.11,32.73,34.0,1096.0,221.0,574.0,223.0,3.8355
3557,-121.95,37.29,30.0,3734.0,813.0,1834.0,824.0,3.4505


In [56]:
model, aug_x_binned, train_errors = pipeline_train(aug_x, aug_y, test_x, test_y)

Train RMS error: 35600.56554407407
Test RMS error: 52041.94781585272


In [57]:
slices = pipeline_sliceline(aug_x, aug_x_binned, train_errors, alpha = 0.5, max_l = 3, min_sup = 0, k = 3)

  and array.dtypes.apply(is_sparse).any()
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  (
  (slice_errors / slice_sizes) / self.average_error_ - 1
  ) - (1 - self.alpha) * (n_row_x_encoded / slice_sizes - 1)


Slices:
                  longitude        latitude housing_median_age  total_rooms  \
slice_0         (-inf, inf)     (-inf, inf)        (-inf, inf)  (-inf, inf)   
slice_1  [-122.33, -118.28)  [33.89, 37.95)        (-inf, inf)  (-inf, inf)   

        total_bedrooms   population   households median_income  
slice_0    (-inf, inf)  (-inf, inf)  (-inf, inf)   [2.85, inf)  
slice_1    (-inf, inf)  (-inf, inf)  (-inf, inf)   (-inf, inf)  
Existing counts: [3465, 1991]


In [58]:
additional_data = pipeline_dt(sources, costs, slices, [340, 190])

In [59]:
additional_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.21,37.78,46.0,2239.0,508.0,1390.0,569.0,2.7352,137300.0
1,-118.16,34.09,52.0,1722.0,448.0,1122.0,425.0,3.1204,224000.0
2,-120.41,36.77,24.0,1335.0,312.0,1180.0,267.0,1.9470,68900.0
3,-124.09,40.44,38.0,2220.0,426.0,1041.0,401.0,2.3947,70500.0
4,-120.95,37.61,17.0,4054.0,654.0,2034.0,667.0,4.6833,142200.0
...,...,...,...,...,...,...,...,...,...
465,-121.82,36.54,22.0,1746.0,363.0,886.0,364.0,5.5469,378800.0
466,-118.27,33.94,30.0,1764.0,397.0,1406.0,362.0,1.4490,93100.0
467,-121.55,39.51,50.0,1050.0,288.0,485.0,260.0,1.1607,51700.0
468,-118.21,34.21,41.0,1676.0,263.0,757.0,255.0,4.7734,450800.0


In [61]:
model, aug_x_binned, train_errors = pipeline_train(aug_x, aug_y, test_x, test_y)

Train RMS error: 35600.56554407407
Test RMS error: 52041.94781585272
