In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
import statistics as sts
from sliceline.slicefinder import Slicefinder
import optbinning
from dt import *

**Reproducibility Notes**

The PyPI `sliceline` package requires Python 3.7 through 3.10.0, which is not the most recent Python release. Creating a virtual environment with Python 3.9 should work.

## Defining Reusable Methods

### Data Manipulation

In [95]:
def parse_file(filename):
    """Load a CSV file and separate it into two parts via ``dt.split_xy``.

    Args:
        filename: Path of the CSV file; handed straight to ``pd.read_csv``.

    Returns:
        A 3-tuple ``(x, y, frame)``: the two values produced by
        ``dt.split_xy(frame)`` followed by the complete loaded frame.
        (Presumably ``x`` holds the features and ``y`` the target; confirm
        against ``dt.split_xy``.)
    """
    frame = pd.read_csv(filename)
    features, target = dt.split_xy(frame)
    return features, target, frame

### Model Training

In [96]:
def train_model(train_x, train_y):
    """Fit a ``HistGradientBoostingRegressor`` on the given training data.

    The estimator is created with ``random_state=42`` so repeated runs use
    the same seed.

    Args:
        train_x: Training features, passed to ``fit`` unchanged.
        train_y: Training targets, passed to ``fit`` unchanged.

    Returns:
        The fitted estimator.
    """
    regressor = HistGradientBoostingRegressor(random_state=42)
    # scikit-learn's ``fit`` returns the estimator itself.
    return regressor.fit(train_x, train_y)

### Error Analysis

In [97]:
def get_errors(model, x, y):
    """Compute the element-wise squared error of ``model`` on ``(x, y)``.

    Args:
        model: Any object exposing ``predict(x)``.
        x: Inputs forwarded to ``model.predict``.
        y: Reference values; must support ``y - predictions`` and ``** 2``.

    Returns:
        ``(y - model.predict(x)) ** 2``, with whatever type that arithmetic
        yields for the given ``y``.
    """
    residuals = y - model.predict(x)
    return residuals ** 2

In [98]:
def get_rms(arr):
    """Return the square root of the arithmetic mean of ``arr``.

    No squaring is done here: the values are averaged as given, so the
    result is a root-mean-square only when ``arr`` already holds squared
    quantities (as the outputs of ``get_errors`` do).

    Args:
        arr: Non-empty collection of numbers accepted by ``statistics.mean``.

    Returns:
        ``sqrt(mean(arr))`` as a float.

    Raises:
        statistics.StatisticsError: If ``arr`` is empty.
        ValueError: If the mean is negative.
    """
    return math.sqrt(sts.mean(arr))

In [99]:
def get_rms_error(model, x, y):
    """Return the root-mean-square prediction error of ``model`` on ``(x, y)``.

    Combines ``get_errors`` (squared errors) with ``get_rms`` (square root
    of their mean).

    Args:
        model: Object exposing ``predict``.
        x: Inputs to predict on.
        y: Reference values.

    Returns:
        The RMS error as a float.
    """
    return get_rms(get_errors(model, x, y))

### Binning

In [100]:
def bin_xs(train_x, train_errors):
    """Bin every column of ``train_x`` against the training errors.

    Each column is run through
    ``optbinning.ContinuousOptimalBinning(max_n_bins=5)`` with
    ``train_errors`` as the target and ``metric="bins"``.

    Args:
        train_x: DataFrame of features; every column is binned.
        train_errors: Per-row target used to fit the binning.

    Returns:
        A new DataFrame with the same column names as ``train_x`` holding the
        transformed values. It is built from a plain array, so it carries a
        default index rather than ``train_x``'s index.
    """
    binner = optbinning.ContinuousOptimalBinning(max_n_bins=5)
    feature_names = train_x.columns

    binned_columns = []
    for name in feature_names:
        # The one binner object is re-fitted on each column in turn.
        binned_columns.append(
            binner.fit_transform(train_x[name], train_errors, metric="bins")
        )

    # Stacking yields one row per feature; transpose to one row per sample.
    stacked = np.array(binned_columns).T
    return pd.DataFrame(stacked, columns=feature_names)

### Sliceliner

In [101]:
def get_slices(train_x_binned, train_errors, alpha = 0.9, k=1, max_l = 3, min_sup = 0, verbose = False):
    """Run ``Slicefinder`` and return its top slices as a DataFrame.

    Args:
        train_x_binned: Binned features the slice finder is fitted on.
        train_errors: Per-row errors the slice finder is fitted against.
        alpha, k, max_l, min_sup, verbose: Forwarded unchanged to the
            ``Slicefinder`` constructor under the same names.

    Returns:
        DataFrame built from ``top_slices_``, with ``feature_names_in_`` as
        columns and ``get_feature_names_out()`` as the index.
    """
    finder = Slicefinder(alpha=alpha, k=k, max_l=max_l, min_sup=min_sup, verbose=verbose)
    finder.fit(train_x_binned, train_errors)
    return pd.DataFrame(
        finder.top_slices_,
        index=finder.get_feature_names_out(),
        columns=finder.feature_names_in_,
    )

In [102]:
# Reformat slices returned from sliceliner as dataframe into a matrix of strings
def reformat_slices(slice_df):
    """Turn a slice DataFrame into the parsed form produced by ``dt.parse_slices``.

    Missing entries are replaced with the string ``'(-inf, inf)'`` before the
    rows are handed to ``dt.parse_slices``.

    Note:
        The fill is done in place, so the caller's ``slice_df`` is modified.

    Args:
        slice_df: DataFrame of slices, one row per slice.

    Returns:
        Whatever ``dt.parse_slices`` returns for the list of row lists.
    """
    # In-place on purpose: callers see the filled frame afterwards.
    slice_df.fillna('(-inf, inf)', inplace=True)
    return dt.parse_slices(slice_df.values.tolist())

In [103]:
# Get the number of times each slice already exists in xs
def get_slice_cnts(xs, slices):
    """Count, for every slice, how many rows of ``xs`` belong to it.

    Membership is decided by ``dt.belongs_to_slice(slice_, row)``, where
    ``row`` is one row of ``xs`` as a plain list.

    Args:
        xs: DataFrame whose rows are tested.
        slices: Iterable of slices in the form ``dt.belongs_to_slice`` expects.

    Returns:
        A list of integer counts, one per slice and in the same order as
        ``slices`` (empty when ``slices`` is empty).
    """
    # The row lists do not depend on the slice, so build them once instead of
    # re-converting the whole frame for every slice.
    rows = xs.values.tolist()
    return [
        sum(1 for row in rows if dt.belongs_to_slice(slice_, row))
        for slice_ in slices
    ]

### Putting the Pipeline Together

In [104]:
# Train a model, report errors, and return model & binned train set
def pipeline_train(train_x, train_y, test_x, test_y):
    """Train a model, print its RMS errors and bin the training features.

    Args:
        train_x: Training features.
        train_y: Training targets.
        test_x: Features used only for the reported test error.
        test_y: Targets used only for the reported test error.

    Returns:
        A 3-tuple ``(model, train_x_binned, train_errors)``: the fitted
        model, the output of ``bin_xs`` for the training features, and the
        per-row squared training errors.
    """
    model = train_model(train_x, train_y)

    # Squared training errors are both reported and used as the binning target.
    train_errors = get_errors(model, train_x, train_y)
    print("Train RMS error:", get_rms(train_errors))

    test_errors = get_errors(model, test_x, test_y)
    print("Test RMS error:", get_rms(test_errors))

    binned = bin_xs(train_x, train_errors)
    return model, binned, train_errors

In [105]:
def pipeline_sliceline(train_x, train_x_binned, train_errors, alpha = 0.9, max_l = 3, min_sup = 0, k = 1):
    """Find the top slices, print them with their current row counts, and return them.

    Args:
        train_x: Unbinned training features; used to count how many existing
            rows fall into each slice.
        train_x_binned: Binned training features handed to the slice finder.
        train_errors: Per-row errors handed to the slice finder.
        alpha, max_l, min_sup, k: Slice finder settings, forwarded to
            ``get_slices``.

    Returns:
        The slices in the parsed form returned by ``reformat_slices``.
    """
    # Forward the caller's settings. Previously alpha, max_l and min_sup were
    # hard-coded here to 0.9 / 3 / 0, so the arguments had no effect.
    slices_df = get_slices(
        train_x_binned,
        train_errors,
        alpha=alpha,
        k=k,
        max_l=max_l,
        min_sup=min_sup,
        verbose=False,
    )
    # reformat_slices fills missing entries of slices_df in place, so the frame
    # printed below shows the filled values.
    slices = reformat_slices(slices_df)
    existing_cnts = get_slice_cnts(train_x, slices)
    print("Slices:")
    print(slices_df)
    print("Existing counts:", existing_cnts)
    return slices

In [106]:
# Obtain additional data
def pipeline_dt(sources, costs, slices, query_counts):
    """Build a ``DT`` instance and run it for the requested counts.

    ``DT`` is constructed as ``DT(sources, costs, slices, None, batch=100)``
    and its ``run`` method is called with ``query_counts``.

    Args:
        sources: Data sources handed to ``DT``.
        costs: Costs handed to ``DT`` (presumably one per source; confirm).
        slices: Slices handed to ``DT``.
        query_counts: Argument for ``DT.run`` (presumably the number of rows
            wanted per slice; confirm against ``DT``).

    Returns:
        Whatever ``DT.run`` returns.
    """
    tailor = DT(sources, costs, slices, None, batch=100)
    return tailor.run(query_counts)

In [107]:
# Combine existing dataset with additional data
# Additional data is shuffled in
def pipeline_augment(train_x, train_y, additional_data, features):
    """Append additional rows to the training data and shuffle the result.

    Args:
        train_x: Existing training features.
        train_y: Existing training targets.
        additional_data: New rows, convertible to a DataFrame.
        features: Column names assigned to ``additional_data``.

    Returns:
        A 2-tuple ``(aug_x, aug_y)`` of the combined and shuffled features
        and targets. The index is renumbered on concatenation and then
        carried through the shuffle, so it is no longer in ascending order.
    """
    shuffle_seed = 12345

    extra_x, extra_y = split_xy(pd.DataFrame(additional_data, columns=features))

    merged_x = pd.concat([train_x, extra_x], ignore_index=True)
    shuffled_x = merged_x.sample(frac=1, random_state=shuffle_seed)

    merged_y = pd.concat([train_y, extra_y], ignore_index=True)
    # The same seed is used for both shuffles so that features and targets
    # stay row-aligned (this relies on both having the same length).
    shuffled_y = merged_y.sample(frac=1, random_state=shuffle_seed)

    return shuffled_x, shuffled_y

## Housing Data Example

In [108]:
# Load the initial training data and the data used for the reported test error.
# parse_file returns (x, y, full frame) for each CSV.
train_x, train_y, train = parse_file('data/housing0.csv')
test_x, test_y, test = parse_file('data/housing1.csv')
# Display the training features.
train_x

In [109]:
# Round 0: fit on the initial data; prints train/test RMS error and returns
# the binned training features plus the per-row squared training errors.
model, train_x_binned, train_errors = pipeline_train(train_x, train_y, test_x, test_y)

In [110]:
# Find up to k = 3 slices on the initial data and print them together with
# the number of existing training rows in each.
slices = pipeline_sliceline(train_x, train_x_binned, train_errors, alpha = 0.5, max_l = 3, min_sup = 0, k = 3)

In [111]:
# CSV files offered to DT as sources of additional rows.
sources = [
    'data/housing2.csv',
    'data/housing3.csv',
    'data/housing4.csv'
]
# Costs passed to DT alongside the sources (presumably one per source, in order).
costs = [1.0, 1.0, 1.0]

In [112]:
# We will consider only one slice
# median income in [2.60, inf)
# NOTE(review): two counts are listed although the comment above mentions a
# single slice; confirm how query_counts maps onto the slices.
query_counts = [ 300, 200 ]

In [113]:
# Round 1: run DT over the sources for the slices found above.
additional_data = pipeline_dt(sources, costs, slices, query_counts)
# Display what DT returned.
additional_data

In [114]:
# Merge the round-1 data into the original training set (shuffled);
# train.columns supplies the column names for the new rows.
aug_x, aug_y = pipeline_augment(train_x, train_y, additional_data, train.columns)
aug_x

In [115]:
# Retrain on the augmented data; the printed RMS errors can be compared with round 0.
model, aug_x_binned, train_errors = pipeline_train(aug_x, aug_y, test_x, test_y)

In [116]:
# Re-run slice finding on the augmented data.
slices = pipeline_sliceline(aug_x, aug_x_binned, train_errors, alpha = 0.5, max_l = 3, min_sup = 0, k = 3)

In [117]:
# Round 2: run DT for the new slices; the query counts are given inline.
additional_data = pipeline_dt(sources, costs, slices, [290])

In [118]:
# Display the round-2 result from DT.
additional_data

In [119]:
# Merge the round-2 data into the already-augmented set.
# Note: this cell rebinds aug_x / aug_y from their own previous values, so
# re-running it appends the same rows again.
aug_x, aug_y = pipeline_augment(aug_x, aug_y, additional_data, train.columns)
aug_x

In [120]:
# Retrain on the data augmented through round 2.
model, aug_x_binned, train_errors = pipeline_train(aug_x, aug_y, test_x, test_y)

In [121]:
# Slice finding on the round-2 model's training errors.
slices = pipeline_sliceline(aug_x, aug_x_binned, train_errors, alpha = 0.5, max_l = 3, min_sup = 0, k = 3)

In [122]:
# Round 3: run DT for the latest slices; three query counts are given inline.
additional_data = pipeline_dt(sources, costs, slices, [320, 180, 270])

In [123]:
# Display the round-3 result from DT.
additional_data

In [124]:
# Merge the round-3 data into the training set before retraining, following
# the acquire -> augment -> train pattern of the earlier rounds. Without this
# step the model is refit on exactly the same data as the previous round.
aug_x, aug_y = pipeline_augment(aug_x, aug_y, additional_data, train.columns)
model, aug_x_binned, train_errors = pipeline_train(aug_x, aug_y, test_x, test_y)