In [5]:
import pandas as pd
import numpy as np
import math
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import HistGradientBoostingRegressor
import statistics as sts
from sliceline.slicefinder import Slicefinder
import optbinning

**Reproducibility Notes**

The PyPi `sliceline` package requires Python 3.7~3.10.0, which is not the most up-to-date python version. Creating a virtual environment with python 3.9 should work. 

In [6]:
df = pd.read_csv('data/housing.csv')
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [7]:
# Really stupid statistics
print('longitude', min(set(df['longitude'])), max(set(df['longitude'])))
print('latitude', min(set(df['latitude'])), max(set(df['latitude'])))
print('median age', set(df['housing_median_age']))
print('total rooms', min(set(df['total_rooms'])), max(set(df['total_rooms'])))
print('total bedrooms', min(set(df['total_bedrooms'])), max(set(df['total_bedrooms'])))
print('population', min(set(df['population'])), max(set(df['population'])))
print('households', min(set(df['households'])), max(set(df['households'])))
print('median income', min(set(df['median_income'])), max(set(df['median_income'])))
print('median house value', min(set(df['median_house_value'])), max(set(df['median_house_value'])))
print('ocean proximity', set(df['ocean_proximity']))

longitude -124.35 -114.31
latitude 32.54 41.95
median age {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0}
total rooms 2.0 39320.0
total bedrooms nan nan
population 3.0 35682.0
households 1.0 6082.0
median income 0.4999 15.0001
median house value 14999.0 500001.0
ocean proximity {'NEAR OCEAN', 'ISLAND', 'NEAR BAY', '<1H OCEAN', 'INLAND'}


In [8]:
# Shuffle DF to simulate randomness
df = df.sample(frac=1)
# Split data into 5 splits (train, test, 3 additional DS)
num_splits = 5
splits = [df[i::num_splits] for i in range(num_splits)]
for i, split in enumerate(splits):
    split.to_csv('data/housing' + str(i) + '.csv', index=False)

In [9]:
train_df = pd.read_csv('data/housing0.csv')
train_y = train_df['median_house_value']
train_x = train_df.drop('median_house_value', axis=1)
train_x = pd.get_dummies(train_x, columns=['ocean_proximity'])

In [10]:
train_x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-119.81,36.85,17.0,2340.0,370.0,1174.0,396.0,4.2304,False,True,False,False
1,-118.32,34.26,24.0,5106.0,1010.0,2310.0,957.0,4.4375,True,False,False,False
2,-122.09,37.40,36.0,1575.0,379.0,1036.0,382.0,5.1408,False,False,True,False
3,-118.28,33.77,47.0,307.0,69.0,374.0,65.0,2.9063,True,False,False,False
4,-119.98,37.43,12.0,2776.0,592.0,1236.0,489.0,2.5551,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
4123,-117.97,34.01,33.0,2006.0,381.0,1410.0,346.0,3.7083,True,False,False,False
4124,-122.23,37.76,52.0,2269.0,323.0,805.0,321.0,4.7188,False,False,True,False
4125,-118.05,33.96,37.0,2622.0,652.0,2778.0,644.0,2.9714,True,False,False,False
4126,-118.34,34.26,37.0,1776.0,301.0,702.0,265.0,5.2661,True,False,False,False


In [11]:
model = HistGradientBoostingRegressor(random_state=42)
model.fit(train_x, train_y)

In [12]:
test_df = pd.read_csv('data/housing1.csv')
test_y = train_df['median_house_value']
test_x = train_df.drop('median_house_value', axis=1)
test_x = pd.get_dummies(test_x, columns=['ocean_proximity'])

In [13]:
y_pred = model.predict(test_x)
y_pred

array([115568.1895561 , 245237.36010467, 309358.55619788, ...,
       156203.07018579, 283230.35847848, 163649.69046774])

In [14]:
training_errors = (test_y - y_pred)**2
training_errors

0       4.480922e+08
1       2.887704e+09
2       1.994387e+09
3       1.223810e+07
4       8.952542e+06
            ...     
4123    1.648131e+07
4124    3.730463e+09
4125    1.678483e+07
4126    1.002966e+09
4127    1.306800e+09
Name: median_house_value, Length: 4128, dtype: float64

In [15]:
mean_training_error = sts.mean(training_errors)
math.sqrt(mean_training_error)

32816.47241779146

In [16]:
optimal_binner = optbinning.ContinuousOptimalBinning(max_n_bins=5)

X_trans = pd.DataFrame(np.array(
    [
        optimal_binner.fit_transform(train_x[col], training_errors, metric="bins") for col in train_x.columns
    ]
).T, columns=train_x.columns)

In [17]:
X_trans

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,"[-119.99, -119.17)","[32.86, 37.75)","(-inf, 27.50)","[1755.50, inf)","[350.50, 379.50)","[964.00, inf)","[356.50, inf)","[3.13, inf)","(-inf, 0.50)","[0.50, inf)","(-inf, 0.50)","(-inf, 0.50)"
1,"[-119.17, inf)","[32.86, 37.75)","(-inf, 27.50)","[1755.50, inf)","[379.50, inf)","[964.00, inf)","[356.50, inf)","[3.13, inf)","[0.50, inf)","(-inf, 0.50)","(-inf, 0.50)","(-inf, 0.50)"
2,"[-122.22, -122.06)","[32.86, 37.75)","[35.50, 49.50)","[1438.50, 1575.50)","[350.50, 379.50)","[964.00, inf)","[356.50, inf)","[3.13, inf)","(-inf, 0.50)","(-inf, 0.50)","[0.50, inf)","(-inf, 0.50)"
3,"[-119.17, inf)","[32.86, 37.75)","[35.50, 49.50)","(-inf, 635.00)","(-inf, 143.50)","[354.00, 529.50)","(-inf, 129.50)","[2.38, 3.13)","[0.50, inf)","(-inf, 0.50)","(-inf, 0.50)","(-inf, 0.50)"
4,"[-119.99, -119.17)","[32.86, 37.75)","(-inf, 27.50)","[1755.50, inf)","[379.50, inf)","[964.00, inf)","[356.50, inf)","[2.38, 3.13)","(-inf, 0.50)","[0.50, inf)","(-inf, 0.50)","(-inf, 0.50)"
...,...,...,...,...,...,...,...,...,...,...,...,...
4123,"[-119.17, inf)","[32.86, 37.75)","[32.50, 35.50)","[1755.50, inf)","[379.50, inf)","[964.00, inf)","[306.50, 356.50)","[3.13, inf)","[0.50, inf)","(-inf, 0.50)","(-inf, 0.50)","(-inf, 0.50)"
4124,"(-inf, -122.22)","[37.75, 37.94)","[49.50, inf)","[1755.50, inf)","[143.50, 323.50)","[529.50, 870.50)","[306.50, 356.50)","[3.13, inf)","(-inf, 0.50)","(-inf, 0.50)","[0.50, inf)","(-inf, 0.50)"
4125,"[-119.17, inf)","[32.86, 37.75)","[35.50, 49.50)","[1755.50, inf)","[379.50, inf)","[964.00, inf)","[356.50, inf)","[2.38, 3.13)","[0.50, inf)","(-inf, 0.50)","(-inf, 0.50)","(-inf, 0.50)"
4126,"[-119.17, inf)","[32.86, 37.75)","[35.50, 49.50)","[1755.50, inf)","[143.50, 323.50)","[529.50, 870.50)","[129.50, 277.50)","[3.13, inf)","[0.50, inf)","(-inf, 0.50)","(-inf, 0.50)","(-inf, 0.50)"


In [None]:
# fitting sliceline
sf = Slicefinder(
    alpha = 0.95,
    k = 1,
    max_l = X_trans.shape[1],
    min_sup = 1,
    verbose = True
)

sf.fit(X_trans, training_errors)