# Example for running CARTE on multi-tables (joint learning)
In this example, we run CARTE for the multi-table task. We continue the example with the Wine Poland dataset, which contains information about wines on the Polish market. The task is to predict the price.

In [1]:
# Set the current working directory and import packages
import os
from pathlib import Path

# The project packages imported below (`src`, `configs`) are expected to live in
# the parent of the notebook's directory. Only move up when `src` is not already
# visible from the current directory, so that re-running this cell does not climb
# one directory further each time and break the imports.
if not (Path.cwd() / "src").is_dir():
    os.chdir(Path.cwd().parent)

import torch
import json
import statistics
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score
from sklearn.preprocessing import PowerTransformer, StandardScaler
from src.carte_table_to_graph import Table2GraphTransformer
from src.carte_estimator import CARTERegressor, CARTEMultitableRegressor
from configs.directory import config_directory

In [2]:
# Define necessary functions

def _load_data(data_name):
    """Load the preprocessed table and its configuration for one dataset.

    Parameters
    ----------
    data_name : str
        Name of the dataset folder located under
        ``config_directory['data_singletable']``.

    Returns
    -------
    data_pd : pandas.DataFrame
        Contents of ``raw.parquet`` with missing values filled with ``np.nan``.
    config_data : object
        Parsed contents of ``config_data.json`` (indexed by string keys such as
        ``"target_name"`` elsewhere in this notebook).
    """
    data_dir = f"{config_directory['data_singletable']}/{data_name}"

    data_pd = pd.read_parquet(f"{data_dir}/raw.parquet")
    # Normalize every missing-value marker to np.nan.
    data_pd = data_pd.fillna(value=np.nan)

    # The context manager closes the file even if json.load raises.
    with open(f"{data_dir}/config_data.json", encoding="utf-8") as config_file:
        config_data = json.load(config_file)

    return data_pd, config_data

def _transform_to_graph(data, config_data):
    """Fit a fresh Table2GraphTransformer on a whole table and return its graphs.

    The column named by ``config_data["target_name"]`` is split off and passed
    as ``y``; the remaining columns are passed as ``X``.
    """
    target_column = config_data["target_name"]
    features = data.drop(columns=target_column)
    targets = np.array(data[target_column])
    return Table2GraphTransformer().fit_transform(X=features, y=targets)


def _set_split(data, data_config, num_train, random_state):
    """Split a table into train/test parts with a group-aware shuffle split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that still contains the target column.
    data_config : dict
        Must provide ``"target_name"`` and ``"repeated"``; ``"entity_name"`` is
        read only when ``"repeated"`` is truthy.
    num_train : int
        Number of groups kept for training; all remaining groups go to the test
        split.
    random_state : int
        Seed forwarded to ``GroupShuffleSplit``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows (target column removed) of each split.
    y_train, y_test : numpy.ndarray
        Target values of each split.
    """
    target_name = data_config["target_name"]
    y = np.array(data[target_name])
    X = data.drop(columns=target_name)

    # With repeated entities, group rows by the entity column so that one entity
    # never ends up on both sides of the split; otherwise every row is its own group.
    entity_name = (
        data_config["entity_name"] if data_config["repeated"] else np.arange(len(y))
    )
    groups = np.array(data.groupby(entity_name).ngroup())
    num_groups = len(np.unique(groups))

    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=int(num_groups - num_train),
        random_state=random_state,
    )
    idx_train, idx_test = next(iter(splitter.split(X=y, groups=groups)))

    return X.iloc[idx_train], X.iloc[idx_test], y[idx_train], y[idx_test]

def _prepare_carte(
    data_t,
    data_s_total,
    config_data_t,
    config_data_s_total,
    num_train,
    random_state,
):
    """Prepare target and source graphs for CARTE joint (multi-table) learning.

    The target table is split with ``_set_split`` and converted to graphs with a
    transformer fitted on the training part only; target graphs get ``domain = 0``.
    Each source table is converted as a whole with ``_transform_to_graph``, its
    targets are mapped onto the target task, graphs with a null target are
    dropped, and the remaining graphs get ``domain = 1, 2, ...`` in the iteration
    order of ``data_s_total``.

    Parameters
    ----------
    data_t : pandas.DataFrame
        Target table (including its target column).
    data_s_total : dict
        Source tables keyed by dataset name.
    config_data_t : dict
        Configuration of the target table; ``"task"`` selects the target handling.
    config_data_s_total : dict
        Configurations of the source tables, keyed like ``data_s_total``.
    num_train : int
        Train size forwarded to ``_set_split``.
    random_state : int
        Seed forwarded to ``_set_split``.

    Returns
    -------
    Xt_carte_train, Xt_carte_test
        Graphs of the target train/test parts.
    Xs_carte : dict
        Prepared source graphs keyed by dataset name.
    yt_train, yt_test : numpy.ndarray
        Target values of the target train/test parts.
    """
    # Preprocess target data
    Xt = data_t.copy()
    Xt_train, Xt_test, yt_train, yt_test = _set_split(
        Xt,
        config_data_t,
        num_train,
        random_state=random_state,
    )

    graph_transformer = Table2GraphTransformer()
    Xt_carte_train = graph_transformer.fit_transform(X=Xt_train, y=yt_train)
    Xt_carte_test = graph_transformer.transform(Xt_test)
    for graph in Xt_carte_train:
        graph.domain = 0
    for graph in Xt_carte_test:
        graph.domain = 0

    task = config_data_t["task"]
    if task == "regression":
        # Scalers fitted on the target's training labels; they are used below to
        # map source targets onto the scale of the target variable.
        yt_column = np.array(yt_train).reshape(-1, 1)
        scaler_t = PowerTransformer().fit(yt_column)
        scaler_t_std = StandardScaler().fit(yt_column)

    # Preprocess source data
    Xs_carte = dict()
    for domain_marker, (data_name, data_s) in enumerate(data_s_total.items(), start=1):
        config_s = config_data_s_total[data_name]
        source_task = config_s["task"]
        Xs_carte_temp = _transform_to_graph(data_s, config_s)
        ys = np.array([graph.y.cpu().detach().numpy() for graph in Xs_carte_temp])

        # Map the source targets onto the target task
        if task == "regression":
            if source_task == "classification":
                ys_scaled = StandardScaler().fit_transform(ys)
                ys_train = scaler_t_std.inverse_transform(ys_scaled)
            else:
                ys_scaled = PowerTransformer().fit_transform(ys)
                ys_train = scaler_t.inverse_transform(ys_scaled)
            ys_train = ys_train.reshape(-1)
        else:
            ys_train = ys.copy().reshape(-1)
            if source_task == "regression":
                # Binarize at the median of the non-null values: below -> 0,
                # at or above -> 1. Null targets are left as NaN so that they
                # are excluded below instead of being turned into label 1.
                valid = ~np.isnan(ys_train)
                if valid.any():
                    med_value = np.median(ys_train[valid])
                    ys_train[valid] = ys_train[valid] >= med_value

        # Exclude null targets. Graphs are selected by position, which is how
        # `ys_train` lines up with `Xs_carte_temp`.
        keep = ~np.isnan(ys_train)
        Xs_carte_ = [graph for graph, kept in zip(Xs_carte_temp, keep) if kept]
        ys_train = ys_train[keep]
        for graph, y_value in zip(Xs_carte_, ys_train):
            graph.y = torch.tensor([y_value])
            graph.domain = domain_marker
        Xs_carte[data_name] = Xs_carte_

    return Xt_carte_train, Xt_carte_test, Xs_carte, yt_train, yt_test


For joint learning, we preprocess the target table exactly as in the singletable case (fit_transform/transform), but for the source data, we build a dictionary containing each source separately. Moreover, we include a domain marker indicating which table each data point originates from.

Let us first run the singletable case for the Wine Poland dataset.

In [3]:
# Basic specifications for the singletable run
data_name = "wina_pl"  # name of the data
num_train = 128  # train size
random_state = 1  # random state of the train/test split

# Load the table, split it, and convert both parts to graphs
# (the transformer is fitted on the training part only)
data, data_config = _load_data(data_name)
X_train_, X_test_, y_train, y_test = _set_split(
    data, data_config, num_train, random_state=random_state
)
preprocessor = Table2GraphTransformer()
X_train = preprocessor.fit_transform(X_train_, y=y_train)
X_test = preprocessor.transform(X_test_)

# Estimator parameters
fixed_params = {
    "num_model": 10,  # 10 models for the bagging strategy
    "disable_pbar": False,  # set to True for a cleaner output
    "random_state": 0,
    "device": "cpu",
    "n_jobs": 10,
}

# Fit the regressor and predict on the held-out part
estimator = CARTERegressor(**fixed_params)
estimator.fit(X=X_train, y=y_train)
y_pred = estimator.predict(X_test)

# R2 score of the predictions
score = r2_score(y_test, y_pred)
print("\nThe R2 score for CARTE Singletable:", "{:.4f}".format(score))

Model No. xx:  10%|█         | 51/500 [01:13<10:47,  1.44s/it]
Model No. xx:   9%|▉         | 46/500 [01:20<13:09,  1.74s/it]
Model No. xx:  11%|█         | 54/500 [01:22<11:21,  1.53s/it]
Model No. xx:  10%|█         | 51/500 [01:23<12:17,  1.64s/it]
Model No. xx:  12%|█▏        | 61/500 [01:27<10:27,  1.43s/it]
Model No. xx:  12%|█▏        | 62/500 [01:27<10:20,  1.42s/it]
Model No. xx:  13%|█▎        | 66/500 [01:27<09:38,  1.33s/it]
Model No. xx:  22%|██▏       | 112/500 [01:34<05:28,  1.18it/s]
Model No. xx:  20%|█▉        | 98/500 [01:37<06:38,  1.01it/s]
Model No. xx:  23%|██▎       | 114/500 [01:41<05:42,  1.13it/s]



The R2 score for CARTE Singletable: 0.3460


Second, we include a source dataset, Wine Vivino, which contains information about wine bottles scraped from Vivino’s website.

In [4]:
# Dataset names and split settings
target_data_name = "wina_pl"
source_data_name = ["wine_vivino_price"]
num_train = 128
random_state = 1

# Target table and its configuration
data_t, config_data_t = _load_data(target_data_name)

# Source tables and their configurations, keyed by dataset name
data_s_total, config_data_s_total = {}, {}
for data_name in source_data_name:
    data_s, config_data_s = _load_data(data_name)
    data_s_total[data_name] = data_s.copy()
    config_data_s_total[data_name] = config_data_s

# Build the target train/test graphs and the per-source graph dictionary
Xt_carte_train, Xt_carte_test, Xs_carte, yt_train, yt_test = _prepare_carte(
    data_t=data_t,
    data_s_total=data_s_total,
    config_data_t=config_data_t,
    config_data_s_total=config_data_s_total,
    num_train=num_train,
    random_state=random_state,
)

In [5]:
# First row of the original source table
first_source_row = data_s_total["wine_vivino_price"].iloc[0]
print("Original Source Data:\n", first_source_row)

# First prepared source graph
first_source_graph = Xs_carte["wine_vivino_price"][0]
print("\nGraph Source Data:\n", first_source_graph)

Original Source Data:
 Name                          Pomerol 2011
Region                     Pomerol, France
Winery               Château La Providence
Rating                                 4.2
Number_Of_Ratings                    100.0
Price                             4.553877
Year                                  2011
Wine_Type                              red
Name: 0, dtype: object

Graph Source Data:
 Data(x=[8, 300], edge_index=[2, 14], edge_attr=[14, 300], y=[1], g_idx=0, domain=1)


For learning, CARTE multitable also follows the sklearn interface (fit/predict). The CARTE multitable estimator is similar to the singletable estimators, with the additional parameters source_data and target_fraction; the latter controls the fraction of target data when creating the batch.

Note that the CARTE multitable estimator builds additional models, trained jointly with the source data, on top of the models from singletable (see our paper for more specific implementation details). The parameter "num_model" specifies the number of models for each case (wina_pl and wina_pl-wine_vivino_price in this case), resulting in 10 models in total for the bagging strategy.

In [6]:
# Estimator parameters for joint learning with one source
fixed_params = {
    "source_data": Xs_carte,
    "num_model": 5,  # (10 models total wina_pl/wina_pl-wine_vivino_price)
    "n_jobs": 10,
    "random_state": 0,
    "disable_pbar": False,
}

# Fit on the target training graphs and predict on the target test graphs
estimator = CARTEMultitableRegressor(**fixed_params)
estimator.fit(Xt_carte_train, yt_train)
y_pred = estimator.predict(Xt_carte_test)

# R2 score of the predictions
score = r2_score(yt_test, y_pred)
print("\nThe R2 for CARTE Multitable with additional Wine Vivino:", "{:.4f}".format(score))


Model No. xx:  10%|▉         | 49/500 [00:13<02:00,  3.73it/s]
Model No. xx:   9%|▉         | 45/500 [00:13<02:13,  3.40it/s]
Model No. xx:   9%|▉         | 47/500 [00:15<02:30,  3.01it/s]
Model No. xx:   9%|▉         | 46/500 [00:21<03:31,  2.15it/s]
Model No. xx:   9%|▉         | 44/500 [00:12<02:05,  3.65it/s]
Model No. xx:  10%|▉         | 49/500 [01:41<15:38,  2.08s/it]
Model No. xx:   9%|▉         | 47/500 [01:21<13:06,  1.74s/it]
Model No. xx:  15%|█▍        | 73/500 [02:13<13:02,  1.83s/it]
Model No. xx:  15%|█▌        | 76/500 [02:08<11:57,  1.69s/it]
Model No. xx:  22%|██▏       | 111/500 [03:06<10:55,  1.68s/it]



The R2 for CARTE Multitable with additional Wine Vivino: 0.4065


With the CARTE multitable estimator, it is also possible to include more than one source dataset.
Let us run the case with two source datasets, Wine Vivino and Wine.com; the latter contains information on wines scraped from the wine.com website.

In [7]:
# Dataset names and split settings
target_data_name = "wina_pl"
source_data_name = ["wine_vivino_price", "wine_dot_com_prices"]
num_train = 128
random_state = 1

# Target table and its configuration
data_t, config_data_t = _load_data(target_data_name)

# Source tables and their configurations, keyed by dataset name
data_s_total, config_data_s_total = {}, {}
for data_name in source_data_name:
    data_s, config_data_s = _load_data(data_name)
    data_s_total[data_name] = data_s.copy()
    config_data_s_total[data_name] = config_data_s

# Build the target train/test graphs and the per-source graph dictionary
Xt_carte_train, Xt_carte_test, Xs_carte, yt_train, yt_test = _prepare_carte(
    data_t=data_t,
    data_s_total=data_s_total,
    config_data_t=config_data_t,
    config_data_s_total=config_data_s_total,
    num_train=num_train,
    random_state=random_state,
)

# Estimator parameters for joint learning with two sources
fixed_params = {
    "source_data": Xs_carte,
    "num_model": 5,  # (15 models total with two sources)
    "n_jobs": 15,
    "random_state": 0,
    "disable_pbar": False,
}

# Fit on the target training graphs and predict on the target test graphs
estimator = CARTEMultitableRegressor(**fixed_params)
estimator.fit(Xt_carte_train, yt_train)
y_pred = estimator.predict(Xt_carte_test)

# R2 score of the predictions
score = r2_score(yt_test, y_pred)
print("\nThe R2 for CARTE Multitable with two sources:", "{:.4f}".format(score))

Model No. xx:  10%|▉         | 49/500 [00:11<01:41,  4.45it/s]
Model No. xx:  10%|▉         | 49/500 [01:08<10:27,  1.39s/it]
Model No. xx:   9%|▉         | 45/500 [00:13<02:18,  3.29it/s]
Model No. xx:  15%|█▌        | 77/500 [02:18<12:38,  1.79s/it]
Model No. xx:   9%|▉         | 47/500 [00:15<02:26,  3.10it/s]
Model No. xx:  13%|█▎        | 67/500 [02:19<14:59,  2.08s/it]
Model No. xx:   9%|▉         | 46/500 [00:15<02:34,  2.93it/s]
Model No. xx:  11%|█         | 53/500 [02:00<16:57,  2.28s/it]]
Model No. xx:  14%|█▍        | 70/500 [02:26<15:00,  2.09s/it]]
Model No. xx:  25%|██▌       | 127/500 [04:02<11:52,  1.91s/it]
Model No. xx:   9%|▉         | 44/500 [00:14<02:31,  3.01it/s]
Model No. xx:   9%|▉         | 47/500 [01:33<14:58,  1.98s/it]
Model No. xx:  22%|██▏       | 111/500 [03:36<12:38,  1.95s/it]
Model No. xx:  21%|██        | 103/500 [03:28<13:23,  2.03s/it]
Model No. xx:  30%|███       | 150/500 [04:38<10:49,  1.86s/it]



The R2 for CARTE Multitable with two sources: 0.4510
