In [1]:

from sklearn.datasets import fetch_openml
import numpy as np
from types import SimpleNamespace
from typing import Tuple, List
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
    
adult = fetch_openml(data_id = 1590, data_home='./data_cache')

data = adult.data

le = LabelEncoder()
label = pd.Series(le.fit_transform(adult.target))


category_cols = ['workclass', 'education', 'race', 'sex', "marital-status", "occupation", "relationship", "native-country"]
continuous_cols = [x for x in data.columns if x not in category_cols]

for col in category_cols:
    data[col] = le.fit_transform(data[col])


    
    
temp = None
for col in category_cols:
    oh_values = OneHotEncoder().fit_transform(data[col].values.reshape((-1, 1))).toarray()
    new_cols = [col + "-" + str(i) for i in range(len(data[col].unique()))]
    oh_values = pd.DataFrame(oh_values, columns = new_cols, dtype=np.int8, index=data.index)
    if temp is None:
        temp = oh_values
    else:
        temp = temp.merge(oh_values, left_index=True, right_index=True)

data = data.merge(temp, left_index=True, right_index=True)
data.drop(category_cols, inplace=True, axis=1)

category_cols = temp.columns

scaler = MinMaxScaler()
data[continuous_cols] = scaler.fit_transform(data[continuous_cols])

  warn(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = le.fit_transform(data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = le.fit_transform(data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = le.fit_transform(data[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

In [2]:
import torch.nn as nn
model_hparams = {
    "encoder_dim" : data.shape[1],
    "predictor_hidden_dim" : 256,
    "predictor_output_dim" : 2,
    'alpha1' : 0.5,
    'alpha2' : 0.5,
    'beta' : 0.5,
    'K' : 10
}
data_hparams = {
    "K" : 10,
    "p_m" : 0.2
}
optim_hparams = {
    "lr" : 0.005
}
scheduler_hparams = {
    'gamma' : 0.3,
    'step_size' : 30
}
num_categoricals = len(continuous_cols)
num_continuous = len(continuous_cols)
loss_fn = nn.CrossEntropyLoss
metric =  "accuracy_score"
metric_params = {}
random_seed = 0

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from misc.scorer import BaseScorer


class AccuracyScorer(BaseScorer):
    def __init__(self, metric: str) -> None:
        super().__init__(metric)
    
    def __call__(self, y, y_hat) -> float:
        return self.metric(y, y_hat.argmax(1))

In [4]:
from pl_vime import PLVIME
pl_vime = PLVIME(model_hparams, "Adam", optim_hparams, "StepLR", scheduler_hparams, 
       num_categoricals, num_continuous, -1, loss_fn,
       AccuracyScorer("accuracy_score"), random_seed)

Global seed set to 0


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(data, label, train_size = 0.7, random_state=random_seed, stratify=label)

X_train, X_unlabeled, y_train, _ = train_test_split(X_train, y_train, train_size = 0.1, random_state=random_seed, stratify=y_train)

In [6]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from data_utils import *

n_gpus = 1
n_jobs = 32
max_epochs = 20
batch_size = 512

pretraining_patience = 10
early_stopping_patience = 30

batch_size = 512

def fit_model(
            model,
    ):
    
    train_ds = VIMESelfDataset(X_train.append(X_unlabeled), data_hparams, continuous_cols, category_cols)
    test_ds = VIMESelfDataset(X_valid, data_hparams, continuous_cols, category_cols)
    
    pl_datamodule = PLDataModule(train_ds, test_ds, batch_size=batch_size)

    model.do_pretraining()

    callbacks = [
        EarlyStopping(
            monitor= 'val_loss', 
            mode = 'min',
            patience = pretraining_patience,
            verbose = False
        )
    ]
    pretraining_path = f'temporary_ckpt_data/pretraining'
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath=pretraining_path,
        filename='pretraining-{epoch:02d}-{val_f1:.4f}',
        save_top_k=1,
        mode = 'min'
    )

    callbacks.append(checkpoint_callback)

    trainer = Trainer(
                    devices = n_gpus,
                    accelerator="cuda" if n_gpus >= 1 else 'cpu',
                    # replace_sampler_ddp=False,
                    max_epochs = max_epochs,
                    num_sanity_val_steps = 2,
                    callbacks = callbacks,
    )

    trainer.fit(model, pl_datamodule)
    
    pretraining_path = checkpoint_callback.best_model_path

    model = model.load_from_checkpoint(pretraining_path)

    model.do_finetuning()
    
        
    train_ds = VIMEClassificationDataset(X_train, y_train.values, data_hparams, X_unlabeled, continuous_cols, category_cols)
    test_ds = VIMEClassificationDataset(X_valid, y_valid.values, data_hparams, None, continuous_cols, category_cols)

    pl_datamodule = PLDataModule(train_ds, test_ds, batch_size = batch_size)
        
    callbacks = [
        EarlyStopping(
            monitor= 'val_' + metric, 
            mode = 'max',
            patience = early_stopping_patience,
            verbose = False
        )
    ]

    checkpoint_path = None

    checkpoint_path = f'temporary_ckpt_data/'
    checkpoint_callback = ModelCheckpoint(
        monitor='val_' + metric,
        dirpath=checkpoint_path,
        filename='{epoch:02d}-{val_f1:.4f}',
        save_top_k=1,
        mode = 'max'
    )

    callbacks.append(checkpoint_callback)

    trainer = Trainer(
                    devices = n_gpus,
                    accelerator = "cuda" if n_gpus >= 1 else 'cpu',
                    # replace_sampler_ddp=False,
                    max_epochs = max_epochs,
                    num_sanity_val_steps = 2,
                    callbacks = callbacks,
    )

    trainer.fit(model, pl_datamodule)

    model = model.load_from_checkpoint(checkpoint_callback.best_model_path)
    
    return model

In [7]:
pl_vime = fit_model(pl_vime)

  train_ds = VIMESelfDataset(X_train.append(X_unlabeled), data_hparams, continuous_cols, category_cols)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name                      | Type             | Params
---------------------------------------------------------------
0 | model                     | VIME             | 129 K 
1 | pretraining_mask_loss     | BCELoss          | 0     
2 | pretraining_feat

                                                                           

In [None]:
import torch.nn.functional as F

trainer = Trainer(
                    devices = n_gpus,
                    accelerator = "cuda" if n_gpus >= 1 else 'cpu',
                    max_epochs = max_epochs,
                    num_sanity_val_steps = 2,
                    callbacks = None,
    )
test_ds = VIMEClassificationDataset(X_valid, y_valid.values, data_hparams, None, continuous_cols, category_cols)
test_dl = DataLoader(test_ds, batch_size, shuffle=False, sampler = SequentialSampler(test_ds), num_workers=n_jobs)

preds = trainer.predict(pl_vime, test_dl)

preds = F.softmax(torch.concat([out.cpu() for out in preds]).squeeze(),dim=1)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 29/29 [00:00<00:00, 476.09it/s]


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_valid, preds.argmax(1))

0.706954207329557