In [None]:
## configurations
from types import SimpleNamespace   

# All tunable settings live in one dict literal; `conf` exposes the same
# values with attribute access (conf.batch_size, conf.lr, ...).
config = {
    # --- optimisation ---
    'train_frac': 1,  # Increase to generate and train with more triplets. Can improve performance
    'epochs': 1,
    'batch_size': 16,
    'final_size': 200,
    'lr': 1e-5,
    # --- loss ---
    'loss': 'tripletCE',
    'tl_margin': 1.0,
    'tl_p': 2,
    # --- model / tokenizer ---
    'pool_type': "CLS",
    'tokenizer_max_length': 512,
    'temp': 0.05,
    'init_model_path': './roberta-retrained',
    # --- input data ---
    'input_data_dir': '../inputs/mlopen_t2t_SS_dataset',
    'separator': ',',
    'max_header_encoding_length': 27,
    'max_tuple_content_encoding_length': 482,
    'max_tuples_per_table': 500,
    # --- dataset sizing (train_size + test_size is used as the split of the dataset) ---
    'max_data_length': 50000,
    'train_size': 40000,
    'test_size': 10000,
}

conf = SimpleNamespace(**config)

## TODO: Save config to disk and load

In [None]:
## table data read and encode
import os, sys, random
# add main directory in sys path to access utility functions
# Resolve the parent directory once and register it on sys.path only if it
# is not already there, so re-running the cell does not add duplicates.
path2add = os.path.normpath(os.path.abspath(os.pardir))
if path2add not in sys.path:
    sys.path.append(path2add)

from profiler.table_profiler_lm import table_profiler_lm
from table_trainer_utils import *
from transformers import RobertaTokenizer
# Tokenizer is built from the stock 'roberta-base' checkpoint.
# NOTE(review): model weights are later loaded from conf.init_model_path;
# confirm that checkpoint shares the 'roberta-base' vocabulary.
TOKENIZER_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_NAME)

## create two datasets, one with original sequence, and its augmentation with transformations, also return list of row ids
def encode_tuples(profiler, encoder, augmentor, data_dir, sep=','):
    """Encode every row yielded by the profiler, plus one augmented copy of it.

    Args:
        profiler: object exposing ``process_dir(data_dir, sep=...)`` that
            yields 6-tuples ``(row_id, table_names, column_names, _, contents, _)``.
        encoder: object exposing ``encode_row(table_names, column_names, contents)``.
        augmentor: object exposing ``augment(table_names, column_names, contents)``
            and returning the transformed ``(table_names, column_names, contents)``.
        data_dir: directory handed to ``profiler.process_dir``.
        sep: field separator forwarded to ``profiler.process_dir``. Defaults to
            ``','`` (the previously hard-coded value) so existing callers are
            unaffected; pass ``conf.separator`` to honour the configuration.

    Returns:
        ``(tokens, aug_tokens, tupleids)``: three parallel lists holding the
        encoded original rows, the encoded augmented rows and the row ids.
    """
    tupleids = []
    tokens = []
    aug_tokens = []

    # `row_id` rather than `id`, to avoid shadowing the builtin.
    for row_id, tblname_list, colname_list, _, content_list, _ in profiler.process_dir(data_dir, sep=sep):
        tupleids.append(row_id)
        tokens.append(encoder.encode_row(tblname_list, colname_list, content_list))

        # Augment the raw row first, then encode it with the same encoder so
        # both views go through identical tokenisation.
        aug_tbl, aug_col, aug_cont = augmentor.augment(tblname_list, colname_list, content_list)
        aug_tokens.append(encoder.encode_row(aug_tbl, aug_col, aug_cont))

    print('Encoded ', len(tupleids), ' rows!')

    return tokens, aug_tokens, tupleids
                 
# Assemble the profiler -> encoder -> augmentor pipeline and run it over the
# configured input directory. `orig` and `aug` are parallel lists of encoded
# rows; `tupleids` holds the matching row ids.
data_dir = conf.input_data_dir
profiler = table_profiler_lm(
    tokenizer=tokenizer,
    max_tuples_per_table=conf.max_tuples_per_table,
)
encoder = TupleEncoder(tokenizer, conf)
augmentor = TupleAugmentor()
encoded_views = encode_tuples(profiler, encoder, augmentor, data_dir)
orig, aug, tupleids = encoded_views


In [4]:
## path of the model checkpoint pre-trained on the corresponding text corpus
## NOTE: nothing is loaded here -- `model` only holds the checkpoint path string
## (conf.init_model_path). The name `model` is rebound to a TripletSingleBERTModel
## instance in the training cell below.
from transformers import RobertaTokenizer
model=conf.init_model_path


In [6]:
# Imported here because this cell runs before the training cell that imports
# torch; without it the cell depends on torch leaking in from elsewhere.
import torch

MAX_LEN = conf.max_data_length
tuple_dataset = TupleDataset(tokenizer,
                             tupleids[:MAX_LEN],
                             orig[:MAX_LEN],
                             aug[:MAX_LEN])

print('Created dataset of length: ', len(tuple_dataset))

# The split sizes are absolute counts from the config and must add up to the
# dataset length. Check up front so a smaller-than-expected dataset reports
# the actual numbers instead of a generic random_split error.
split_sizes = [conf.train_size, conf.test_size]
if sum(split_sizes) != len(tuple_dataset):
    raise ValueError(
        f"conf.train_size + conf.test_size = {sum(split_sizes)} does not match "
        f"the dataset length {len(tuple_dataset)}; adjust the config."
    )

# Fixed seed so the train/test partition is reproducible across runs.
training_data, test_data = torch.utils.data.random_split(
    tuple_dataset,
    split_sizes,
    generator=torch.Generator().manual_seed(42),
)
data_loader = torch.utils.data.DataLoader(training_data, batch_size=conf.batch_size, shuffle=True)


Created dataset of length:  50000


In [16]:
## train models for tuples
from datetime import datetime
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

def save_torch_model(path, model):
    """Save ``model.state_dict()`` into ``path`` under a timestamped file name.

    Args:
        path: directory to write into; created (with parents) if missing.
        model: object exposing ``state_dict()`` (e.g. an ``nn.Module``).

    Returns:
        The full path of the file that was written.

    Note:
        The file name is ``%H-%M-%d-%m-%y`` (minute resolution), so two saves
        within the same minute write to the same file and the later one wins.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # raise if the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, datetime.now().strftime("%H-%M-%d-%m-%y"))
    torch.save(model.state_dict(), filepath)
    print(f"Saved Model: {filepath}")
    return filepath

class Similarity(nn.Module):
    """
    Cosine similarity along the last dimension, divided by a temperature.

    `temp` is stored as given; smaller values produce larger-magnitude scores.
    """

    def __init__(self, temp):
        super().__init__()
        self.cos = nn.CosineSimilarity(dim=-1)
        self.temp = temp

    def forward(self, x, y):
        # Inputs broadcast against each other before the reduction over dim=-1.
        return self.cos(x, y) / self.temp

def crossentropy_loss(a_pred, p_pred, n_pred, device=None, temp=0.05):
    """In-batch contrastive cross-entropy between anchors and positives.

    Row ``i`` of the similarity matrix holds the temperature-scaled cosine
    similarity of anchor ``i`` against every positive in the batch; the
    target class for row ``i`` is ``i`` (its own positive).

    Args:
        a_pred: anchor embeddings, indexed by batch item along dim 0.
        p_pred: positive embeddings, same layout as ``a_pred``.
        n_pred: accepted so the signature matches triplet-style losses;
            it is not used in the computation.
        device: device for the target indices. ``None`` (default) uses
            ``a_pred.device``, so the loss works on CPU and GPU alike.
        temp: temperature dividing the cosine similarities (default 0.05,
            the value that was previously hard-coded).

    Returns:
        Scalar tensor: mean cross-entropy over the batch.
    """
    # Targets must live on the same device as the logits; a fixed "cuda"
    # default fails on CPU-only runs.
    target_device = a_pred.device if device is None else device
    y_true = torch.arange(a_pred.shape[0], device=target_device)

    # (B, 1, D) vs (1, B, D) broadcasts to a (B, B) matrix of pairwise cosines.
    similarities = F.cosine_similarity(a_pred.unsqueeze(1), p_pred.unsqueeze(0), dim=-1) / temp

    # F.cross_entropy already averages over the batch (reduction='mean').
    return F.cross_entropy(similarities, y_true)



In [18]:
## training

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_table_model(model,
                      train_data,
                      loss_func,
                      optimizer,
                      epochs,
                      save_dir,
                      tokenizer_max_length = 512,
                      train_model = True) -> str:
    """Train ``model`` on triplet batches and checkpoint it periodically.

    Args:
        model: module called as ``model(a, p, n, a_mask, p_mask, n_mask)`` and
            returning three outputs (anchor, positive, negative).
        train_data: iterable of batches; each batch is a mapping with the keys
            ``pivot_ids``, ``positive_ids``, ``negative_ids``, ``pivot_attn``,
            ``positive_attn`` and ``negative_attn``.
        loss_func: callable ``loss_func(anchor_out, positive_out, negative_out)``
            returning a scalar loss tensor.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over ``train_data``.
        save_dir: directory passed to ``save_torch_model``.
        tokenizer_max_length: unused; kept for interface compatibility.
        train_model: unused; kept for interface compatibility.

    Returns:
        Path of the most recent checkpoint written, or ``None`` if no
        checkpoint was written (``epochs == 0``).
    """
    model.to(device)
    last_saved = None
    id_keys = ('pivot_ids', 'positive_ids', 'negative_ids')
    mask_keys = ('pivot_attn', 'positive_attn', 'negative_attn')

    for epoch in range(epochs):
        model.train()

        # Wrap the loader itself (not enumerate) so tqdm can show the total.
        pbar = tqdm(train_data)
        for i, batch in enumerate(pbar):
            # Tensor.to() returns a new tensor; the result must be rebound,
            # otherwise the inputs stay on their original device.
            a, p, n = (batch[key].to(device) for key in id_keys)
            a_mask, p_mask, n_mask = (batch[key].to(device) for key in mask_keys)

            optimizer.zero_grad()
            oa, op, on = model(a, p, n, a_mask, p_mask, n_mask)
            loss = loss_func(oa, op, on)
            loss_value = loss.item()
            pbar.set_description(f"Processing iter {i} with loss={loss_value}")
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                tqdm.write(f"train batch loss: {loss_value}")
            if i % 500 == 0:
                last_saved = save_torch_model(save_dir, model)

        # Persist the end-of-epoch weights too; otherwise every update after
        # the last multiple-of-500 step is lost.
        last_saved = save_torch_model(save_dir, model)

    return last_saved

        
# Select the loss from the config. Fail fast on an unknown name: without the
# else branch `loss` would be undefined and surface later as a NameError.
if conf.loss == 'triplet':
    loss = nn.TripletMarginLoss(margin=conf.tl_margin, p=conf.tl_p)
elif conf.loss == 'tripletCE':
    # NOTE: conf.temp is not forwarded here; crossentropy_loss falls back to
    # its built-in temperature of 0.05.
    loss = crossentropy_loss
else:
    raise ValueError(
        f"Unsupported conf.loss {conf.loss!r}; expected 'triplet' or 'tripletCE'"
    )

model = TripletSingleBERTModel(final_size = conf.final_size,
                               tokenizer = tokenizer,
                               pooling = conf.pool_type,
                               model_path = conf.init_model_path)
optimizer = optim.AdamW(model.parameters(), lr=conf.lr)

# Checkpoints are written under this directory by train_table_model.
save_dir = './mlopen-table-model'

tqdm.write("Training Begins")
last_saved = train_table_model(model,
                               data_loader,
                               loss,
                               optimizer,
                               conf.epochs,
                               save_dir,
                               train_model=True)
    


Processing iter 4100 with loss=0.001954467035830021: : 4101it [2:48:48,  2.37s/it]  

train batch loss: 0.001954467035830021


Processing iter 4200 with loss=0.0031837301794439554: : 4201it [2:52:53,  2.38s/it] 

train batch loss: 0.0031837301794439554


Processing iter 4374 with loss=0.0005818585050292313: : 4375it [3:00:03,  2.47s/it] 


In [None]:
## evaluate
import torch
import torch.nn.functional as F
from transformers import RobertaModel
from scipy.spatial.distance import cosine
from table_trainer_utils import *

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def read_pretrained_model(model_path):
    """Return a RobertaModel loaded via ``RobertaModel.from_pretrained(model_path)``."""
    pretrained = RobertaModel.from_pretrained(model_path)
    return pretrained

def read_model_from_statedict(model_path):
    """Rebuild a TripletSingleBERTModel and load saved weights into it.

    The architecture is constructed from the global ``conf`` and ``tokenizer``
    exactly as in the training cell; ``model_path`` is a file previously
    written with ``torch.save(model.state_dict(), ...)``.

    Args:
        model_path: path to the saved state dict.

    Returns:
        The model with the loaded weights. It is left on the device it was
        constructed on; the caller moves it where needed.
    """
    model = TripletSingleBERTModel(final_size = conf.final_size,
                                   tokenizer = tokenizer,
                                   pooling = conf.pool_type,
                                   model_path=conf.init_model_path)
    # map_location="cpu" lets a checkpoint saved from a GPU run load on a
    # CPU-only host; without it torch tries to restore tensors onto the
    # device they were saved from.
    # NOTE(security): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    return model


# FIXME: hard-coded checkpoint name (timestamp of a specific training run).
CHECKPOINT_PATH = './mlopen-table-model/14-45-08-11-22'

model2 = read_model_from_statedict(CHECKPOINT_PATH)
model2.to(device)
model2.eval()
print('Loaded model:', model2)

# torch.nn is reached through the `torch` import of this cell, so the cell
# does not depend on `nn` having been imported by an earlier one.
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

eval_data = torch.utils.data.DataLoader(test_data, batch_size=conf.batch_size, shuffle=False)

# Smoke test on the first held-out batch only: print anchor/positive and
# anchor/negative cosine similarities. no_grad() avoids building an autograd
# graph during inference.
with torch.no_grad():
    for i, d in enumerate(eval_data):
        print('Working on index: ', i, ' data: ', d['pivot_ids'].shape)
        # The model lives on `device`; its inputs have to be moved there too.
        a, p, n = (d[k].to(device) for k in ('pivot_ids', 'positive_ids', 'negative_ids'))
        a_mask, p_mask, n_mask = (d[k].to(device) for k in ('pivot_attn', 'positive_attn', 'negative_attn'))
        embdA, embdP, embdN = model2(a, p, n, a_mask, p_mask, n_mask)
        print(f'embed shape: {embdA.shape}')
        print(f'first cos: {cos(embdA, embdP)}')
        print(f'second cos: {cos(embdA, embdN)}')
        break

    
# The loss function is adapted from the SimCSE implementation: https://github.com/princeton-nlp/SimCSE/blob/main/simcse/models.py


In [None]:
## save embeddings to disk
import csv
import numpy as np
from tqdm import tqdm

def write_csv(fp, rows):
    """Write each element of ``rows`` as its own single-column CSV row.

    Args:
        fp: path of the file to create or overwrite.
        rows: iterable of values; every value becomes one one-field row.
    """
    # newline='' is what the csv module requires of files it writes to;
    # without it, platforms that translate '\n' emit '\r\r\n' line endings.
    with open(fp, 'w', newline='') as f:
        csv.writer(f).writerows([r] for r in rows)

output_ids_path = '../features/mlopen-tupleids.list'
output_embed_path = '../features/mlopen-tuplefeatures.pt'

tuple_ids = [tuple_dataset.get_tuple_id(i) for i in range(len(tuple_dataset))]
# NOTE: this rebinds `data_loader` (previously the shuffled training loader)
# to an ordered loader over the full dataset. shuffle=False keeps the
# embedding rows aligned with `tuple_ids`.
data_loader = torch.utils.data.DataLoader(tuple_dataset,
                                          batch_size=conf.batch_size,
                                          shuffle=False)

write_csv(output_ids_path, tuple_ids)

embed_tensor = torch.empty((len(tuple_dataset), conf.final_size))
print('shape of empty: ', embed_tensor.shape)

# Feed inputs on whatever device the model's weights are on.
model_device = next(model2.parameters()).device
model2.eval()
# no_grad(): pure inference over the whole dataset; tracking gradients here
# only costs memory and time.
with torch.no_grad():
    for i, d in enumerate(tqdm(data_loader)):
        a, p, n = (d[k].to(model_device) for k in ('pivot_ids', 'positive_ids', 'negative_ids'))
        a_mask, p_mask, n_mask = (d[k].to(model_device) for k in ('pivot_attn', 'positive_attn', 'negative_attn'))
        embdA, embdP, embdN = model2(a, p, n, a_mask, p_mask, n_mask)
        op = embdA.detach().cpu()
        # Only the anchor (original row) embedding is stored. The slice length
        # follows the actual batch size so a short final batch fits exactly.
        start = i * conf.batch_size
        embed_tensor[start: start + op.shape[0]] = op

print('shape of output: ', embed_tensor.shape, ' output: ', embed_tensor)
torch.save(embed_tensor, output_embed_path)
