In [2]:
!pip install sentencepiece
!pip install transformers

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
storage_path = "/content/drive/MyDrive/ire_project/"

In [4]:
import json
import torch
import torch.nn as nn
# import tensorflow as tf
import os
from transformers import AlbertConfig, AlbertModel

import numpy as np

from transformers import (
    AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer,
    XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer,
    PreTrainedModel
)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
file_path = storage_path + "data/"
file_name = file_path + "train_lookup.jsonl"
with open(file_name, "r") as file:
    # Read each line and parse it as JSON
    i=0
    for line in file:
        data = json.loads(line)
        # print(data)
        for key in data:
            print(key)
            print(data[key])
            print()
        i+=1
        if(i==2): break
    file.close()

id
q40

question
In which hemisphere does the summer solstice occur in December?

header
['', 'HEMISPHERE', '', 'ORBITAL EVENT', '', 'MONTH OF OCCURENCE']

rows
[['In the', 'northern hemisphere', ', the', 'summer solstice', 'occurs in', 'June'], ['In the', 'southern hemisphere', ', the', 'summer solstice', 'occurs in', 'December'], ['In the', 'northern hemisphere', ', the', 'winter solstice', 'occurs in', 'December'], ['In the', 'southern hemisphere', ', the', 'winter solstice', 'occurs in', 'June'], ['In the', 'northern hemisphere', ', the', 'spring equinox', 'occurs in', 'March'], ['In the', 'southern hemisphere', ', the', 'spring equinox', 'occurs in', 'September'], ['In the', 'northern hemisphere', ', the', 'fall equinox', 'occurs in', 'September'], ['In the', 'southern hemisphere', ', the', 'fall equinox', 'occurs in', 'March']]

target_column
1

answers
['Southern hemisphere']

table_id
regents-02

id
q41

question
The winter solstice, in the northern hemisphere takes place in wh

In [7]:
def getColRepresentation(header, rows):
    colRepresentation = []
    cols = [[str(h)] for h in header]
    for row in rows:
        for ci, cell in enumerate(row):
            if cell:  # for sparse table use case
                cols[ci].append(str(cell))
    for col in cols:
        col_rep = ' * '.join(col)
        colRepresentation.append(col_rep)
    return colRepresentation

def getRowRepresentation(header, rows):
    rowRepresentation = []
    for row in rows:
        row_rep = ' * '.join([h + ' : ' + str(c) for h, c in zip(header, row) if c])  # for sparse table use case
        rowRepresentation.append(row_rep)
    return rowRepresentation

In [8]:
def get_data(file_name, tables_cnt):
    # tensor1 = torch.Tensor()
    # tensor2 = torch.Tensor()
    data = {'row_labels': [], 'row_input':[], 'col_labels': [], 'col_input':[], 'row_queries': [], 'col_queries': []}
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    max_seq_length = 128
    batch_size = 16
    with open(file_name, "r") as file:
        # Read each line and parse it as JSON
        i=0
        for line in file:
            if(i==tables_cnt): break
            table = json.loads(line)
            header = table['header']
            rows = table['rows']
            target_label = table['target_column']
            query = table['question']
            answers = table['answers']
            col_queries = [query]*len(header)
            row_queries = [query]*len(rows)
            colsRepresentation = getColRepresentation(header, rows)
            rowsRepresentation = getRowRepresentation(header, rows)

            col_labels = [[0.9, 0.1]]*len(header)
            row_labels = [[0.9, 0.1]]*len(rows)
            col_labels[target_label] = [0.1,0.9]
            for j in range(len(rows)):
                if(rows[j][target_label] in answers):
                    row_labels[j] = [0.1, 0.9]
            data['col_input'].extend(colsRepresentation)
            data['row_input'].extend(rowsRepresentation)
            data['col_labels'].extend(col_labels)
            data['row_labels'].extend(row_labels)
            data['col_queries'].extend(col_queries)
            data['row_queries'].extend(row_queries)
            i+=1
    file.close()
    col_train_encoding = tokenizer(data['col_input'], return_tensors='pt', padding='max_length', max_length=max_seq_length)
    row_train_encoding = tokenizer(data['row_input'], return_tensors='pt', padding='max_length', max_length=max_seq_length)
    query_row_train_encoding = tokenizer(data['row_queries'], return_tensors='pt', padding='max_length', max_length=max_seq_length)
    query_col_train_encoding = tokenizer(data['col_queries'], return_tensors='pt', padding='max_length', max_length=max_seq_length)
    return row_train_encoding, torch.tensor(data['row_labels']).float(), col_train_encoding, torch.tensor(data['col_labels']).float(), query_row_train_encoding, query_col_train_encoding


def get_data_from_table(header, rows, query):
    data = {'row_input':[], 'col_input':[], 'row_queries': [], 'col_queries': []}
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    max_seq_length = 128
    batch_size = 16
    col_queries = [query]*len(header)
    row_queries = [query]*len(rows)
    colsRepresentation = getColRepresentation(header, rows)
    rowsRepresentation = getRowRepresentation(header, rows)
    data['col_input'].extend(colsRepresentation)
    data['row_input'].extend(rowsRepresentation)
    data['col_queries'].extend(col_queries)
    data['row_queries'].extend(row_queries)
    col_encoding = tokenizer(data['col_input'], return_tensors='pt', padding=True, max_length=max_seq_length)
    row_encoding = tokenizer(data['row_input'], return_tensors='pt', padding=True, max_length=max_seq_length)
    query_row_encoding = tokenizer(data['row_queries'], return_tensors='pt', padding=True, max_length=max_seq_length)
    query_col_encoding = tokenizer(data['col_queries'], return_tensors='pt', padding=True, max_length=max_seq_length)
    return row_encoding, col_encoding, query_row_encoding, query_col_encoding

In [10]:
row_train_encoding, row_labels, col_train_encodings, col_labels, query_row_train_encoding, query_col_train_encoding = get_data(file_name, 1)

print(row_train_encoding, row_labels, col_train_encodings, col_labels, query_row_train_encoding, query_col_train_encoding)

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

{'input_ids': tensor([[ 2, 13, 45,  ...,  0,  0,  0],
        [ 2, 13, 45,  ...,  0,  0,  0],
        [ 2, 13, 45,  ...,  0,  0,  0],
        ...,
        [ 2, 13, 45,  ...,  0,  0,  0],
        [ 2, 13, 45,  ...,  0,  0,  0],
        [ 2, 13, 45,  ...,  0,  0,  0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])} tensor([[0.9000, 0.1000],
        [0.9000, 0.1000],
        [0.9000, 0.1000],
        [0.9000, 0.1000],
        [0.9000, 0.1000],
        [0.9000, 0.1000],
        [0.9000, 0.1000],
        [0.9000, 0.1000]]) {'input_ids': tensor([[    2,  1637,    19,    1

In [9]:
def train(model, train_data, query_train_data, optimiser, criterion, batch_size, train_labels):
    train_data.to(device)
    query_train_data.to(device)
    train_labels.to(device)
    model.train()
    train_loss = 0
    train_correct = 0
    i=0
    start = 0
    while(start < len(train_labels)):
        end = start + batch_size
        if(end > len(train_labels)):
            end = len(train_labels)
        # batch_size = len(data['input_ids'].to(device))
        optimiser.zero_grad()
        output = model(query_train_data['input_ids'][start:end, :], query_train_data['attention_mask'][start:end, :], train_data['input_ids'][start:end, :], train_data['attention_mask'][start:end, :])
        # print(output)
        # print(torch.tensor(train_labels[start:end]))
        # target = torch.tensor(train_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
        loss = criterion(output, torch.softmax(torch.tensor(train_labels)[start:end, :], dim=1, dtype = torch.float32).to(device))
        loss.backward()
        optimiser.step()
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True).to(device)
        actual = train_labels[start:end].argmax(dim=1, keepdim=True).to(device)
        train_correct += pred.eq(actual.view_as(pred)).sum().item()
        i +=1
        start = end
    train_loss /= len(train_labels)
    train_acc = train_correct / len(train_labels)
    return train_loss, train_acc




In [10]:
def test(model, test_data, query_test_data, criterion, batch_size, test_labels):
    test_data.to(device)
    query_test_data.to(device)
    test_labels.to(device)
    model.eval()
    test_loss = 0
    test_correct = 0
    i=0
    start = 0
    logits = []
    with torch.no_grad():
        while(start < len(test_labels)):
            end = start + batch_size
            if(end > len(test_labels)):
                end = len(test_labels)
            output = model(query_test_data['input_ids'][start:end, :], query_test_data['attention_mask'][start:end, :], test_data['input_ids'][start:end, :], test_data['attention_mask'][start:end, :])
            # print(output)
            # print(torch.tensor(test_labels[start:end]))
            # target = torch.tensor(test_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
            loss = criterion(output, torch.softmax(torch.tensor(test_labels)[start:end, :], dim=1, dtype = torch.float32).to(device))
            # loss.backward()
            # optimiser.step()
            test_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True).to(device)
            for out in output:
                logits.append(out[1])
            actual = test_labels[start:end].argmax(dim=1, keepdim=True).to(device)
            test_correct += pred.eq(actual.view_as(pred)).sum().item()
            i +=1
            start = end
    test_loss /= len(test_labels)
    test_acc = test_correct / len(test_labels)
    return test_loss, test_acc, logits

In [11]:
# rq :rc:rq ⊗rc:(rq -rc)⊗(rq -rc)

class RCI_representation(nn.Module):
    def __init__(self, l):
        super(RCI_representation, self).__init__()
        albert_config = AlbertConfig(hidden_size=512)
        self.albert1 = AlbertModel(albert_config)
        self.albert2 = AlbertModel(albert_config)
        self.net1 = nn.Linear(512*4, 512)
        self.leaky_relu = nn.LeakyReLU(l)
        self.net2 = nn.Linear(512, 2)

        # model_output.last_hidden_state[:, 0, :]

    def forward(self, q_input_ids, q_attention_mask, c_input_ids, c_attention_mask, rc=None, get_representation=False):
        if(get_representation):
            rc = self.albert2(c_input_ids, attention_mask=c_attention_mask).last_hidden_state[:, 0, :]
            return rc
        rq = self.albert1(q_input_ids, attention_mask=q_attention_mask).last_hidden_state[:, 0, :]
        if(rc is None): rc = self.albert2(c_input_ids, attention_mask=c_attention_mask).last_hidden_state[:, 0, :]
        rqc = torch.cat([rq, rc, rq*rc, (rq-rc)**2], dim = 1)
        net1_output = self.net1(rqc)
        output = torch.softmax(self.net2(net1_output), dim=1)
        return output

In [12]:
def train_model(model, train_data, query_train_data, train_labels, optimiser=None, criterion=None, batch_size=16, lr=1e-3, epoch=2, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    if(optimiser is None): optimiser = torch.optim.Adam(model.parameters(), lr)
    if(criterion is None): criterion = nn.CrossEntropyLoss()
    print("Training...........")
    checkpoint_path = checkpoint_dir + checkpoint_file

    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint['model_state_dict'])
            optimiser.load_state_dict(checkpoint['optimiser_state_dict'])
            # lr_scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0
    for i in range(start_epoch, epoch):
        train_loss, train_acc = train(model, train_data, query_train_data, optimiser, criterion, batch_size, train_labels)
        print(f"Epoch {i}   Train loss: {train_loss}    Train accuracy: {train_acc}")

        if checkpoint_dir is not None:
            if(not os.path.exists(checkpoint_dir)):
                os.makedirs(checkpoint_dir, exist_ok=True)
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimiser_state_dict': optimiser.state_dict(),
            }
            torch.save(checkpoint, checkpoint_path)

def test_model(model, test_data, query_test_data, criterion, batch_size, test_labels):
    print("Evaluating on testing data..................")
    test_loss, test_acc = test(model, test_data, query_test_data, criterion, batch_size, test_labels)
    print(f"Testing loss: {test_loss} Test accuracy:  {test_acc}")



In [16]:
row_train_data, row_train_labels, col_train_data, col_train_labels, query_row_train_data, query_col_train_data = get_data(file_name, 10000)
row_model_representation = RCI_representation(0.1).to(device)
optimiser = torch.optim.Adam(row_model_representation.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()


train_model(model=row_model_representation, train_data=row_train_data, query_train_data=query_row_train_data, train_labels=row_train_labels, optimiser=optimiser, criterion=criterion, batch_size=16, lr=1e-3, epoch=3, checkpoint_dir=storage_path+"checkpoints/", resume_from_checkpoint=False, checkpoint_file="row_model_representation.pth")


Training...........
Epoch 0   Train loss: 0.042900774627923965    Train accuracy: 0.725
Epoch 1   Train loss: 0.041852790862321854    Train accuracy: 0.875
Epoch 2   Train loss: 0.04145502373576164    Train accuracy: 0.875


In [18]:
col_model_representation = RCI_representation(0.1).to(device)
optimiser = torch.optim.Adam(col_model_representation.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()


train_model(model=col_model_representation, train_data=col_train_data, query_train_data=query_col_train_data, train_labels=col_train_labels, optimiser=optimiser, criterion=criterion, batch_size=16, lr=1e-3, epoch=3, checkpoint_dir=storage_path+"checkpoints/", resume_from_checkpoint=False, checkpoint_file="col_model_representation.pth")


Training...........
Epoch 0   Train loss: 0.042678617491827615    Train accuracy: 0.7161359956826767
Epoch 1   Train loss: 0.042406844211409556    Train accuracy: 0.7609282245008095
Epoch 2   Train loss: 0.040756292343139646    Train accuracy: 0.8253334446759423


In [19]:
file_name = file_path + "test_lookup.jsonl"
row_test_data, row_test_labels, col_test_data, col_test_labels, query_row_test_data, query_col_test_data = get_data(file_name, 3)

criterion = nn.CrossEntropyLoss()

test_model(model = row_model_representation, test_data = row_test_data, query_test_data=query_row_test_data, criterion = criterion, batch_size = 16, test_labels = row_test_labels)


Evaluating on testing data..................
Test loss: 0.04199714462910725    Test accuracy: 0.86923987753267821


In [22]:

test_model(model = col_model_representation, test_data = col_test_data, query_test_data=query_col_test_data, criterion = criterion, batch_size = 16, test_labels = col_test_labels)


Evaluating on testing data..................
Test loss: 0.04129876543210987    Test accuracy: 0.8227345678901234


In [22]:
def getLogits(header, rows, query, row_model, col_model):
    criterion = nn.CrossEntropyLoss()
    batch_size = 16
    row_data, col_data, query_row_data, query_col_data = get_data_from_table(header, rows, query)
    row_labels = torch.zeros([len(row_data),2])
    col_labels = torch.zeros([len(col_data), 2])
    _, _, rowsLogits = test(row_model, row_data, query_row_data, criterion, batch_size, row_labels)
    _, _, colsLogits = test(col_model, col_data, query_col_data, criterion, batch_size, col_labels)
    return rowsLogits, colsLogits

def getScores(rowsLogits, colsLogits, top_k):
    scores = []
    for i in range(len(rowsLogits)):
        for j in range(len(colsLogits)):
            score = float(rowsLogits[i] + colsLogits[j])
            scores.append([i,j,score])
    scores.sort(key=lambda x: x[2], reverse=True)
    return scores[0:top_k]


def getQueryAnswers(query, header, rows, row_model, col_model):
    batch_size = 16
    max_seq_length=128
    rowsLogits, colsLogits = getLogits(header, rows, query, row_model, col_model)

    top_k = 5
    rci_scores = getScores(rowsLogits, colsLogits, top_k)
    # row_scores = getScores(rowsLogits, )

    return [{'row_ndx': i, 'col_ndx': j, 'confidence_score': score, 'text': rows[i][j]} for i, j, score in rci_scores]

In [23]:
file_path = storage_path + "data/"
file_name = file_path + "test_lookup.jsonl"
tables = []
with open(file_name, "r") as file:
    # Read each line and parse it as JSON
    i=0
    for line in file:
        data = json.loads(line)
        tables.append(data)
        # print(data)
        # for key in data:
        #     print(key)
        #     print(data[key])
        #     print()
        # i+=1
        # if(i==1): break
    file.close()


print(tables[0])

{'id': 'q20', 'question': 'Which orbital event has midrange day and night periods and happens around Easter?', 'header': ['', 'ORBITAL EVENT', '', 'PERIOD OF DAYLIGHT', '', 'PERIOD OF NIGHT', ''], 'rows': [['The', 'summer solstice', 'is the day with the', 'longest', 'period of daylight and the', 'shortest', 'period of night'], ['The', 'winter solstice', 'is the day with the', 'shortest', 'period of daylight and the', 'longest', 'period of night'], ['The', 'spring equinox', 'is the day with the', 'midrange', 'period of daylight and the', 'midrange', 'period of night'], ['The', 'fall equinox', 'is the day with the', 'midrange', 'period of daylight and the', 'midrange', 'period of night']], 'target_column': 1, 'answers': ['Spring equinox'], 'table_id': 'regents-01'}


In [27]:
import time
rci_answers = []
true_answers = []
tapas_answers = []
for table in tables[:25]:
    true_answers.extend(table['answers'])
    start_time = time.time()
    answers = getQueryAnswers(table['question'], table['header'], table['rows'], row_model_representation, col_model_representation)
    end_time = time.time()
    rci_answer = []
    for answer in answers:
        rci_answer.append(answer['text'])
    rci_answers.append(rci_answer)

total_time = end_time-start_time
print(rci_answers)
print(true_answers)
print("time per query is ", total_time/(len(true_answers)))

[['spring equinox', 'fall equinox', 'summer solstice', 'winter solstice', 'The'], ['period of night', 'period of night', 'period of daylight and the', 'period of daylight and the', 'midrange'], ['midrange', 'shortest', 'longest', 'midrange', 'period of night'], ['midrange', 'longest', 'shortest', 'is the day with the', 'The'], ['midrange', 'period of daylight and the', 'period of night', 'midrange', 'is the day with the'], ['midrange', 'shortest', 'longest', 'midrange', 'midrange'], ['midrange', 'shortest', 'longest', 'midrange', 'longest'], ['midrange', 'period of night', 'midrange', 'midrange', 'period of daylight and the'], ['spring equinox', 'fall equinox', 'The', 'The', 'summer solstice'], ['fall equinox', 'spring equinox', 'The', 'The', 'winter solstice'], ['is the day with the', 'is the day with the', 'is the day with the', 'is the day with the', 'The'], ['midrange', 'midrange', 'period of daylight and the', 'period of night', 'longest'], ['midrange', 'longest', 'shortest', 'The

In [29]:
hit_at_1 = 0
for i in range(len(true_answers)):
    if(true_answers[i] == rci_answers[i][0]): hit_at_1 +=1
hit_at_1/= len(true_answers)
print(hit_at_1)

0.43


In [30]:
reciprocal_rank_sum = 0
for i in range(len(true_answers)):
    for j in range(len(rci_answers[i])):
        if(true_answers[i] == rci_answers[i][j]):
            reciprocal_rank_sum += 1/(j+1)
            break
mrr = reciprocal_rank_sum/len(true_answers)
print(mrr)

0.49
