In [1]:
!pip install sentencepiece
!pip install transformers

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [4]:
storage_path = "/content/drive/MyDrive/ire_project/"

In [8]:
import json
import torch
import torch.nn as nn
import os
from transformers import AlbertConfig, AlbertModel

import numpy as np

from transformers import (
    AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer,
    XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer,
    PreTrainedModel
)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
file_path = storage_path + "data/"
file_name = file_path + "train_lookup.jsonl"
with open(file_name, "r") as file:
    # Read each line and parse it as JSON
    i=0
    for line in file:
        data = json.loads(line)
        # print(data)
        for key in data:
            print(key)
            print(data[key])
            print()
        i+=1
        if(i==1): break
    file.close()

id
q40

question
In which hemisphere does the summer solstice occur in December?

header
['', 'HEMISPHERE', '', 'ORBITAL EVENT', '', 'MONTH OF OCCURENCE']

rows
[['In the', 'northern hemisphere', ', the', 'summer solstice', 'occurs in', 'June'], ['In the', 'southern hemisphere', ', the', 'summer solstice', 'occurs in', 'December'], ['In the', 'northern hemisphere', ', the', 'winter solstice', 'occurs in', 'December'], ['In the', 'southern hemisphere', ', the', 'winter solstice', 'occurs in', 'June'], ['In the', 'northern hemisphere', ', the', 'spring equinox', 'occurs in', 'March'], ['In the', 'southern hemisphere', ', the', 'spring equinox', 'occurs in', 'September'], ['In the', 'northern hemisphere', ', the', 'fall equinox', 'occurs in', 'September'], ['In the', 'southern hemisphere', ', the', 'fall equinox', 'occurs in', 'March']]

target_column
1

answers
['Southern hemisphere']

table_id
regents-02



In [8]:
def getColRepresentation(header, rows):
    colRepresentation = []
    cols = [[str(h)] for h in header]
    for row in rows:
        for ci, cell in enumerate(row):
            if cell:  # for sparse table use case
                cols[ci].append(str(cell))
    for col in cols:
        col_rep = ' * '.join(col)
        colRepresentation.append(col_rep)
    return colRepresentation

def getRowRepresentation(header, rows):
    rowRepresentation = []
    for row in rows:
        row_rep = ' * '.join([h + ' : ' + str(c) for h, c in zip(header, row) if c])  # for sparse table use case
        rowRepresentation.append(row_rep)
    return rowRepresentation

In [9]:
def get_data(file_name, tables_cnt):
    # tensor1 = torch.Tensor()
    # tensor2 = torch.Tensor()
    data = {'row_labels': [], 'row_input':[], 'col_labels': [], 'col_input':[], 'row_queries': [], 'col_queries': []}
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    max_seq_length = 128
    batch_size = 16
    with open(file_name, "r") as file:
        # Read each line and parse it as JSON
        i=0
        for line in file:
            if(i==tables_cnt): break
            table = json.loads(line)
            header = table['header']
            rows = table['rows']
            target_label = table['target_column']
            query = table['question']
            answers = table['answers']
            col_queries = [query]*len(header)
            row_queries = [query]*len(rows)
            colsRepresentation = getColRepresentation(header, rows)
            rowsRepresentation = getRowRepresentation(header, rows)

            col_labels = [[0.9, 0.1]]*len(header)
            row_labels = [[0.9, 0.1]]*len(rows)
            col_labels[target_label] = [0.1,0.9]
            for j in range(len(rows)):
                if(rows[j][target_label] in answers):
                    row_labels[j] = [0.1, 0.9]
            data['col_input'].extend(colsRepresentation)
            data['row_input'].extend(rowsRepresentation)
            data['col_labels'].extend(col_labels)
            data['row_labels'].extend(row_labels)
            data['col_queries'].extend(col_queries)
            data['row_queries'].extend(row_queries)
            i+=1
    file.close()
    col_train_encoding = tokenizer(data['col_input'], data['col_queries'],return_tensors='pt', padding=True, max_length=max_seq_length)
    row_train_encoding = tokenizer(data['row_input'], data['row_queries'],return_tensors='pt', padding=True, max_length=max_seq_length)

    return row_train_encoding, torch.tensor(data['row_labels']).float(), col_train_encoding, torch.tensor(data['col_labels']).float()


def get_data_from_table(header, rows, query):
    data = {'row_input':[], 'col_input':[], 'row_queries': [], 'col_queries': []}
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    max_seq_length = 128
    batch_size = 16
    col_queries = [query]*len(header)
    row_queries = [query]*len(rows)
    colsRepresentation = getColRepresentation(header, rows)
    rowsRepresentation = getRowRepresentation(header, rows)
    data['col_input'].extend(colsRepresentation)
    data['row_input'].extend(rowsRepresentation)
    data['col_queries'].extend(col_queries)
    data['row_queries'].extend(row_queries)
    col_encoding = tokenizer(data['col_input'], data['col_queries'],return_tensors='pt', padding=True, truncation=True, max_length=max_seq_length)
    row_encoding = tokenizer(data['row_input'], data['row_queries'],return_tensors='pt', padding=True, truncation=True, max_length=max_seq_length)
    return row_encoding, col_encoding

In [31]:
row_train_encoding, row_labels, col_train_encodings, col_labels = get_data(file_name, 1)

print(row_train_encoding, row_labels, col_train_encodings, col_labels)

{'input_ids': tensor([[    2,    13,    45,    19,    14,  1637, 15429,    13,    45,   743,
         15429,  1637,    13,    45,    13,    15,    14,  1637, 14985,   807,
            13,    45,   697, 29230,  1637,    13,    45,  3690,    19,  1637,
          1617,    16,  3744,  2940,    13,    45,   295,     3,    19,    56,
         15429,   630,    14,   697, 29230,  3744,    19,   356,    60,     3,
             0,     0,     0],
        [    2,    13,    45,    19,    14,  1637, 15429,    13,    45,   775,
         15429,  1637,    13,    45,    13,    15,    14,  1637, 14985,   807,
            13,    45,   697, 29230,  1637,    13,    45,  3690,    19,  1637,
          1617,    16,  3744,  2940,    13,    45,   356,     3,    19,    56,
         15429,   630,    14,   697, 29230,  3744,    19,   356,    60,     3,
             0,     0,     0],
        [    2,    13,    45,    19,    14,  1637, 15429,    13,    45,   743,
         15429,  1637,    13,    45,    13,    15,    1



In [10]:
def train(model, train_data, optimiser, criterion, batch_size, train_labels):
    train_data.to(device)
    train_labels.to(device)
    model.train()
    train_loss = 0
    train_correct = 0
    i=0
    start = 0
    while(start < len(train_labels)):
        end = start + batch_size
        if(end > len(train_labels)):
            end = len(train_labels)
        # batch_size = len(data['input_ids'].to(device))
        optimiser.zero_grad()
        output = model(train_data['input_ids'][start:end, :], train_data['attention_mask'][start:end, :])
        # print(output)
        # print(torch.tensor(train_labels[start:end]))
        # target = torch.tensor(train_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
        loss = criterion(output, torch.softmax(torch.tensor(train_labels)[start:end, :], dim=1, dtype = torch.float32).to(device))
        loss.backward()
        optimiser.step()
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True).to(device)
        actual = train_labels[start:end].argmax(dim=1, keepdim=True).to(device)
        train_correct += pred.eq(actual.view_as(pred)).sum().item()
        i +=1
        start = end
    train_loss /= len(train_labels)
    train_acc = train_correct / len(train_labels)
    return train_loss, train_acc




In [11]:
def test(model, test_data, criterion, batch_size, test_labels):
    test_data.to(device)
    test_labels.to(device)
    model.eval()
    test_loss = 0
    test_correct = 0
    i=0
    start = 0
    logits = []
    with torch.no_grad():
        while(start < len(test_labels)):
            end = start + batch_size
            if(end > len(test_labels)):
                end = len(test_labels)
            output = model(test_data['input_ids'][start:end, :], test_data['attention_mask'][start:end, :].to(device))
            # print(output)
            # print(torch.tensor(test_labels[start:end]))
            # target = torch.tensor(test_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
            loss = criterion(output, torch.softmax(torch.tensor(test_labels)[start:end, :], dim=1, dtype = torch.float32).to(device))
            # loss.backward()
            # optimiser.step()
            test_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True).to(device)
            for out in output:
                logits.append(out[1])
            actual = test_labels[start:end].argmax(dim=1, keepdim=True).to(device)
            test_correct += pred.eq(actual.view_as(pred)).sum().item()
            i +=1
            start = end
    test_loss /= len(test_labels)
    test_acc = test_correct / len(test_labels)
    return test_loss, test_acc, logits

In [12]:
class RCI_interaction(nn.Module):
    def __init__(self, l):
        super(RCI_interaction, self).__init__()
        albert_config = AlbertConfig(hidden_size=512)
        self.albert = AlbertModel(albert_config)
        self.net1 = nn.Linear(512, 2)
        self.leaky_relu = nn.LeakyReLU(l)

    def forward(self, input_ids, attention_mask):
        albert_output = self.albert(input_ids, attention_mask=attention_mask)[0]
        output = torch.softmax(self.net1(albert_output[:, 0]), dim=1)
        return output


In [13]:
def train_model(model, train_data, train_labels, optimiser=None, criterion=None, batch_size=16, lr=1e-3, epoch=2, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    if(optimiser is None): optimiser = torch.optim.Adam(model.parameters(), lr)
    if(criterion is None): criterion = nn.CrossEntropyLoss()
    print("Training...........")
    checkpoint_path = checkpoint_dir + checkpoint_file

    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            model.load_state_dict(checkpoint['model_state_dict'])
            optimiser.load_state_dict(checkpoint['optimiser_state_dict'])
            # lr_scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0
    for i in range(start_epoch, epoch):
        train_loss, train_acc = train(model, train_data, optimiser, criterion, batch_size, train_labels)
        print(f"Epoch {i}   Train loss: {train_loss}    Train accuracy: {train_acc}")

        if checkpoint_dir is not None:
            if(not os.path.exists(checkpoint_dir)):
                os.makedirs(checkpoint_dir, exist_ok=True)
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimiser_state_dict': optimiser.state_dict(),
            }
            torch.save(checkpoint, checkpoint_path)

def test_model(model, test_data, criterion, batch_size, test_labels):
    print("Evaluating on testing data..................")
    test_loss, test_acc = test(model, test_data, criterion, batch_size, test_labels)
    print(f"Testing loss: {test_loss} Test accuracy:  {test_acc}")



In [48]:
row_train_data, row_train_labels, col_train_data, col_train_labels = get_data(file_name, 10000)
print(row_train_data['input_ids'].size(), row_labels.size())
print(len(row_labels))
# print(train_data['input_ids'])
# print()
# print(labels)
row_model_interaction = RCI_interaction(0.1).to(device)
optimiser = torch.optim.Adam(row_model_interaction.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

train_model(model=row_model_interaction, train_data=row_train_data, train_labels=row_train_labels, optimiser=optimiser, criterion=criterion, batch_size=16, lr=1e-3, epoch=3, checkpoint_dir=storage_path+"checkpoints/", resume_from_checkpoint=False, checkpoint_file="row_model_interaction.pth")


torch.Size([24, 56]) torch.Size([8, 2])
8
Training...........
Epoch 0   Train loss: 0.04193515129723903    Train accuracy: 0.8676542010684798
Epoch 1   Train loss: 0.0415085686851092    Train accuracy: 0.8705682370082565
Epoch 2   Train loss: 0.04132189093254445    Train accuracy: 0.8744536182612919


In [None]:
file_name = file_path + "test_lookup.jsonl"
row_test_data, row_test_labels, col_test_data, col_test_labels = get_data(file_name, 100)
print(row_test_data['input_ids'].size(), row_labels.size())
print(len(row_labels))
# print(test_data['input_ids'])
# print()
# print(labels)
# row_model = MLP(0.1).to(device)
# optimiser = torch.optim.Adam(row_model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

test_model(model = row_model, test_data = row_test_data, criterion = criterion, batch_size = 16, test_labels = row_test_labels)

In [56]:
col_model_interaction = RCI_interaction(0.1).to(device)
optimiser = torch.optim.Adam(col_model_interaction.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

train_model(model=col_model_interaction, train_data=col_train_data, train_labels=col_train_labels, optimiser=optimiser, criterion=criterion, batch_size=16, lr=1e-3, epoch=3, checkpoint_dir=storage_path+"checkpoints/", resume_from_checkpoint=True, checkpoint_file="col_model_interaction.pth")


Training...........
Epoch 0   Train loss: 0.046467066804567976    Train accuracy: 0.6666666666666666
Epoch 1   Train loss: 0.04621846963564555    Train accuracy: 0.7
Epoch 2   Train loss: 0.045756292343139646    Train accuracy: 0.8333333333333334


In [14]:
def getLogits(header, rows, query, row_model, col_model):
    criterion = nn.CrossEntropyLoss()
    batch_size = 16
    row_data, col_data = get_data_from_table(header, rows, query)
    row_labels = torch.zeros([len(row_data),2])
    col_labels = torch.zeros([len(col_data), 2])
    _, _, rowsLogits = test(row_model, row_data, criterion, batch_size, row_labels)
    _, _, colsLogits = test(col_model, col_data, criterion, batch_size, col_labels)
    return rowsLogits, colsLogits

def getScores(rowsLogits, colsLogits, top_k):
    scores = []
    for i in range(len(rowsLogits)):
        for j in range(len(colsLogits)):
            score = float(rowsLogits[i] + colsLogits[j])
            scores.append([i,j,score])
    scores.sort(key=lambda x: x[2], reverse=True)
    return scores[0:top_k]


def getQueryAnswers(query, header, rows, row_model, col_model):
    batch_size = 16
    max_seq_length=128
    rowsLogits, colsLogits = getLogits(header, rows, query, row_model, col_model)

    top_k = 3
    rci_scores = getScores(rowsLogits, colsLogits, top_k)
    # row_scores = getScores(rowsLogits, )

    return [{'row_ndx': i, 'col_ndx': j, 'confidence_score': score, 'text': rows[i][j]} for i, j, score in rci_scores]


In [54]:
header = ['Participant', 'Race', 'Date']
rows = [['Michael', 'Runathon', 'June 10, 2020'],
        ['Mustafa', 'Runathon', 'Sept 3, 2020'],
        ['Alfio', 'Runathon', 'Jan 1, 2021'],]
answers = getQueryAnswers('Who won the race in June?',header, rows, row_model_interaction, col_model_interaction)
print("Predicted cells along with their confidence scores and values")
for answer in answers:
        print(answer)

Predicted cells along with their confidence scores and values
{'row_ndx': 0, 'col_ndx': 0, 'confidence_score': 0.8965473175048828, 'text': 'Michael'}
{'row_ndx': 2, 'col_ndx': 0, 'confidence_score': 0.8920146822929382, 'text': 'Alfio'}
{'row_ndx': 0, 'col_ndx': 2, 'confidence_score': 0.8879615068435669, 'text': 'Jun 10, 2021'}


In [17]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [18]:
import re
from rank_bm25 import BM25Okapi

class pipeline:
    def __init__(self, row_model, col_model, file_path, data_file_names, representation_file_names, BM25_k=300, top_k=5):
        self.row_model = row_model
        self.col_model = col_model
        # self.file_path = file_path
        for file_name in data_file_names:
            self.data_file_names.append(file_path + file_name)
        for file_name in representation_file_names:
            self.representation_file_names.append(file_path + file_name)
        self.BM25_k = BM25_k
        self.top_k = top_k

    def read_file(self, in_file,binary=False,errors=None):
        if binary:
            if in_file.endswith('.gz'):
                return gzip.open(in_file,'rb')
            elif in_file.endswith('.bz2'):
                return bz2.open(in_file,'rb')
            else:
                return open(in_file,'rb')

        else:
            if in_file.endswith('.gz'):
                return gzip.open(in_file,'rt',encoding='utf-8',errors=errors)
            elif in_file.endswith('.bz2'):
                return bz2.open(in_file,'rt',encoding='utf-8',errors=errors)
            else:
                return open(in_file,'r',encoding='utf-8',errors=errors)

    def pre_process(self, path):
        di = {}
        punc_pattern = r"[!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~]"
        with read_file(path) as fp:
            for n1,line in enumerate(fp):
                data = json.loads(line)
                for k,v in data.items():
                    qid = k
                    header = v[0]
                    rows = v[1:]
                    # print(qid,header,rows)
                    header1 = []
                    for h in header:
                        res = re.sub(punc_pattern,' ',h)
                        res = re.sub("\s+",' ',res)
                        header1.extend(res.lower().split())

                    rows1 = []
                    for i in rows:
                        for j in i:
                            res = re.sub(punc_pattern,' ',j)
                            res = re.sub("\s+",' ',res)
                            rows1.extend(res.lower().split())

                    header1.extend(rows1)
                    # print(header1)
                    di[k] = header1
            return di


    def ranking_docs(self, query,di):
        tokenized_query = preprocess_query(query)
        bm25 = BM25Okapi(di.values())
        scores = bm25.get_scores(tokenized_query)
        ranked_documents = dict(sorted(zip(di.keys(), scores), key=lambda x: x[1], reverse=True))
        return ranked_documents

    def BM25(self, query):
        top = self.BM25_k
        paths = self.data_file_names
        di1 = pre_process(paths[0])
        di2 = pre_process(paths[1])
        di3 = pre_process(paths[2])
        ranked_doc1 = ranking_docs(query,di1)
        ranked_doc2 = ranking_docs(query,di2)
        ranked_doc3 = ranking_docs(query,di3)
        result = {**ranked_doc1,**ranked_doc2,**ranked_doc3}
        final_result = dict(list(sorted(result.items(), key=lambda x: x[1], reverse=True))[:top])

        tables = {}
        for i in paths:
            with read_file(i) as fp:
                for n1,line in enumerate(fp):
                    data = json.loads(line)
                    for k,v in data.items():
                        if(k in final_result.keys()):
                            tables[k] = v

        return tables

    def getColRepresentation(self, header, rows):
        colRepresentation = []
        cols = [[str(h)] for h in header]
        for row in rows:
            for ci, cell in enumerate(row):
                if cell:  # for sparse table use case
                    cols[ci].append(str(cell))
        for col in cols:
            col_rep = ' * '.join(col)
            colRepresentation.append(col_rep)
        return colRepresentation

    def getRowRepresentation(self, header, rows):
        rowRepresentation = []
        for row in rows:
            row_rep = ' * '.join([h + ' : ' + str(c) for h, c in zip(header, row) if c])  # for sparse table use case
            rowRepresentation.append(row_rep)
        return rowRepresentation

    def get_data_from_table(self, header, rows, query):
        data = {'row_input':[], 'col_input':[], 'row_queries': [], 'col_queries': []}
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        max_seq_length = 128
        batch_size = 16
        col_queries = [query]*len(header)
        row_queries = [query]*len(rows)
        colsRepresentation = getColRepresentation(header, rows)
        rowsRepresentation = getRowRepresentation(header, rows)
        data['col_input'].extend(colsRepresentation)
        data['row_input'].extend(rowsRepresentation)
        data['col_queries'].extend(col_queries)
        data['row_queries'].extend(row_queries)
        col_encoding = tokenizer(data['col_input'], data['col_queries'],return_tensors='pt', padding=True, truncation=True, max_length=max_seq_length)
        row_encoding = tokenizer(data['row_input'], data['row_queries'],return_tensors='pt', padding=True, truncation=True, max_length=max_seq_length)
        return row_encoding, col_encoding

    def getLogits(self, header, rows, query):
        criterion = nn.CrossEntropyLoss()
        batch_size = 16
        row_data, col_data = get_data_from_table(header, rows, query)
        row_labels = torch.zeros([len(row_data),2])
        col_labels = torch.zeros([len(col_data), 2])
        _, _, rowsLogits = test(self.row_model, row_data, criterion, batch_size, row_labels)
        _, _, colsLogits = test(self.col_model, col_data, criterion, batch_size, col_labels)
        return rowsLogits, colsLogits

    def getScores(self, rowsLogits, colsLogits, top_k):
        scores = []
        for i in range(len(rowsLogits)):
            for j in range(len(colsLogits)):
                score = float(rowsLogits[i] + colsLogits[j])
                scores.append([i,j,score])
        scores.sort(key=lambda x: x[2], reverse=True)
        return scores[0:top_k]

    def RCI(self, query, header, rows):
        batch_size = 16
        max_seq_length=128
        rowsLogits, colsLogits = getLogits(header, rows, query, row_model, col_model)

        top_k = 5
        rci_scores = getScores(rowsLogits, colsLogits, top_k)
        return [{'row_ndx': i, 'col_ndx': j, 'confidence_score': score, 'text': rows[i][j]} for i, j, score in rci_scores]

    def RCI_System(self, query):
        tables = BM25(query)
        table_dict = {}
        table_scores = {}
        results = {}
        for table in tables:
            header = table['header']
            rows = table['rows']
            id = table['id']
            table_dict[id] = {'header':header, 'rows': rows}
            cell_score = RCI(query, header, rows)
            results[id] = cell_score
            table_scores[id] = cell_score[0]['confidence_score']

        result_table_ids = sorted(table_scores, key=lambda k: table_scores[k], reverse=True)[:top_k]
        retrieved_results = []
        for id in result_table_ids:
            retrieved_results.append({'id':id, 'header':table_dict[id]['header'], 'rows':table_dict[id]['rows'], 'cells': results[id]})
        return retrieved_results





In [30]:
file_path = storage_path + "data/"
file_name = file_path + "test_lookup.jsonl"
tables = []
with open(file_name, "r") as file:
    # Read each line and parse it as JSON
    i=0
    for line in file:
        data = json.loads(line)
        tables.append(data)
        # print(data)
        # for key in data:
        #     print(key)
        #     print(data[key])
        #     print()
        # i+=1
        # if(i==1): break
    file.close()


print(tables[0])

{'id': 'q20', 'question': 'Which orbital event has midrange day and night periods and happens around Easter?', 'header': ['', 'ORBITAL EVENT', '', 'PERIOD OF DAYLIGHT', '', 'PERIOD OF NIGHT', ''], 'rows': [['The', 'summer solstice', 'is the day with the', 'longest', 'period of daylight and the', 'shortest', 'period of night'], ['The', 'winter solstice', 'is the day with the', 'shortest', 'period of daylight and the', 'longest', 'period of night'], ['The', 'spring equinox', 'is the day with the', 'midrange', 'period of daylight and the', 'midrange', 'period of night'], ['The', 'fall equinox', 'is the day with the', 'midrange', 'period of daylight and the', 'midrange', 'period of night']], 'target_column': 1, 'answers': ['Spring equinox'], 'table_id': 'regents-01'}


In [48]:
import time
rci_answers = []
true_answers = []
for table in tables[:25]:
    true_answers.extend(table['answers'])
    start_time = time.time()
    answers = getQueryAnswers(table['question'], table['header'], table['rows'], row_model_interaction, col_model_interaction)
    end_time = time.time()
    rci_answer = []
    for answer in answers:
        rci_answer.append(answer['text'])
    rci_answers.append(rci_answer)

total_time = end_time-start_time
print(rci_answers)
print(true_answers)
print("time per query is ", total_time/(len(true_answers)))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

[['spring equinox', 'fall equinox', 'summer solstice', 'winter solstice', 'The'], ['period of night', 'period of night', 'period of daylight and the', 'period of daylight and the', 'midrange'], ['midrange', 'shortest', 'longest', 'midrange', 'period of night'], ['midrange', 'longest', 'shortest', 'is the day with the', 'The'], ['midrange', 'period of daylight and the', 'period of night', 'midrange', 'is the day with the'], ['midrange', 'shortest', 'longest', 'midrange', 'midrange'], ['midrange', 'shortest', 'longest', 'midrange', 'longest'], ['midrange', 'period of night', 'midrange', 'midrange', 'period of daylight and the'], ['spring equinox', 'fall equinox', 'The', 'The', 'summer solstice'], ['fall equinox', 'spring equinox', 'The', 'The', 'winter solstice'], ['is the day with the', 'is the day with the', 'is the day with the', 'is the day with the', 'The'], ['midrange', 'midrange', 'period of daylight and the', 'period of night', 'longest'], ['midrange', 'longest', 'shortest', 'The

In [49]:
hit_at_1 = 0
for i in range(len(true_answers)):
    if(true_answers[i] == rci_answers[i][0]): hit_at_1 +=1
hit_at_1/= len(true_answers)
print(hit_at_1)

0.52


In [50]:
reciprocal_rank_sum = 0
for i in range(len(true_answers)):
    for j in range(len(rci_answers[i])):
        if(true_answers[i] == rci_answers[i][j]):
            print(1/(j+1))
            reciprocal_rank_sum += 1/(j+1)
            break
mrr = reciprocal_rank_sum/len(true_answers)
print(mrr)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.5
1.0
1.0
0.54


In [10]:
from transformers import AutoTokenizer, TapasForQuestionAnswering
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")

def get_answers_from_tapas(table):
    rows = table['rows']
    header = table['header']
    query = table['question']

    df = pd.DataFrame(rows, columns = header)

    inputs = tokenizer(table=df, queries=query, padding="max_length", return_tensors="pt")
    outputs = model(**inputs)

    logits = outputs.logits
    logits_aggregation = outputs.logits_aggregation


    flat_list = []
    for values in rows:
        # flat_list.append(key)
        for value in values:
            flat_list.append(value)
    # print(len(flat_list))
    total_cells = len(flat_list)
    logits = logits[0][:total_cells]
    indexes = sorted(range(len(logits)), key=lambda i: logits[i], reverse=True)[:5]
    # print(indexes)


    # pred = torch.argmax(logits[:, :total_cells], dim=1)
    # print(pred)
    answers = []
    for idx in indexes:
        # print(flat_list[idx])
        answers.append(flat_list[idx])
    return answers


In [12]:
import time
tapas_answers = []
true_answers = []
start_time = time.time()
for table in tables[:25]:
    true_answers.extend(table['answers'])
    answers = get_answers_from_tapas(table)
    tapas_answers.append(answers)
end_time = time.time()

total_time = end_time-start_time
print(tapas_answers)
print(true_answers)
print("time per query is ", total_time/(len(true_answers)))

[['period of night', 'period of daylight and the', 'midrange', 'is the day with the', 'midrange'], ['The', 'summer solstice', 'is the day with the', 'longest', 'period of daylight and the'], ['period of night', 'The', 'summer solstice', 'is the day with the', 'longest'], ['period of daylight and the', 'midrange', 'period of night', 'The', 'summer solstice'], ['midrange', 'period of daylight and the', 'midrange', 'period of night', 'is the day with the'], ['The', 'summer solstice', 'is the day with the', 'longest', 'period of daylight and the'], ['The', 'summer solstice', 'is the day with the', 'longest', 'period of daylight and the'], ['period of night', 'The', 'fall equinox', 'midrange', 'period of daylight and the'], ['period of night', 'The', 'The', 'summer solstice', 'is the day with the'], ['period of daylight and the', 'midrange', 'period of night', 'is the day with the', 'midrange'], ['The', 'summer solstice', 'is the day with the', 'longest', 'period of daylight and the'], ['sp

In [15]:
hit_at_1 = 0
for i in range(len(true_answers)):
    if(true_answers[i] == tapas_answers[i][0]): hit_at_1 +=1
hit_at_1/= len(true_answers)
print(hit_at_1)

0.43


In [16]:
reciprocal_rank_sum = 0
for i in range(len(true_answers)):
    for j in range(len(tapas_answers[i])):
        if(true_answers[i] == tapas_answers[i][j]):
            # print(1/(j+1))
            reciprocal_rank_sum += 1/(j+1)
            break
mrr = reciprocal_rank_sum/len(true_answers)
print(mrr)

0.47


In [None]:
from transformers import AutoTokenizer, TapasForQuestionAnswering
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

In [50]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

def find_answers(table, mrc_model_name='distilbert-base-cased-distilled-squad'):
    query = table['question']
    rows_ = table['rows']
    header_ = table['header']
    rows = ""
    for i in range(len(rows)):
        for j in range(len(rows[0])):
            rows += rows[i][j]
            if(j<len(rows[0])-1):
                rows += "\t"
        if(i< len(rows)-1): rows += "\n"

    header = ""
    for j in range(len(header)):
        rows += header[j]
        if(j<len(header)-1): header += "\t";
    # Load the MRC tokenizer
    tokenizer = AutoTokenizer.from_pretrained(mrc_model_name)

    # Tokenize the query, rows, and header separately
    question_tokens = tokenizer(query, return_tensors="pt")
    context_tokens = tokenizer(f"{header}\n{rows}", return_tensors="pt")

    # Combine the tokenized question and context
    inputs = {
        'input_ids': torch.cat([question_tokens['input_ids'], context_tokens['input_ids']], dim=1),
        'attention_mask': torch.cat([question_tokens['attention_mask'], context_tokens['attention_mask']], dim=1)
    }

    # Load the MRC model
    model = AutoModelForQuestionAnswering.from_pretrained(mrc_model_name)

    # Get model outputs
    outputs = model(**inputs)

    # Decode the answer from the model output
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(inputs['input_ids'][0][answer_start:answer_end])

    return answer

# Example usage
# query = "What is the capital of France?"
# header = "Country\tCapital\n"
# rows = "France\tParis\nGermany\tBerlin\nItaly\tRome"

# answer = find_answers(query, rows, header)
# print(answer)


In [52]:
import time
bert_answers = []
true_answers = []
start_time = time.time()
for table in tables[:25]:
    true_answers.extend(table['answers'])
    answers = find_answers(table)
    bert_answers.append(answers)
end_time = time.time()

total_time = end_time-start_time
print(bert_answers)
print(true_answers)
print("time per query is ", total_time/(len(true_answers)))

['Easter', 'fall equinox', 'daylight', 'daylight', 'equinox', 'night', 'spring equinox also has _ _ _ _ _ _ _ periods for both daylight and night', 'equinox', 'daylight and period of night', 'midrange period of night', 'daylight', '', 'daylight', 'fall equinox', 'the period of night of the fall equinox', 'equinox', 'equinox', 'summer solstice occurs in June', 'June', '', 'Heard Island', '[CLS] What country is located in the southern hemisphere? [SEP] [CLS]', 'East Timor', 'South Africa', 'South Africa']
['Spring equinox', 'spring equinox', 'midrange', 'midrange', 'midrange', 'midrange', 'midrange', 'Midrange', 'Fall Equinox and Spring Equinox', 'fall equinox', 'fall equinox', 'Midrange', 'midrange', 'midrange', 'midrange', 'midrange', 'midrange between day and night', 'northern hemisphere', 'Summer solstice', 'June', 'southern', 'East Timor', 'southern hemisphere', 'Southern', 'southern hemisphere']
time per query is  1.9667918491363525
