In [None]:
from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import pickle
import random
import re
import shutil

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch.utils.data.distributed import DistributedSampler

# Prefer the TensorBoard writer that ships with PyTorch and fall back to tensorboardX
# only when that import is unavailable. Catching ImportError (rather than everything)
# keeps unrelated failures such as KeyboardInterrupt from being silently swallowed.
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

from tqdm import trange
from tqdm.autonotebook import tqdm

from data_loader.hybrid_data_loaders import *
from data_loader.header_data_loaders import *
#from data_loader.CT_Wiki_data_loaders import *
from data_loader.RE_data_loaders import *
from data_loader.EL_data_loaders import *
from model.configuration import TableConfig
from model.model import HybridTableMaskedLM, HybridTableCER, TableHeaderRanking, HybridTableCT,HybridTableEL,HybridTableRE,BertRE
from model.transformers import BertConfig,BertTokenizer, WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup
from utils.util import *
from baselines.row_population.metric import average_precision,ndcg_at_k
from baselines.cell_filling.cell_filling import *
from model import metric

In [None]:
# Logger named after the current module.
logger = logging.getLogger(__name__)

# Task key -> (config class, model class, tokenizer class).
# The evaluation cells below unpack an entry as
# `config_class, model_class, _ = MODEL_CLASSES['CT']`.
MODEL_CLASSES = {
    'CER': (TableConfig, HybridTableCER, BertTokenizer),
    'CF' : (TableConfig, HybridTableMaskedLM, BertTokenizer),
    'HR': (TableConfig, TableHeaderRanking, BertTokenizer),
    'CT': (TableConfig, HybridTableCT, BertTokenizer),
    'EL': (TableConfig, HybridTableEL, BertTokenizer),
    'RE': (TableConfig, HybridTableRE, BertTokenizer),
    'REBERT': (BertConfig, BertRE, BertTokenizer)
}

In [None]:
# set data directory, this will be used to load test data
data_dir = 'data/wikitables_v2/'

In [None]:
# Path handed to `config_class.from_pretrained(...)` in the evaluation cells.
config_name = "configs/table-base-config_v2.json"
# Models and batches are moved to this device in the evaluation cells.
device = torch.device('cuda')
# load entity vocab from entity_vocab.txt
entity_vocab = load_entity_vocab(data_dir, ignore_bad_title=True, min_ent_count=2)
# Reverse lookup: the 'wiki_id' of each vocabulary entry -> that entry's key in entity_vocab.
entity_wikid2id = {entity_vocab[x]['wiki_id']:x for x in entity_vocab}

In [None]:
# Order GPUs by PCI bus id and expose only device 0 to this process.
# NOTE(review): torch is already imported when this cell runs; these variables
# presumably only take effect if CUDA has not been initialized yet -- confirm.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Table of Contents
This notebook shows examples of how to use the model components and how to run the evaluation for the different tasks.
* [Pretrained and Cell Filling](#cf)
* [Entity Linking](#el)
* [Column Type Classification](#ct)
* [Relation Extraction](#re)

<a class="anchor" id="ct"></a>
# CT
Evaluate column type annotation

In [None]:
# Column-type dataset and batch loader taken from the `CT_Wiki_data_loaders_STEER` module;
# both are used by the cells of this section.
from data_loader.CT_Wiki_data_loaders_STEER import WikiCTDataset, CTLoader
# Type vocabulary loaded from data_dir; its length is used as `config.class_num` below.
type_vocab = load_type_vocab(data_dir)

In [None]:
# Build the "labeled" split of the column-type dataset, with the additional
# STEER training data switched on.
data = WikiCTDataset(
    data_dir,
    entity_vocab,
    type_vocab,
    labeled_data_size=1,
    unlabeled_data_size="absolute",
    test_data_size=20.0,
    random_state=1,
    data_split_set="labeled",
    add_STEER_train_data=True,
    max_input_tok=500,
    src="train_dev_test",
    max_length=[50, 10, 10],
    force_new=False,
    tokenizer=None,
)

In [None]:
# Number of samples in the dataset (use the built-in instead of calling the dunder directly).
len(data)

In [None]:
len(data.df["table_id"].unique())

In [None]:
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler

# The notebook runs in a single process, so the data is always read in order.
# (The former `... if -1 == -1 else DistributedSampler(train_dataset)` could never take
# its else-branch, which also referenced the undefined name `train_dataset`.)
train_sampler = SequentialSampler(data)
train_dataloader = CTLoader(data, sampler=train_sampler, batch_size=1, is_train=True)

In [None]:
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler

# Smoke test: load a CT model from a checkpoint and print the labels of the first batch.
# Single-process notebook, so the data is always read sequentially (the former
# `-1 == -1` ternary could never pick the distributed sampler).
train_sampler = SequentialSampler(data)
train_dataloader = CTLoader(data, sampler=train_sampler, batch_size=1, is_train=True)

# `-1 not in [-1, 0]` is always False, i.e. the progress bar is always shown.
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
mode = 0
print(mode)
config_class, model_class, _ = MODEL_CLASSES['CT']
config = config_class.from_pretrained(config_name)
config.class_num = len(type_vocab)
config.mode = mode
model = model_class(config, is_simple=True)
# NOTE(review): `checkpoints` is only assigned in a later cell of this notebook, so this
# cell fails on a fresh kernel unless that cell has been executed first.
checkpoint = checkpoints[mode]
# map_location makes the load independent of the device the checkpoint was saved from.
checkpoint = torch.load(checkpoint, map_location=device)
model.load_state_dict(checkpoint)
model.to(device)
model.eval()
for step, batch in enumerate(epoch_iterator):
    # Only the first batch is inspected.
    if step > 0:
        break
    table_ids, input_tok, input_tok_type, input_tok_pos, input_tok_mask, \
            input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask, \
            column_entity_mask, column_header_mask, labels_mask, labels = batch
    print(labels)

In [None]:
# `json` has no explicit import before this cell (the first one appears further down
# the notebook), so import it here to keep the cell runnable on a fresh kernel.
import json

# Load the test tables and keep the record at index 4 as a single worked example.
with open(os.path.join(data_dir, 'test.table_col_type.json'), 'r') as f:
    testset = json.load(f)[4]
# Each record unpacks into eight fields.
table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations = testset

In [None]:
testset

In [None]:
type_annotations

In [None]:
# Show the type annotations of the columns at positions 0 and 2 only.
labeled_columns = [0, 2]
[type_annotations[column] for column in labeled_columns]

In [None]:
import json
import os
from os.path import join

import pandas as pd

# Machine-specific default location of the STEER working directory. An already
# exported WORKING_DIR takes precedence, so the notebook also runs on other machines.
os.environ.setdefault("WORKING_DIR", "/home/sanonymous/semantic_data_lake")
os.environ["TYPENAME"] = "type_turl"
corpus = "turl"
labeled_data_size = 1
unlabeled_data_size = "absolute"
test_data_size = 20.0
random_state = 1
gen_train_data = False
absolute_numbers = True
validation_on = "test"

valid_header_path = join(os.environ["WORKING_DIR"], "data", "extract", "out",
                         "valid_headers")

# load the valid headers with real sem. types
valid_header_file = f"{corpus}_{os.environ['TYPENAME']}_valid.json"
with open(join(valid_header_path, valid_header_file), "r") as file:
    valid_headers = json.load(file)

# Re-key the valid headers: keep the part between the first "_" and ".csv" of each key.
new_valid_headers = {
    table.split("_")[1].split(".csv")[0]: columns
    for table, columns in valid_headers.items()
}
valid_headers = new_valid_headers

labeled_unlabeled_test_split_path = join(os.environ["WORKING_DIR"], "data",
                                         "extract", "out",
                                         "labeled_unlabeled_test_split")

# load labeled data from labeled, unlabeled, test split file and use labeled and test data for clustering
split_file_name = f"{corpus}_{labeled_data_size}_{unlabeled_data_size}_{test_data_size}_{random_state}.json"
with open(join(labeled_unlabeled_test_split_path, split_file_name)) as f:
    labeled_unlabeled_test_split_file = json.load(f)

# The file handle is no longer needed once the JSON is parsed, so the lookups happen
# outside the `with` block.
labeled_data_ids = labeled_unlabeled_test_split_file[f"labeled{labeled_data_size}"]
if gen_train_data:
    unlabeled_key = "unlabeled" if absolute_numbers else f"unlabeled{unlabeled_data_size}"
    unlabeled_data_ids = labeled_unlabeled_test_split_file[unlabeled_key]
    print(f"Unlabeled Data: {len(unlabeled_data_ids)}")
test_key = validation_on if validation_on == "unlabeled" else f"{validation_on}{test_data_size}"
test_data_ids = labeled_unlabeled_test_split_file[test_key]

print(f"Labeled Data: {len(labeled_data_ids)}")
print(f"Test Data: {len(test_data_ids)}")

# One row per labeled column. Entries are split on "+" into a table part and a column
# part; the column index is stored as int so that it lines up with the integer
# `column` values parsed elsewhere in this notebook.
df = pd.DataFrame({
    "table_id": [entry.split("+")[0].split(".csv")[0].split("_")[1] for entry in labeled_data_ids],
    "column": [int(entry.split("+")[1].split("_")[1]) for entry in labeled_data_ids],
})




In [None]:
df.head(1)

In [None]:
### load additional STEER train data
gen_train_path = join(
    os.environ["WORKING_DIR"],
    "labeling_functions",
    "combined_LFs",
    "gen_training_data",
    f"{corpus}_gen_training_data_all_combined_maj_{labeled_data_size}_absolute_{test_data_size}_{random_state}.csv",
)

# The first row of the file is consumed as a header and replaced by these column names.
df_gen_train_data = pd.read_csv(
    gen_train_path,
    names=["table_id", "column", "dataset_id", "predicted_semantic_type"],
    header=0,
)
# Keep the part between the first "_" and ".csv" of the table file name.
df_gen_train_data["table_id"] = df_gen_train_data["table_id"].apply(
    lambda table_file: table_file.split(".csv")[0].split("_")[1]
)
# Turn the second "_"-separated part of the column name into an integer index.
df_gen_train_data["column"] = df_gen_train_data["column"].apply(
    lambda column_name: int(column_name.split("_")[1])
)

In [None]:
df_gen_train_data.head(1)

In [None]:
pd.concat([df, df_gen_train_data[["table_id", "column"]]], ignore_index=True)

In [None]:
# Read the "type_turl" list from types.json and write it, together with the row index,
# as a tab-separated file without header into data_dir.
types_json_path = join(os.environ["WORKING_DIR"], "data", "extract", "out", "valid_types", "types.json")
with open(types_json_path) as f:
    types = json.load(f)["type_turl"]
df_types = pd.DataFrame({"types": types})
df_types.to_csv(join(data_dir, "types_STEER.txt"), header=False, sep="\t")

In [None]:
def overwrite_to_one_valid_header(input_data, valid_headers):
    """Replace the type annotations of validated columns by a single semantic type.

    Args:
        input_data: 8-element table record ``[table_id, pgTitle, pgEnt, secTitle,
            caption, headers, entities, type_annotations]``.
        valid_headers: mapping ``table_id -> {"column_<i>": {"semanticType": ...}}``.

    Returns:
        A new 8-element list in which ``type_annotations[i]`` is ``[semanticType]`` for
        every column ``i`` listed for this table, or ``None`` when the table has no
        entry in ``valid_headers``. The caller's ``type_annotations`` list is not modified.
    """
    table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations = input_data

    valid_header_entry = valid_headers.get(table_id)
    if valid_header_entry is None:
        return None

    # Work on a shallow copy so that the input record stays untouched.
    type_annotations = list(type_annotations)
    for column_name, column_info in valid_header_entry.items():
        # Keys look like "column_<i>"; the number after the first "_" is the column index.
        column_id = int(column_name.split("_")[1])
        type_annotations[column_id] = [column_info["semanticType"]]

    return [table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations]


def filter_labeled_unlabeled_test_data(input_data, labeled_unlabeled_test):
    """Restrict a table record to the columns listed for it in the labeled split.

    Args:
        input_data: 8-element table record ``[table_id, pgTitle, pgEnt, secTitle,
            caption, headers, entities, type_annotations]``; ``entities`` holds one list
            per column with items of the form ``[[row, column], cell]``.
        labeled_unlabeled_test: split dictionary; only its ``"labeled1"`` list is read.
            Each entry has the form ``"<table file>+<prefix>_<column index>"``.

    Returns:
        A new 8-element list whose ``headers``, ``entities`` and ``type_annotations``
        contain only the labeled columns of this table (ascending order, with the
        entity column indexes renumbered from 0), or ``None`` when the table has no
        labeled column.
    """
    table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations = input_data

    # Collect the labeled column indexes of THIS table only. Taking the columns of every
    # table in the split would select wrong columns or index past the end of `headers`.
    # NOTE(review): the table id is the entry's file name minus its last four characters
    # (presumably ".csv"); other cells of this notebook additionally strip a
    # "<prefix>_" part from table ids -- confirm that both id formats agree.
    labeled_columns = set()
    for entry in labeled_unlabeled_test["labeled1"]:
        parts = entry.split("+")
        if parts[0][:-4] == table_id:
            labeled_columns.add(int(parts[1].split("_")[1]))
    if not labeled_columns:
        return None
    labeled_columns = sorted(labeled_columns)

    headers = [headers[old_col] for old_col in labeled_columns]
    # Renumber the column part of every entity index to the position in the filtered table.
    entities = [
        [[[row_i, new_col], cell] for (row_i, _), cell in entities[old_col]]
        for new_col, old_col in enumerate(labeled_columns)
    ]
    type_annotations = [type_annotations[old_col] for old_col in labeled_columns]
    return [table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations]


In [None]:
# Sven: Test WikiCTDataset
def process_single_CT(input_data, config):
    """Convert one raw table record into the array inputs used for column typing.

    The record is first passed through ``overwrite_to_one_valid_header`` (with the
    notebook-level ``valid_headers``) and ``filter_labeled_unlabeled_test_data`` (with
    the notebook-level ``labeled_unlabeled_test_split_file``).

    Args:
        input_data: 8-element record ``[table_id, pgTitle, pgEnt, secTitle, caption,
            headers, entities, type_annotations]``; ``entities`` holds one list per
            column with items ``[[row, column], [entity, entity_text]]``.
        config: object providing ``tokenizer``, ``entity_wikid2id``, ``type_vocab``,
            ``type_num``, ``max_column``, ``max_title_length``, ``max_header_length``
            and ``max_cell_length``.

    Returns:
        ``None`` when one of the two filters rejects the table, otherwise a 16-element
        list: table id, token ids, token types, token positions, (token-token mask,
        token-entity mask), number of tokens, entity ids, padded entity text tokens,
        entity text lengths, entity types, (entity-token mask, entity-entity mask),
        number of entities, column-header mask, column-entity mask, label matrix and
        number of label rows.
    """
    # Imported locally because the notebook has no explicit top-level `import itertools`.
    import itertools

    input_data = overwrite_to_one_valid_header(input_data, valid_headers)
    if input_data is None:
        return None
    input_data = filter_labeled_unlabeled_test_data(input_data, labeled_unlabeled_test_split_file)
    if input_data is None:
        return None
    table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations = input_data
    # Flatten the per-column lists, keeping at most config.max_column items of each column.
    entities = [z for column in entities for z in column[:config.max_column]]
    # -1 marks a page entity that is not part of the entity vocabulary.
    pgEnt = config.entity_wikid2id.get(pgEnt, -1)

    # ---- token side: page title, section title, (caption), then all headers ----
    tokenized_pgTitle = config.tokenizer.encode(pgTitle, max_length=config.max_title_length, add_special_tokens=False)
    tokenized_meta = tokenized_pgTitle + \
        config.tokenizer.encode(secTitle, max_length=config.max_title_length, add_special_tokens=False)
    if caption != secTitle:
        tokenized_meta += config.tokenizer.encode(caption, max_length=config.max_title_length, add_special_tokens=False)
    tokenized_headers = [config.tokenizer.encode(z, max_length=config.max_header_length, add_special_tokens=False) for z in headers]
    tokenized_meta_length = len(tokenized_meta)
    tokenized_headers_length = [len(z) for z in tokenized_headers]
    input_tok = list(tokenized_meta) + list(itertools.chain.from_iterable(tokenized_headers))
    # Positions restart at 0 for the metadata and for every header.
    input_tok_pos = list(range(tokenized_meta_length)) + \
        list(itertools.chain.from_iterable(range(z) for z in tokenized_headers_length))
    # Token type 0 = metadata, 1 = header.
    input_tok_type = [0]*tokenized_meta_length + [1]*sum(tokenized_headers_length)

    # ---- entity side ----
    input_ent = []
    input_ent_text = []
    input_ent_type = []
    column_en_map = {}  # column index -> positions of its entities in input_ent
    row_en_map = {}     # row index -> positions of its entities in input_ent
    for e_i, (index, cell) in enumerate(entities):
        entity, entity_text = cell
        entity = config.entity_wikid2id.get(entity, 0)
        tokenized_ent_text = config.tokenizer.encode(entity_text, max_length=config.max_cell_length, add_special_tokens=False)
        input_ent.append(entity)
        input_ent_text.append(tokenized_ent_text)
        input_ent_type.append(4)
        column_en_map.setdefault(index[1], []).append(e_i)
        row_en_map.setdefault(index[0], []).append(e_i)

    # create column entity mask
    column_entity_mask = np.zeros([len(type_annotations), len(input_ent)], dtype=int)
    for j in range(len(type_annotations)):
        # A column without any entity simply keeps an all-zero row instead of raising KeyError.
        for e_i_1 in column_en_map.get(j, ()):
            column_entity_mask[j, e_i_1] = 1
    # create column header mask
    start_i = 0
    header_span = {}
    column_header_mask = np.zeros([len(type_annotations), len(input_tok)], dtype=int)
    for j in range(len(type_annotations)):
        header_span[j] = (start_i, start_i+tokenized_headers_length[j])
        column_header_mask[j, tokenized_meta_length+header_span[j][0]:tokenized_meta_length+header_span[j][1]] = 1
        start_i += tokenized_headers_length[j]
    # create input mask
    tok_tok_mask = np.ones([len(input_tok), len(input_tok)], dtype=int)
    meta_ent_mask = np.ones([tokenized_meta_length, len(input_ent)], dtype=int)
    header_ent_mask = np.zeros([sum(tokenized_headers_length), len(input_ent)], dtype=int)

    # Header tokens are only connected to the entities of their own column.
    for e_i, (index, _) in enumerate(entities):
        header_ent_mask[header_span[index[1]][0]:header_span[index[1]][1], e_i] = 1
    ent_header_mask = np.transpose(header_ent_mask)

    input_tok_mask = [tok_tok_mask, np.concatenate([meta_ent_mask, header_ent_mask], axis=0)]
    ent_meta_mask = np.ones([len(input_ent), tokenized_meta_length], dtype=int)

    # Entities are connected to themselves and to entities of the same column or row.
    ent_ent_mask = np.eye(len(input_ent), dtype=int)
    for e_is in itertools.chain(column_en_map.values(), row_en_map.values()):
        for e_i_1 in e_is:
            for e_i_2 in e_is:
                ent_ent_mask[e_i_1, e_i_2] = 1
    input_ent_mask = [np.concatenate([ent_meta_mask, ent_header_mask], axis=1), ent_ent_mask]
    # prepend pgEnt to input_ent, input_ent[0] = pgEnt
    if pgEnt != -1:
        input_tok_mask[1] = np.concatenate([np.ones([len(input_tok), 1], dtype=int), input_tok_mask[1]], axis=1)
    else:
        input_tok_mask[1] = np.concatenate([np.zeros([len(input_tok), 1], dtype=int), input_tok_mask[1]], axis=1)
    input_ent = [pgEnt if pgEnt != -1 else 0] + input_ent
    input_ent_text = [tokenized_pgTitle[:config.max_cell_length]] + input_ent_text
    input_ent_type = [2] + input_ent_type

    new_input_ent_mask = [np.ones([len(input_ent), len(input_tok)], dtype=int), np.ones([len(input_ent), len(input_ent)], dtype=int)]
    new_input_ent_mask[0][1:, :] = input_ent_mask[0]
    new_input_ent_mask[1][1:, 1:] = input_ent_mask[1]
    if pgEnt == -1:
        # An unknown page entity is cut off from all other entities.
        new_input_ent_mask[1][:, 0] = 0
        new_input_ent_mask[1][0, :] = 0
    # The prepended page entity never belongs to a column.
    column_entity_mask = np.concatenate([np.zeros([len(type_annotations), 1], dtype=int), column_entity_mask], axis=1)

    input_ent_mask = new_input_ent_mask
    # Multi-hot label matrix: one row per column, one position per type.
    labels = np.zeros([len(type_annotations), config.type_num], dtype=int)
    for j, column_types in enumerate(type_annotations):
        for t in column_types:
            labels[j, config.type_vocab[t]] = 1
    # Empty entity texts are reported with length 1.
    input_ent_cell_length = [len(x) if len(x) != 0 else 1 for x in input_ent_text]
    max_cell_length = max(input_ent_cell_length)
    input_ent_text_padded = np.zeros([len(input_ent_text), max_cell_length], dtype=int)
    for i, x in enumerate(input_ent_text):
        input_ent_text_padded[i, :len(x)] = x

    return [table_id, np.array(input_tok), np.array(input_tok_type), np.array(input_tok_pos), (np.array(input_tok_mask[0]), np.array(input_tok_mask[1])), len(input_tok),
            np.array(input_ent), input_ent_text_padded, input_ent_cell_length, np.array(input_ent_type), (np.array(input_ent_mask[0]), np.array(input_ent_mask[1])), len(input_ent),
            column_header_mask, column_entity_mask, labels, len(labels)]



In [None]:
# Run the preprocessing on the single sample record loaded above.
# NOTE(review): `test_dataset` is only created in the following cell, so on a fresh
# kernel that cell has to be executed before this one.
process_single_CT(testset, test_dataset)

In [None]:
# load type vocab from type_vocab.txt
type_vocab = load_type_vocab(data_dir)
# Dataset built with src="test"; it is also the object passed as the `config`
# argument of process_single_CT in the preceding cell.
test_dataset = WikiCTDataset(data_dir, entity_vocab, type_vocab, max_input_tok=500, src="test", max_length = [50, 10, 10], force_new=False, tokenizer = None)

In [None]:
id2type = {idx:t for t, idx in type_vocab.items()}
t2d_invalid = set()

In [None]:
def average_precision(output, relevance_labels):
    """Compute the average precision of each row of scores.

    Note: this definition replaces the ``average_precision`` imported from
    ``baselines.row_population.metric`` at the top of the notebook.

    Args:
        output: score tensor; candidates are ranked along the last dimension.
        relevance_labels: tensor of the same shape with the relevance of each candidate.

    Returns:
        Tensor without the last dimension holding, per row, the sum of the precision
        values at the relevant ranks divided by the summed relevance (a zero sum is
        treated as 1, so rows without relevant items score 0).
    """
    with torch.no_grad():
        # Rank the candidates by descending score and reorder the labels accordingly.
        ranking = output.argsort(dim=-1, descending=True)
        ranked_relevance = relevance_labels.gather(-1, ranking).float()

        # Precision at every rank = relevant items seen so far / rank.
        hits_so_far = ranked_relevance.cumsum(dim=-1)
        ranks = torch.arange(start=1, end=hits_so_far.shape[-1] + 1, device=hits_so_far.device)
        precision_at_rank = hits_so_far / ranks[None, :]

        # Only ranks that hold a relevant item contribute.
        precision_at_hits = precision_at_rank * ranked_relevance

        relevant_count = ranked_relevance.sum(dim=-1)
        # Avoid a division by zero for rows without any relevant item.
        relevant_count[relevant_count == 0] = 1
        result = precision_at_hits.sum(dim=-1) / relevant_count

    return result

In [None]:
################################
## Evaluation TURL on TURL-Corpus
################################

from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from sklearn.metrics import classification_report
import json

from data_loader.CT_Wiki_data_loaders_STEER import WikiCTDataset, CTLoader
type_vocab = load_type_vocab(data_dir)

labeled_data_size = "14.30"
unlabeled_data_size = "65.70"
test_data_size = "20.00"
random_state = 2

# The test dataset is built once, with the first labeled/unlabeled sizes, and reused for
# every checkpoint evaluated below.
# NOTE(review): confirm that the "test" split does not depend on those two sizes.
data = WikiCTDataset(data_dir, entity_vocab, type_vocab, labeled_data_size=labeled_data_size, unlabeled_data_size=unlabeled_data_size, test_data_size=test_data_size,
                    random_state=random_state, data_split_set="test", max_input_tok=500, src="train_dev_test", max_length=[50, 10, 10], force_new=False, tokenizer=None)

# Evaluate one checkpoint per (labeled, unlabeled) size pair and store a
# classification report next to each checkpoint.
for labeled_data_size, unlabeled_data_size in [("14.30","65.70"),("14.56","65.44"),("14.74","65.26"),("14.86","65.14"),("14.89","65.11")]:
    train_sampler = SequentialSampler(data)
    train_dataloader = CTLoader(data, sampler=train_sampler, batch_size=1, is_train=True)

    # The former `disable=-1 not in [-1, 0]` always evaluated to False.
    epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
    mode = 1
    print(mode)
    config_class, model_class, _ = MODEL_CLASSES['CT']
    config = config_class.from_pretrained(config_name)
    config.class_num = len(type_vocab)
    config.mode = mode
    model = model_class(config, is_simple=True)
    model_path = f"output/CT/v2/{mode}/model_STEER_{labeled_data_size}_{unlabeled_data_size}_{test_data_size}_{random_state}_sameTraindata"
    # map_location makes the load independent of the device the checkpoint was saved from.
    checkpoint = torch.load(f"{model_path}/pytorch_model.bin", map_location=device)
    model.load_state_dict(checkpoint)
    model.to(device)
    model.eval()

    predicted_labels_all = []
    true_labels_all = []
    for step, batch in enumerate(epoch_iterator):
        # The first batch element (table ids) stays on the host; the remaining thirteen
        # elements are moved to the target device.
        table_ids, *batch_tensors = batch
        (input_tok, input_tok_type, input_tok_pos, input_tok_mask,
         input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
         column_entity_mask, column_header_mask, labels_mask, labels) = (
            tensor.to(device) for tensor in batch_tensors)

        if mode == 1:
            # Drop the leading token part of the entity mask and pass no token inputs.
            input_ent_mask = input_ent_mask[:,:,input_tok_mask.shape[1]:]
            input_tok = None
            input_tok_type = None
            input_tok_pos = None
            input_tok_mask = None

        with torch.no_grad():
            outputs = model(input_tok, input_tok_type, input_tok_pos, input_tok_mask,
                            input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
                            column_entity_mask, column_header_mask, labels_mask, labels)
        # outputs[0] is the loss (unused here), outputs[1] holds the prediction scores.
        prediction_scores = outputs[1]
        # Predicted class per row = position of the highest score.
        predicted_labels_all.extend(prediction_scores.view(-1, config.class_num).argmax(dim=-1).tolist())
        # True class per row = position of the first 1 in the label row; a row without
        # any 1 still raises ValueError, exactly as before.
        true_labels = labels.view(-1, config.class_num).tolist()
        true_labels_all.extend([true_label.index(1.0) for true_label in true_labels])

    class_report = classification_report(true_labels_all, predicted_labels_all, output_dict=True)
    with open(f"{model_path}/classification_report.json", "w") as outfile:
        json.dump(class_report, outfile)

In [None]:
# Evaluate the single STEER checkpoint (mode 0) and store its classification report.
labeled_data_size = 1
unlabeled_data_size = "absolute"
test_data_size = 20.0
random_state = 2

# NOTE(review): the dataset construction below is commented out, so this cell evaluates
# on whatever `data` was built by a previously executed cell -- confirm that this is
# the intended test split for the settings above.
#data = WikiCTDataset(data_dir, entity_vocab, type_vocab, labeled_data_size=labeled_data_size, unlabeled_data_size="absolute", test_data_size=test_data_size,
#                     random_state=random_state, data_split_set="test", max_input_tok=500, src="train_dev_test", max_length=[50, 10, 10], force_new=False, tokenizer=None)

train_sampler = SequentialSampler(data)
train_dataloader = CTLoader(data, sampler=train_sampler, batch_size=1, is_train=True)

# The former `disable=-1 not in [-1, 0]` always evaluated to False.
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
mode = 0
print(mode)
config_class, model_class, _ = MODEL_CLASSES['CT']
config = config_class.from_pretrained(config_name)
config.class_num = len(type_vocab)
config.mode = mode
model = model_class(config, is_simple=True)
model_path = f"output/CT/v2/{mode}/model_STEER_{labeled_data_size}_absolute_{test_data_size}_{random_state}_STEER"
# map_location makes the load independent of the device the checkpoint was saved from.
checkpoint = torch.load(f"{model_path}/pytorch_model.bin", map_location=device)
model.load_state_dict(checkpoint)
model.to(device)
model.eval()

predicted_labels_all = []
true_labels_all = []
for step, batch in enumerate(epoch_iterator):
    # The first batch element (table ids) stays on the host; the remaining thirteen
    # elements are moved to the target device.
    table_ids, *batch_tensors = batch
    (input_tok, input_tok_type, input_tok_pos, input_tok_mask,
     input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
     column_entity_mask, column_header_mask, labels_mask, labels) = (
        tensor.to(device) for tensor in batch_tensors)

    if mode == 1:
        # Drop the leading token part of the entity mask and pass no token inputs.
        input_ent_mask = input_ent_mask[:,:,input_tok_mask.shape[1]:]
        input_tok = None
        input_tok_type = None
        input_tok_pos = None
        input_tok_mask = None

    with torch.no_grad():
        outputs = model(input_tok, input_tok_type, input_tok_pos, input_tok_mask,
                        input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
                        column_entity_mask, column_header_mask, labels_mask, labels)
    # outputs[0] is the loss (unused here), outputs[1] holds the prediction scores.
    prediction_scores = outputs[1]
    # Predicted class per row = position of the highest score.
    predicted_labels_all.extend(prediction_scores.view(-1, config.class_num).argmax(dim=-1).tolist())
    # True class per row = position of the first 1 in the label row; a row without
    # any 1 still raises ValueError, exactly as before.
    true_labels = labels.view(-1, config.class_num).tolist()
    true_labels_all.extend([true_label.index(1.0) for true_label in true_labels])

class_report = classification_report(true_labels_all, predicted_labels_all, output_dict=True)
with open(f"{model_path}/classification_report.json", "w") as outfile:
    json.dump(class_report, outfile)

In [None]:
from tqdm.autonotebook import tqdm

# Checkpoints to evaluate; every entry gets its own evaluation run.
# Earlier experiments used one checkpoint per input mode, e.g.
#   output/CT/v2/<mode>/model_v1_table_0.2_0.6_0.7_10000_1e-4_candnew_0_adam/pytorch_model.bin
#   (mode 0 used .../checkpoint-50000/pytorch_model.bin)
# or the provided models output/provided/column_type/{0,1}/pytorch_model.bin.
checkpoints = [
    "output/CT/v2/0/model_STEER_1_absolute_20.0_2/pytorch_model.bin",
    "output/CT/v2/0/model_STEER_2_absolute_20.0_2/pytorch_model.bin",
]
# Input mode used for each checkpoint above (same order). To evaluate one
# checkpoint per mode again, use list(range(6)) together with six checkpoints.
checkpoint_modes = [0, 0]
assert len(checkpoint_modes) == len(checkpoints)

# Results are keyed by the position of the checkpoint in `checkpoints`, so two
# checkpoints that share a mode do not overwrite each other. The previous loop
# (`for mode in [0,0]` + `checkpoints[mode]`) loaded the first checkpoint twice
# and never evaluated the second one.
for run_idx, (mode, checkpoint_path) in enumerate(zip(checkpoint_modes, checkpoints)):
    print(run_idx, mode, checkpoint_path)
    config_class, model_class, _ = MODEL_CLASSES['CT']
    config = config_class.from_pretrained(config_name)
    config.class_num = len(type_vocab)
    config.mode = mode
    model = model_class(config, is_simple=True)
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint)
    model.to(device)
    model.eval()
    eval_batch_size = 20
    eval_sampler = SequentialSampler(data)
    eval_dataloader = CTLoader(data, sampler=eval_sampler, batch_size=eval_batch_size, is_train=False)
    eval_loss = 0.0
    eval_map = 0.0
    nb_eval_steps = 0
    eval_targets = []
    eval_prediction_scores = []
    eval_pred = []
    eval_mask = []
    per_table_result[run_idx] = {}
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # The first batch element holds the table ids; the remaining 13 are tensors.
        table_ids, *batch_tensors = batch
        (input_tok, input_tok_type, input_tok_pos, input_tok_mask,
         input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
         column_entity_mask, column_header_mask, labels_mask, labels) = [
            tensor.to(device) for tensor in batch_tensors]

        # Input ablations: inputs set to None are not fed to the model.
        if mode in (1, 4, 5):
            # No token inputs; keep only the entity-mask columns that come after
            # the first `input_tok_mask.shape[1]` ones.
            input_ent_mask = input_ent_mask[:, :, input_tok_mask.shape[1]:]
            input_tok = input_tok_type = input_tok_pos = input_tok_mask = None
            if mode == 4:
                input_ent = None
            elif mode == 5:
                input_ent_text = input_ent_text_length = None
        elif mode == 2:
            # No entity inputs; keep only the first `input_tok_mask.shape[1]` mask columns.
            input_tok_mask = input_tok_mask[:, :, :input_tok_mask.shape[1]]
            input_ent_text = input_ent_text_length = None
            input_ent = input_ent_type = input_ent_mask = None
        elif mode == 3:
            input_ent = None

        with torch.no_grad():
            outputs = model(input_tok, input_tok_type, input_tok_pos, input_tok_mask,
                            input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
                            column_entity_mask, column_header_mask, labels_mask, labels)
            loss = outputs[0]
            prediction_scores = outputs[1]
            # Types listed in t2d_invalid can never win: push their scores far down.
            for l_i in t2d_invalid:
                prediction_scores[:, :, l_i] = -1000
            for idx, table_id in enumerate(table_ids):
                # Number of columns up to (and including) the last labeled one.
                # NOTE: raises if a table has no labeled column at all.
                valid = labels_mask[idx].nonzero().max().item() + 1
                if table_id not in per_table_result[run_idx]:
                    per_table_result[run_idx][table_id] = [[], labels_mask[idx, :valid], labels[idx, :valid]]
                per_table_result[run_idx][table_id][0].append(prediction_scores[idx, :valid])
            ap = metric.average_precision(prediction_scores.view(-1, config.class_num), labels.view((-1, config.class_num)))
            # `batch_map` instead of `map`, so the builtin `map` is not shadowed.
            batch_map = (ap * labels_mask.view(-1)).sum() / labels_mask.sum()
            eval_loss += loss.mean().item()
            eval_map += batch_map.item()
            eval_targets.extend(labels.view(-1, config.class_num).tolist())
            eval_prediction_scores.extend(prediction_scores.view(-1, config.class_num).tolist())
            eval_pred.extend((torch.sigmoid(prediction_scores.view(-1, config.class_num)) > 0.5).tolist())
            eval_mask.extend(labels_mask.view(-1).tolist())
        nb_eval_steps += 1
    print(eval_map / nb_eval_steps)

    eval_targets = np.array(eval_targets)
    eval_prediction_scores = np.array(eval_prediction_scores)
    eval_mask = np.array(eval_mask)
    # Rank of every type within its row (0 = highest score).
    eval_prediction_ranks = np.argsort(np.argsort(-eval_prediction_scores))
    eval_pred = np.array(eval_pred)
    eval_tp = eval_mask[:, np.newaxis] * eval_pred * eval_targets
    # 0/0 (type never predicted / never present) yields NaN. The fill value has to
    # be passed as `nan=`: the second positional argument of np.nan_to_num is
    # `copy`, so the former `np.nan_to_num(x, 1)` silently filled with 0.0.
    eval_precision = np.sum(eval_tp, axis=0) / np.sum(eval_mask[:, np.newaxis] * eval_pred, axis=0)
    eval_precision = np.nan_to_num(eval_precision, nan=1.0)
    eval_recall = np.sum(eval_tp, axis=0) / np.sum(eval_mask[:, np.newaxis] * eval_targets, axis=0)
    eval_recall = np.nan_to_num(eval_recall, nan=1.0)
    eval_f1 = 2 * eval_precision * eval_recall / (eval_precision + eval_recall)
    eval_f1 = np.nan_to_num(eval_f1, nan=0.0)
    per_type_instance_num = np.sum(eval_mask[:, np.newaxis] * eval_targets, axis=0)
    per_type_instance_num[per_type_instance_num == 0] = 1  # avoid division by zero below
    # A gold type counts as correct when its rank is below the number of gold types of that row.
    per_type_correct_instance_num = np.sum(
        eval_mask[:, np.newaxis] * (eval_prediction_ranks < eval_targets.sum(axis=1)[:, np.newaxis]) * eval_targets,
        axis=0)
    per_type_accuracy[run_idx] = per_type_correct_instance_num / per_type_instance_num
    per_type_precision[run_idx] = eval_precision
    per_type_recall[run_idx] = eval_recall
    per_type_f1[run_idx] = eval_f1
    # Micro-averaged scores over all masked (column, type) pairs.
    precision[run_idx] = np.sum(eval_tp) / np.sum(eval_mask[:, np.newaxis] * eval_pred)
    recall[run_idx] = np.sum(eval_tp) / np.sum(eval_mask[:, np.newaxis] * eval_targets)
    f1[run_idx] = 2 * precision[run_idx] * recall[run_idx] / (precision[run_idx] + recall[run_idx])

In [None]:
# Top-1 accuracy per labeled column for the results stored under key 0, averaging
# the score tensors collected for the same table before taking the argmax.
errors = []  # ids of tables with at least one column that was not counted as correct
total_corr, total_valid = 0, 0
for table_id, result in per_table_result[0].items():
    prediction_scores, label_mask, label = result
    prediction_scores = torch.stack(prediction_scores, 0).mean(0)
    top_types = prediction_scores.argmax(-1).tolist()
    # label[col, type] is read as the "is a gold type" indicator of the top prediction.
    current_corr = sum(label[col_idx, pred].item() for col_idx, pred in enumerate(top_types))
    n_labeled = label_mask.sum().item()
    total_valid += n_labeled
    total_corr += current_corr
    if current_corr != n_labeled:
        errors.append(table_id)
print(total_corr/total_valid, total_valid)

In [None]:
# Per-type report, most frequent types first: type name, instance count and the
# F1 stored under keys 0 and 1 of per_type_f1.
types_by_frequency = sorted(type_vocab.items(), key=lambda item: -per_type_instance_num[item[1]])
for t, i in types_by_frequency:
    #print('%s %.4f %.4f %.4f %.4f %.4f  %.4f %.4f'%(t, per_type_instance_num[i], per_type_f1[0][i], per_type_f1[4][i], per_type_f1[1][i], per_type_f1[3][i], per_type_f1[2][i], per_type_f1[5][i]))
    report_row = (t, per_type_instance_num[i], per_type_f1[0][i], per_type_f1[1][i])
    print('%s %.4f %.4f %.4f' % report_row)
    print()

In [None]:
f1

In [None]:
precision

In [None]:
recall

A type mapping translates the column types used by other datasets (e.g. T2D) into our type vocabulary, so we can evaluate on those datasets directly without retraining our model.

In [None]:
t2d_type_mapping = {
    'Election': ['government.election'],
    'Film': ['film.film'],
    'mountain': ['geography.mountain'],
    'Building': ['architecture.building'],
    'RadioStation': ['broadcast.radio_station'],
    'TelevisionShow': ['tv.tv_program'],
    'Country': ['location.country'],
    'Airport': ['aviation.airport'],
    'AdministrativeRegion': ['location.region'],
    'University': ['education.university'],
    'Newspaper': ['book.newspaper'],
    'FictionalCharacter': ['fictional_universe.fictional_character'],
    'Currency': ['finance.currency'],
    'Novel': ['book.book'],
    'Wrestler': ['sports.pro_athlete'],
    'swimmer': ['sports.pro_athlete'],
    'GolfPlayer': ['sports.golfer', 'sports.pro_athlete'],
    'Book': ['book.book'],
    'Political Party': ['government.political_party'],
    'Person': ['people.person'],
    'VideoGame': ['cvg.computer_videogame'],
    'Animal': ['biology.animal'],
    'PoliticalParty': ['government.political_party'],
    'BaseballPlayer': ['sports.pro_athlete'],
    'Monarch': ['royalty.monarch'],
    'Mountain': ['geography.mountain'],
    'City': ['location.citytown'],
    'Company': ['business.consumer_company'],
    'cricketer': ['sports.pro_athlete'],
    'Airline': ['aviation.airline']
}
# All of our types that appear on the right-hand side of the mapping ...
t2d_types = {our_type for our_types in t2d_type_mapping.values() for our_type in our_types}
# ... and the vocabulary ids of every type that does not.
t2d_invalid = [i for t, i in type_vocab.items() if t not in t2d_types]

In [None]:
t2d_type_mapping = {
    'City': ['location.citytown'],
    'VideoGame': ['cvg.computer_videogame'],
    'Mountain': ['geography.mountain'],
    'Museum': [],
    'Writer': ['film.writer', 'tv.tv_writer', 'music.writer', 'book.author'],
    'Lake': [], 
    'AdministrativeRegion': ['location.administrative_division'],
    'Book': ['book.book'],
    'Saint': [],
    'Monarch': ['royalty.monarch'],
    'Bird': [],
    'Plant': [],
    'Mayor': [],
    'Currency': ['finance.currency'],
    'MovieDirector': ['film.director'],
    'Company': ['film.production_company', 'automotive.company', 'business.consumer_company', 'business.defunct_company', ],
    'Genre': ['cvg.cvg_genre', 'film.film_genre', 'broadcast.genre', 'media_common.media_genre', 'tv.tv_genre', 'music.genre'],
    'GovernmentType': ['government.governmental_body'],
    'Hospital': [],
    'Building': ['architecture.building'],
    'PoliticalParty': ['government.political_party'],
    'Language': ['language.human_language'],
    'Country': ['location.country'],
    'University': ['education.university'],
    'SportsTeam': ['sports.sports_team'],
    'RadioStation': ['broadcast.radio_station'],
    'Airport': ['aviation.airport'],
    'Airline': ['aviation.airline'],
    'Wrestler': [],
    'Newspaper': ['book.newspaper'],
    'Mammal': [],
    'MountainRange': [],
    'BaseballPlayer': ['baseball.baseball_player'],
    'AcademicJournal': [],
    'Scientist': [],
    'Continent': [],
    'Film': ['film.film']
}

# All of our types that appear on the right-hand side of the mapping ...
t2d_types = {our_type for our_types in t2d_type_mapping.values() for our_type in our_types}
# ... and the vocabulary ids of every type that does not.
t2d_invalid = [i for t, i in type_vocab.items() if t not in t2d_types]

In [None]:
t2d_type_mapping = {
    'Film': ['film.film'],
    'Lake': [],
    'Language': ['language.human_language'],
    'Country': ['location.country'],
    'Company': ['film.production_company', 'automotive.company', 'business.consumer_company', 'business.defunct_company'],
    'Person': ['people.person'],
    'VideoGame': ['cvg.computer_videogame'],
    'City': ['location.citytown'],
    'Currency': ['finance.currency'],
    'Bird': [],
    'Mountain': ['geography.mountain'],
    'Scientist': [],
    'Plant': [],
    'TelevisionShow': ['tv.tv_program'],
    'Animal': [],
    'AdministrativeRegion': ['location.administrative_division'],
    'Genre': ['cvg.cvg_genre', 'film.film_genre', 'broadcast.genre', 'media_common.media_genre', 'tv.tv_genre', 'music.genre'],
    'Newspaper': ['book.newspaper'],
    'Airport': ['aviation.airport'],
    'AcademicJournal': [],
    'PopulatedPlace': [],
    'Wrestler': [],
    'PoliticalParty': ['government.political_party'],
    'Cricketer': ['cricket.cricket_player'],
    'Eukaryote': [],
    'Saint': [],
    'Writer': ['film.writer', 'tv.tv_writer', 'music.writer', 'book.author'],
    'Museum': [],
    'BaseballPlayer': ['baseball.baseball_player'],
    'EducationalInstitution': ['education.educational_institution'],
    'GovernmentType': ['government.governmental_body'],
    'SportsTeam': ['sports.sports_team'],   
}

# Inverse lookup: each of our types -> the external type it is listed under.
# A type listed under several external types keeps the last one in dict order.
reverse_type_mapping = {}
for external_type, our_types in t2d_type_mapping.items():
    for our_type in our_types:
        reverse_type_mapping[our_type] = external_type

# Our types covered by the mapping, and the vocabulary ids of all other types.
t2d_types = set(reverse_type_mapping)
t2d_invalid = [i for t, i in type_vocab.items() if t not in t2d_types]

In [None]:
errors

In [None]:
# Micro-averaged precision / recall / F1 in the external type space: gold and
# predicted types are translated through reverse_type_mapping and compared as
# sets per labeled column. Reads the results stored under key 4 of
# per_table_result; a type is predicted when its averaged score is > 0.
p = 0      # number of gold (column, external type) pairs
pred = 0   # number of predicted (column, external type) pairs
tp = 0     # number of pairs that are both gold and predicted


def _to_external_types(type_ids):
    """Translate vocabulary ids to external types, dropping ids without a mapping."""
    return {reverse_type_mapping[id2type[t]] for t in type_ids if id2type[t] in reverse_type_mapping}


for table_id, result in per_table_result[4].items():
    prediction_scores, label_mask, label = result
    prediction_scores = torch.stack(prediction_scores, 0).mean(0)
    for col_idx in range(label.shape[0]):
        if label_mask[col_idx] == 0:
            continue
        # Tensor.nonzero() returns one row per nonzero entry, so `.nonzero()[0]`
        # (a numpy idiom) only yielded the FIRST index. Flatten to get all of them;
        # this also handles columns without any nonzero entry.
        gt_ids = label[col_idx].nonzero().reshape(-1).tolist()
        pred_ids = (prediction_scores[col_idx] > 0).nonzero().reshape(-1).tolist()
        gt_t = _to_external_types(gt_ids)
        pred_t = _to_external_types(pred_ids)
        p += len(gt_t)
        pred += len(pred_t)
        tp += len(gt_t & pred_t)

# Guard the ratios so an empty result set reports 0 instead of raising.
precision = tp/pred if pred else 0.0
recall = tp/p if p else 0.0
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print(f1,precision,recall)

In [None]:
p

In [None]:
pred

In [None]:
tp

In [None]:
label[1].nonzero()[0].tolist()

In [None]:
1 if label_mask[1]==0 else 0

In [None]:
type_vocab

In [None]:
per_table_result['64499281_8_7181683886563136802'][0][1].argsort(-1)

# CT - Semtab

In [None]:
# Switch the notebook to the Semtab data: from here on `data_dir` and
# `type_vocab` refer to Semtab, replacing the values set by earlier cells.
data_dir = 'data/Semtab'
type_vocab = load_type_vocab(data_dir)
# Test split read from source "wiki_test30". `entity_vocab` is whatever an
# earlier cell loaded (NOTE(review): presumably the wikitables_v2 vocabulary — confirm).
test_dataset = WikiCTDataset(data_dir, entity_vocab, type_vocab, max_input_tok=500, src="wiki_test30", max_length = [50, 10, 10], force_new=False, tokenizer = None)

In [None]:
len(type_vocab)

In [None]:
# Inverse of type_vocab: vocabulary id -> type name.
id2type = {type_id: type_name for type_name, type_id in type_vocab.items()}

In [None]:
def average_precision(output, relevance_labels):
    """Average precision of each row's ranking.

    Args:
        output: score tensor; candidates are ranked along the last dimension,
            highest score first.
        relevance_labels: tensor with the same shape as ``output`` holding the
            relevance of each candidate (cast to float after reordering).

    Returns:
        Tensor with the last dimension reduced: the precision at every relevant
        position, summed and divided by the total relevance of the row. Rows
        without any relevant candidate yield 0.
    """
    with torch.no_grad():
        ranking = output.argsort(dim=-1, descending=True)
        ranked_hits = relevance_labels.gather(-1, ranking).float()
        hits_so_far = ranked_hits.cumsum(dim=-1)
        # 1-based rank positions, broadcast over the leading dimension.
        positions = torch.arange(1, hits_so_far.shape[-1] + 1, device=hits_so_far.device)
        # Precision at k, kept only at the positions that hold a relevant item.
        precision_at_hits = (hits_so_far / positions[None, :]) * ranked_hits
        n_relevant = ranked_hits.sum(dim=-1)
        # Rows without relevant items would divide by zero; their numerator is 0 anyway.
        n_relevant[n_relevant == 0] = 1
        scores = precision_at_hits.sum(dim=-1) / n_relevant

    return scores

In [None]:
# Result containers filled by the evaluation cells, keyed by run / mode.
# Re-running this cell discards all previously collected results.
per_type_accuracy = {}
per_type_precision = {}
per_type_recall = {}
per_type_f1 = {}
# NOTE(review): this name shadows the builtin `map` for the rest of the kernel
# session, and no cell visible here reads it — consider renaming.
map = {}
# Overall precision / recall / F1 per run.
precision = {}
recall = {}
f1 = {}

In [None]:
from tqdm.autonotebook import tqdm

# Evaluate one Semtab checkpoint on `test_dataset`; metrics are stored under
# key `mode` in the result dicts, per-table scores in `per_table_result`.
checkpoint = "output/CT/Semtab/wiki_train70/4/model_v1_table_0.2_0.6_0.7_10000_1e-4_candnew_0_adam/pytorch_model.bin"
mode = 4
print(mode)
config_class, model_class, _ = MODEL_CLASSES['CT']
config = config_class.from_pretrained(config_name)
config.class_num = len(type_vocab)
config.mode = mode
model = model_class(config, is_simple=True)
checkpoint = torch.load(checkpoint)
model.load_state_dict(checkpoint)
model.to(device)
model.eval()
eval_batch_size = 20
eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = CTLoader(test_dataset, sampler=eval_sampler, batch_size=eval_batch_size, is_train=False)
eval_loss = 0.0
eval_map = 0.0
nb_eval_steps = 0
eval_targets = []
eval_prediction_scores = []
eval_pred = []
eval_mask = []
per_table_result = {}
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    # The first batch element holds the table ids; the remaining 13 are tensors.
    table_ids, *batch_tensors = batch
    (input_tok, input_tok_type, input_tok_pos, input_tok_mask,
     input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
     column_entity_mask, column_header_mask, labels_mask, labels) = [
        tensor.to(device) for tensor in batch_tensors]

    # Input ablations: inputs set to None are not fed to the model.
    if mode in (1, 4, 5):
        # No token inputs; keep only the entity-mask columns that come after
        # the first `input_tok_mask.shape[1]` ones.
        input_ent_mask = input_ent_mask[:, :, input_tok_mask.shape[1]:]
        input_tok = input_tok_type = input_tok_pos = input_tok_mask = None
        if mode == 4:
            input_ent = None
        elif mode == 5:
            input_ent_text = input_ent_text_length = None
    elif mode == 2:
        # No entity inputs; keep only the first `input_tok_mask.shape[1]` mask columns.
        input_tok_mask = input_tok_mask[:, :, :input_tok_mask.shape[1]]
        input_ent_text = input_ent_text_length = None
        input_ent = input_ent_type = input_ent_mask = None
    elif mode == 3:
        input_ent = None

    with torch.no_grad():
        outputs = model(input_tok, input_tok_type, input_tok_pos, input_tok_mask,
                        input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask,
                        column_entity_mask, column_header_mask, labels_mask, labels)
        loss = outputs[0]
        prediction_scores = outputs[1]
        for idx, table_id in enumerate(table_ids):
            # Number of columns up to (and including) the last labeled one.
            # NOTE: raises if a table has no labeled column at all.
            valid = labels_mask[idx].nonzero().max().item() + 1
            if table_id not in per_table_result:
                per_table_result[table_id] = [[], labels_mask[idx, :valid], labels[idx, :valid]]
            per_table_result[table_id][0].append(prediction_scores[idx, :valid])

        eval_loss += loss.mean().item()
        flat_scores = prediction_scores.view(-1, config.class_num)
        eval_targets.extend(labels.view(-1, config.class_num).tolist())
        eval_prediction_scores.extend(flat_scores.tolist())
        # Single-label decision: a type is predicted when it reaches the row maximum.
        eval_pred.extend((flat_scores == flat_scores.max(-1)[0][:, None]).tolist())
        eval_mask.extend(labels_mask.view(-1).tolist())
    nb_eval_steps += 1

eval_targets = np.array(eval_targets)
eval_prediction_scores = np.array(eval_prediction_scores)
eval_mask = np.array(eval_mask)
# Rank of every type within its row (0 = highest score).
eval_prediction_ranks = np.argsort(np.argsort(-eval_prediction_scores))
eval_pred = np.array(eval_pred)
eval_tp = eval_mask[:, np.newaxis] * eval_pred * eval_targets
# 0/0 (type never predicted / never present) yields NaN. The fill value has to be
# passed as `nan=`: the second positional argument of np.nan_to_num is `copy`,
# so the former `np.nan_to_num(x, 1)` silently filled with 0.0.
eval_precision = np.sum(eval_tp, axis=0) / np.sum(eval_mask[:, np.newaxis] * eval_pred, axis=0)
eval_precision = np.nan_to_num(eval_precision, nan=1.0)
eval_recall = np.sum(eval_tp, axis=0) / np.sum(eval_mask[:, np.newaxis] * eval_targets, axis=0)
eval_recall = np.nan_to_num(eval_recall, nan=1.0)
eval_f1 = 2 * eval_precision * eval_recall / (eval_precision + eval_recall)
eval_f1 = np.nan_to_num(eval_f1, nan=0.0)
per_type_instance_num = np.sum(eval_mask[:, np.newaxis] * eval_targets, axis=0)
per_type_instance_num[per_type_instance_num == 0] = 1  # avoid division by zero below
# A gold type counts as correct when its rank is below the number of gold types of that row.
per_type_correct_instance_num = np.sum(
    eval_mask[:, np.newaxis] * (eval_prediction_ranks < eval_targets.sum(axis=1)[:, np.newaxis]) * eval_targets,
    axis=0)
per_type_accuracy[mode] = per_type_correct_instance_num / per_type_instance_num
per_type_precision[mode] = eval_precision
per_type_recall[mode] = eval_recall
per_type_f1[mode] = eval_f1
# Micro-averaged scores over all masked (column, type) pairs.
precision[mode] = np.sum(eval_tp) / np.sum(eval_mask[:, np.newaxis] * eval_pred)
recall[mode] = np.sum(eval_tp) / np.sum(eval_mask[:, np.newaxis] * eval_targets)
f1[mode] = 2 * precision[mode] * recall[mode] / (precision[mode] + recall[mode])

In [None]:
# Type names that are allowed as predictions; every other vocabulary entry gets
# an additive score of -10000 so that it cannot become the row maximum.
wiki_type_names = ['City', 'VideoGame', 'Mountain', 'Writer', 'Lake', 'AdministrativeRegion', 'Book', 'Saint', 'Monarch', 'Bird', 'Plant', 'Currency', 'Company', 'Genre', 'Building', 'PoliticalParty', 'Language', 'Country', 'University', 'SportsTeam', 'RadioStation', 'Airport', 'Wrestler', 'Newspaper', 'Mammal', 'Mayor', 'AcademicJournal', 'Scientist', 'Continent', 'Film', 'BaseballPlayer']
non_wiki_types = [name for name in type_vocab if name not in wiki_type_names]
# From here on `wiki_types` holds vocabulary ids, not names.
wiki_types = {type_vocab[name] for name in wiki_type_names}
wiki_type_mask = torch.full((len(type_vocab),), -10000.0).to(device)
wiki_type_mask[sorted(wiki_types)] = 0

In [None]:
for t,i in type_vocab.items():
    print(t, per_type_f1[4][i],per_type_instance_num[i])

In [None]:
non_wiki_types

In [None]:
type_vocab

In [None]:
# Per-table accuracy on the Semtab results: average the score tensors collected
# for a table, restrict predictions via wiki_type_mask, and count the columns
# whose top-scoring type is a gold type.
errors = []  # ids of tables with at least one column that was not counted as correct
total_corr = 0
total_valid = 0
for table_id, result in per_table_result.items():
    prediction_scores, label_mask, label = result
    prediction_scores = torch.stack(prediction_scores, 0).mean(0)
    # Type 15 receives the larger of the scores of types 15 and 27.
    score_15, score_27 = prediction_scores[:, 15], prediction_scores[:, 27]
    prediction_scores[:, 15] = torch.where(score_15 > score_27, score_15, score_27)
    prediction_scores = prediction_scores + wiki_type_mask.unsqueeze(0)
    row_best = prediction_scores.max(-1, keepdim=True)[0]
    pred_acc = ((prediction_scores == row_best) * label).sum(-1)
    n_correct = pred_acc.sum().item()
    n_labeled = label_mask.sum().item()
    total_valid += n_labeled
    total_corr += n_correct
    if n_correct != n_labeled:
        errors.append(table_id)

In [None]:
total_corr

In [None]:
total_valid

In [None]:
errors

In [None]:
 for inspect_id in errors:
    print(inspect_id)
    prediction_scores, label_mask, label = per_table_result[inspect_id]
    prediction_scores = torch.stack(prediction_scores, 0).mean(0)
    for col_id in range(label.shape[0]):
        if label[col_id].sum().item()==0:
            continue
        display(id2type[label[col_id].nonzero().item()])
        display([[id2type[l],prediction_scores[col_id,l].item()] for l in prediction_scores[col_id].argsort().tolist()[::-1][:3]])

In [None]:
label[col_id]

In [None]:
per_table_result['Baseball_Hall_of_Fame_balloting,_2015#0'][0][0].argsort()

# PublicBI Dataset

In [None]:
# Imports and configuration for the Public BI experiments.
from torch.utils.data import Dataset
# NOTE: rebinds `tqdm`, replacing the tqdm.autonotebook variant imported by earlier cells.
from tqdm import tqdm
import pandas as pd
from os.path import join
import os
# Machine-specific settings, written into the process environment.
# NOTE(review): hard-coded absolute paths — adjust before running elsewhere.
os.environ["WORKING_DIR"] = "/home/sanonymous/semantic_data_lake"
os.environ["TYPENAME"] = "type78"
os.environ["PUBLIC_BI_BENCHMARK"] = "/ext/daten-wi/sanonymous/public_bi_benchmark/benchmark"
corpus = "public_bi"
data_dir= "data/public_bi/"
from model.transformers import BertTokenizer

# Directory with the "<corpus>_<TYPENAME>_valid.json" files read by PublicBiCTDataset.
valid_header_path = join(os.environ["WORKING_DIR"], "data", "extract", "out",
                         "valid_headers")

# Directory with the labeled / unlabeled / test split files read by PublicBiCTDataset.
labeled_unlabeled_test_split_path = join(os.environ["WORKING_DIR"], "data",
                                         "extract", "out",
                                         "labeled_unlabeled_test_split")

from sql_metadata import Parser

def getPublicBIColumnNames(domain, table):
    """Return the columns found in a Public BI table's SQL definition.

    Reads ``$PUBLIC_BI_BENCHMARK/<domain>/tables/<table>.table.sql`` and hands
    its content to ``sql_metadata.Parser``.

    Args:
        domain: name of the benchmark sub-directory.
        table: table name, without the ``.table.sql`` suffix.

    Returns:
        The value of ``Parser(sql).columns`` for that file.

    Raises:
        KeyError: if the ``PUBLIC_BI_BENCHMARK`` environment variable is unset.
        OSError: if the SQL file cannot be opened.
    """
    sql_path = join(os.environ["PUBLIC_BI_BENCHMARK"], domain, "tables", f"{table}.table.sql")
    # The context manager closes the handle even when read() raises.
    with open(sql_path, "r") as fd:
        sqlStmt = fd.read()
    return Parser(sqlStmt).columns

def load_public_bi_table(domain, tablename, cols, number_of_rows, random_state, with_header=True):
    """Load selected columns of a Public BI table into a DataFrame.

    Reads ``$PUBLIC_BI_BENCHMARK/<domain>/<tablename>.csv`` ("|"-separated, no
    header row, malformed lines skipped).

    Args:
        domain: name of the benchmark sub-directory.
        tablename: table name, without the ``.csv`` suffix.
        cols: positional indices of the columns to load.
        number_of_rows: currently unused (row sampling is disabled); kept so
            existing callers keep working.
        random_state: currently unused, see ``number_of_rows``.
        with_header: if True, rename the columns to the names returned by
            ``getPublicBIColumnNames``; otherwise the columns keep their
            positional indices as labels.

    Returns:
        pandas.DataFrame with the requested columns in file order.
    """
    csv_path = os.path.join(os.environ.get("PUBLIC_BI_BENCHMARK"), domain, tablename + ".csv")
    df = pd.read_csv(csv_path, sep="|", on_bad_lines="skip",
                     header=None, usecols=cols)
    if with_header:
        columnNames = getPublicBIColumnNames(domain, tablename)
        # With header=None the frame's labels are the positional indices, and
        # `usecols` returns them in file order regardless of the order of `cols`.
        # Map every label to its own name instead of zipping with `cols`, which
        # mislabeled the columns whenever `cols` was not sorted.
        df.columns = [columnNames[position] for position in df.columns]
    return df

class PublicBiCTDataset(Dataset):
    """Column-type dataset over Public BI tables (work in progress).

    ``__init__`` selects the (table_id, column) pairs of the requested split
    into ``self.df`` and loads their semantic types into ``self.valid_headers``.

    NOTE(review): nothing in this class assigns ``self.data`` or calls
    ``_preprocess``; ``__len__`` and ``__getitem__`` raise AttributeError unless
    ``data`` is set from outside.
    """

    def _preprocess(self):
        """Build the raw table records for the tables listed in ``self.df``.

        Each record is ``[table_id, pgTitle, pgEnt, secTitle, caption, headers,
        entities, type_annotations]``; only ``entities`` and
        ``type_annotations`` are filled, the other fields stay None.

        Returns:
            list of such records.
        """
        print("creating data...")
        data = []

        for index, table in tqdm(enumerate(self.df["table_id"].unique()), total=len(self.df["table_id"].unique())):
            # NOTE(review): stops after the first table — looks like a debugging limit.
            if index > 0:
                break
            table_id = None
            pgTitle = None
            pgEnt = None
            secTitle = None
            caption = None
            headers = None
            entities = []
            type_annotations = None

            # Column indices of this table, in ascending order.
            cols = self.df[self.df["table_id"] == table]["column"].tolist()
            cols.sort()
            # The domain is the part of the table id before the first "_".
            df_table = load_public_bi_table(table.split("_")[0], table, cols, 100, 2)

            # One list per column; each cell is [[row index, column index], [None, cell value]].
            for i_col, col in enumerate(df_table.columns):
                entities.append([[[row_i, i_col], [None,row_val]] for row_i, row_val in enumerate(df_table[col].tolist())])

            # assign semantic types to the column using valid_headers.json
            type_annotations = [[self.valid_headers[table][f"column_{col}"]["semanticType"]] for col in cols]
            data.append([table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations])

        print(f"{len(data)} tables loaded")
        # NOTE(review): `Pool`, `partial` and `process_single_CT` are not defined in
        # this cell and must come from elsewhere (e.g. a star import). The pool
        # iterates over `cols` (the column indices of the last table handled above,
        # not `data`), and `processed_cols` is never used — the raw `data` is returned.
        pool = Pool(processes=4)
        processed_cols = list(tqdm(pool.imap(partial(
            process_single_CT, config=self), cols, chunksize=1000), total=len(cols)))
        pool.close()
        return data

    def __init__(self, data_dir, entity_vocab, type_vocab, labeled_data_size, unlabeled_data_size, test_data_size, random_state, data_split_set="labeled", add_STEER_train_data=False, max_column=10, max_input_tok=500, max_length=[50, 10, 10], force_new=False, tokenizer=None):
        """Store the configuration and select the columns of the requested split.

        Args:
            data_dir: not used inside this method.
            entity_vocab: mapping whose values provide a ``'wiki_id'`` entry.
            type_vocab: type vocabulary; its length becomes ``self.type_num``.
            labeled_data_size, unlabeled_data_size, test_data_size, random_state:
                identify the split file
                ``<corpus>_<labeled>_<unlabeled>_<test>_<random_state>.json``.
            data_split_set: "labeled" or "test". Any other value leaves
                ``self.df`` unset.
            add_STEER_train_data: if True, append the generated training columns
                and replace their gold types with the generated labels.
            max_column, max_input_tok, force_new: stored on the instance.
            max_length: ``[title, header, cell]`` length limits.
            tokenizer: tokenizer to use; defaults to ``bert-base-uncased``.
        """
        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.force_new = force_new
        self.max_input_tok = max_input_tok
        self.max_title_length = max_length[0]
        self.max_header_length = max_length[1]
        self.max_cell_length = max_length[2]
        self.max_column = max_column
        self.entity_vocab = entity_vocab
        # Reverse lookup: wiki_id -> key in entity_vocab.
        self.entity_wikid2id = {
            self.entity_vocab[x]['wiki_id']: x for x in self.entity_vocab}
        self.type_vocab = type_vocab
        self.type_num = len(self.type_vocab)
        # STEER arguments
        self.labeled_data_size = labeled_data_size
        self.unlabeled_data_size = unlabeled_data_size
        self.test_data_size = test_data_size
        self.random_state = random_state
        self.data_split_set = data_split_set
        self.add_STEER_train_data = add_STEER_train_data

        # load the valid headers with real sem. types
        # (`corpus` and `valid_header_path` are notebook-level globals)
        valid_header_file = f"{corpus}_{os.environ['TYPENAME']}_valid.json"
        valid_headers = join(valid_header_path, valid_header_file)
        with open(valid_headers, "r") as file:
            self.valid_headers = json.load(file)

        ## if additional training data by STEER, then overwrite the gold labels with the given labels from STEER Labeling Framework
        def overwrite_labels_with_gen_train_labels(x):
            """Replace the stored semantic type of one column with the generated label of row ``x``."""
            self.valid_headers[x["table_id"]][f"column_{x['column']}"]["semanticType"] = x["predicted_semantic_type"]

        # load labeled data from labeled, unlabeled, test split file
        with open(join(labeled_unlabeled_test_split_path, f"{corpus}_{self.labeled_data_size}_{self.unlabeled_data_size}_{self.test_data_size}_{self.random_state}.json")) as f:
            self.labeled_unlabeled_test_split_file = json.load(f)
        # Split entries are parsed as "<table_id>+<prefix>_<column index>".
        if self.data_split_set == "labeled":
            self.df = pd.DataFrame({"table_id": [entry.split("+")[0] for entry in self.labeled_unlabeled_test_split_file[f"labeled{self.labeled_data_size}"]],
                                    "column": [int(entry.split("+")[1].split("_")[1]) for entry in self.labeled_unlabeled_test_split_file[f"labeled{self.labeled_data_size}"]]})
        elif self.data_split_set == "test":
            self.df = pd.DataFrame({"table_id": [entry.split("+")[0] for entry in self.labeled_unlabeled_test_split_file[f"test{self.test_data_size}"]],
                                    "column": [int(entry.split("+")[1].split("_")[1]) for entry in self.labeled_unlabeled_test_split_file[f"test{self.test_data_size}"]]})

        # load additional training data provided by STEER
        # NOTE(review): `gen_train_path` is not defined in this cell — confirm where it comes from.
        if self.add_STEER_train_data == True:
            self.df_gen_train_data = pd.read_csv(join(gen_train_path, f"{corpus}_gen_training_data_all_combined_maj_{labeled_data_size}_absolute_{test_data_size}_{random_state}.csv"), names=["table_id", "column", "dataset_id", "predicted_semantic_type"], header=0)
            # Keep the second "_"-separated token of the file name (without ".csv") as table id.
            self.df_gen_train_data["table_id"] = self.df_gen_train_data["table_id"].apply(lambda x: x.split(".csv")[0].split("_")[1])
            # "<prefix>_<index>" -> integer column index.
            self.df_gen_train_data["column"] = self.df_gen_train_data["column"].apply(lambda x: int(x.split("_")[1]))
            ## add additional train data to labeled columns
            self.df = pd.concat([self.df, self.df_gen_train_data[["table_id", "column"]]], ignore_index=True)

            ## overwrite gold labels with labels from STEER
            self.df_gen_train_data.apply(lambda x: overwrite_labels_with_gen_train_labels(x), axis=1)

    def __len__(self):
        """Number of records in ``self.data`` (which must have been assigned elsewhere)."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the record stored at ``index`` of ``self.data``."""
        return self.data[index]

In [None]:
df = load_public_bi_table("NYC", "NYC_2", [2, 5, 7, 16, 18, 19, 27], 100, 2)

In [None]:
df.dropna()

In [None]:
# load entity vocab from entity_vocab.txt
data_dir= "data/public_bi/"
entity_vocab = load_entity_vocab(data_dir, ignore_bad_title=True, min_ent_count=2)
type_vocab = load_type_vocab(data_dir)

# train_data = PublicBiCTDataset(data_dir, entity_vocab, type_vocab, labeled_data_size=1,
#                                unlabeled_data_size="absolute", test_data_size=20.0, random_state=2, data_split_set="labeled")


In [None]:
train_data.df.columns.tolist()

In [None]:
with open(os.path.join("data/wikitables_v2/", 'test.table_col_type.json'), 'r') as f:
    testset = json.load(f)[4]
table_id, pgTitle, pgEnt, secTitle, caption, headers, entities, type_annotations = testset

In [None]:
headers

In [None]:
# Scratch version of the per-table preprocessing, run directly on `train_data`.
# NOTE(review): `train_data` is only created in a later cell, so this cell fails
# on a fresh top-to-bottom run; `data` stays empty because nothing is appended.
data = []

def load_data(config):
    """Placeholder; not implemented yet."""
    pass

for index, table in enumerate(train_data.df["table_id"].unique()):
    # Only the first table is handled.
    if index > 0:
        break
    # Record fields; only `entities` and `type_annotations` are filled below.
    table_id = None
    pgTitle = None
    pgEnt = None
    secTitle = None
    caption = None
    headers = None
    entities = []
    type_annotations = None

    print(table)
    # Column indices of this table, in ascending order.
    cols = train_data.df[train_data.df["table_id"] == table]["column"].tolist()
    cols.sort()
    # The domain is the part of the table id before the first "_".
    df_table = load_public_bi_table(table.split("_")[0], table, cols, 100, 2)

    # One list per column; each cell is [[row index, column index], [None, cell value]].
    for i_col, col in enumerate(df_table.columns):
        entities.append([[[row_i, i_col], [None,row_val]] for row_i, row_val in enumerate(df_table[col].tolist())])

    # assign semantic types to the column using valid_headers.json
    # (here a flat list of types; PublicBiCTDataset._preprocess wraps each type in its own list)
    type_annotations = [train_data.valid_headers[table][f"column_{col}"]["semanticType"] for col in cols]

In [None]:
from data_loader.CT_PublicBI_data_loader import *
 
train_data = PublicBiCTDataset(data_dir, entity_vocab, type_vocab, labeled_data_size=1,
                                unlabeled_data_size="absolute", test_data_size=20.0, random_state=2, data_split_set="labeled")

In [None]:
###############################
## Evaluation TURL on Public BI
###############################

from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from sklearn.metrics import classification_report
import json

from data_loader.CT_PublicBI_data_loader import PublicBiCTDataset, CTLoader
# Point the notebook at the Public BI data and reload both vocabularies from it.
data_dir = "data/public_bi/"
entity_vocab = load_entity_vocab(data_dir, ignore_bad_title=True, min_ent_count=2)
type_vocab = load_type_vocab(data_dir)

# Split configuration handed to the dataset (sizes are passed as strings).
labeled_data_size, unlabeled_data_size, test_data_size = "43.19", "36.80", "20.01"
random_state = 2

# Test split that the evaluation loop below iterates over.
data = PublicBiCTDataset(
    data_dir,
    entity_vocab,
    type_vocab,
    labeled_data_size=labeled_data_size,
    unlabeled_data_size=unlabeled_data_size,
    test_data_size=test_data_size,
    random_state=random_state,
    data_split_set="test",
    max_input_tok=500,
    max_length=[50, 10, 10],
    force_new=False,
    tokenizer=None,
)


# Evaluate one fine-tuned checkpoint per (labeled, unlabeled) size pair on the
# dataset held in `data`, and write a classification report next to each
# checkpoint. Both sizes are only used to build the checkpoint directory name.
for labeled_data_size, unlabeled_data_size in [("43.19","36.80"),("50.84","29.14"),("55.06","24.93"),("59.27","20.72"),("61.24","18.75")]:
    # Sequential pass over the dataset, one item per batch.
    train_sampler = SequentialSampler(data)
    train_dataloader = CTLoader(data, sampler=train_sampler, batch_size=1, is_train=True)

    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    mode = 1
    print(mode)
    config_class, model_class, _ = MODEL_CLASSES['CT']
    config = config_class.from_pretrained(config_name)
    config.class_num = len(type_vocab)
    config.mode = mode
    model = model_class(config, is_simple=True)

    # NOTE(review): the directory name hard-codes "20.0" as test size while
    # `test_data_size` above is "20.01" -- confirm this matches the training runs.
    model_path = f"output/CT/v2/{mode}/model_STEER_{labeled_data_size}_{unlabeled_data_size}_20.0_{random_state}_PublicBI_sameTraindata"
    # Deserialize onto the CPU so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to `device` right after.
    checkpoint = torch.load(f"{model_path}/pytorch_model.bin", map_location="cpu")
    model.load_state_dict(checkpoint)
    model.to(device)
    model.eval()

    predicted_labels_all = []
    true_labels_all = []
    for step, batch in enumerate(epoch_iterator):
        # The first batch field (table ids) is kept as-is; the remaining
        # thirteen fields are moved to `device`.
        table_ids, *batch_fields = batch
        input_tok, input_tok_type, input_tok_pos, input_tok_mask, \
            input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask, \
            column_entity_mask, column_header_mask, labels_mask, labels = \
            [field.to(device) for field in batch_fields]

        if mode == 1:
            # Drop the leading token positions from the last axis of the
            # entity mask and run the model without any token inputs.
            input_ent_mask = input_ent_mask[:,:,input_tok_mask.shape[1]:]
            input_tok = None
            input_tok_type = None
            input_tok_pos = None
            input_tok_mask = None

        with torch.no_grad():
            outputs = model(input_tok, input_tok_type, input_tok_pos, input_tok_mask,\
                input_ent_text, input_ent_text_length, input_ent, input_ent_type, input_ent_mask, column_entity_mask, column_header_mask, labels_mask, labels)

        # outputs[0] is the loss (not needed here); outputs[1] holds the scores.
        prediction_scores = outputs[1]
        # Predicted class per row = position of the highest score.
        predicted_labels_all.extend(
            prediction_scores.view(-1, config.class_num).argmax(-1).tolist()
        )
        # Gold class per row = first position holding 1.0. `list.index` is kept
        # on purpose: a row without any positive label raises instead of being
        # silently mapped to class 0.
        true_labels = labels.view(-1, config.class_num).tolist()
        true_labels_all.extend([true_label.index(1.0) for true_label in true_labels])

    class_report = classification_report(true_labels_all, predicted_labels_all, output_dict=True)
    with open(f"{model_path}/classification_report.json", "w") as outfile:
        json.dump(class_report, outfile)

# Combine results of multiple runs with different random states

In [None]:
import os
from os.path import join
from dotenv import load_dotenv
load_dotenv(override=True)
import pandas as pd
import json
from sklearn.metrics import classification_report
import numpy as np

# Aggregate the classification reports of several random-state runs into one
# CSV per labeled-data size, with per-row mean / std / var over the runs.
add_comment = "PublicBI"

# Row-name prefix -> key inside a classification_report "<kind> avg" entry.
report_fields = {
    "f1-scores": "f1-score",
    "precisions": "precision",
    "recalls": "recall",
    "supports": "support",
}
average_kinds = ["macro", "weighted"]

for labeled_data_size in [1,2,3,4,5]:
    path = join("output", "CT", "v2", "1",
        f"model_STEER_{labeled_data_size}_absolute_20.0"
    )
    # One list of per-run values for every "<prefix>_<average kind>" row,
    # e.g. "f1-scores_macro"; insertion order defines the CSV row order.
    scores = {
        f"{prefix}_{avg_kind}": []
        for avg_kind in average_kinds
        for prefix in report_fields
    }

    for random_state in [1,2,3,4,5]:
        with open(join(path+f"_{random_state}_{add_comment}", "classification_report.json" ), "r") as f:
            current_class_report = json.load(f)
        # Named `avg_kind` rather than `metric` so the `metric` module imported
        # at the top of the notebook is not overwritten by a string.
        for avg_kind in average_kinds:
            averaged = current_class_report[f"{avg_kind} avg"]
            for prefix, field in report_fields.items():
                scores[f"{prefix}_{avg_kind}"].append(averaged[field])

    df_scores = pd.DataFrame(np.array(list(scores.values())), index=list(scores.keys()))

    # Remember the per-run columns before adding any derived column, so that
    # std and var are computed over the runs only and not over the freshly
    # appended "mean" / "std" columns. std/var use the pandas default (ddof=1).
    run_columns = df_scores.columns.tolist()
    df_scores["mean"] = df_scores[run_columns].mean(axis=1)
    df_scores["std"] = df_scores[run_columns].std(axis=1)
    df_scores["var"] = df_scores[run_columns].var(axis=1)

    # NOTE(review): no separator is inserted between `path` and `add_comment`
    # in the file name; kept unchanged so existing consumers still find it.
    df_scores.to_csv(path+f"{add_comment}_mean.csv")
