## Data Creation

Add annotations to the original Spider training data.

In [7]:
# Folder that every file name below is joined onto.
SPIDER_DATA_FOLDER = "spider_data"

# Input files: the training examples and the exported annotation table.
ANNOTATION_FILENAME = "annotations_export.csv"
TRAIN_JSON_FILENAME = "train_spider.json"

# Output files: the full annotated training set, plus the file-name prefix
# used for the per-category exports ("<prefix><label>.json").
EXPORT_PER_CATEGORY_PREFIX = "train_spider."
EXPORT_FILENAME = "train_spider.annotated.json"

# Cache files for the sentence embeddings of the original and the generalized
# questions.
ENCODED_GENERALIZED_FILENAME = "train_spider.generalized.sbert.npy"
ENCODED_ORIGINAL_FILENAME = "train_spider.questions.sbert.npy"

In [8]:
from os import path
import json

Load original training data

In [9]:
# Read the training examples into `train_data` (iterated later as a sequence of
# dicts that each carry a 'question' key).
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open(path.join(SPIDER_DATA_FOLDER, TRAIN_JSON_FILENAME), "r", encoding="utf-8") as train_json:
    train_data = json.load(train_json)

Load annotations

In [10]:
# Category names, in the same order as the leading flag columns of each row in
# the annotation file.
annotation_labels = ['general', 'domain-specific', 'specialized-terms', 'errors', 'values', 'complex', 'synonyms', 'bad-generalization']
label_count = len(annotation_labels)

# One (label, entries) bucket per category; the buckets are filled when the
# annotations are merged into the training data.
extracted_entries = [(title, list()) for title in annotation_labels]

# Maps original question -> (flag columns, generalized question).
annotations = {}
last_labels = []

# Column layout of a ';'-separated row, as consumed below:
#   [0, label_count)    one flag per category
#   label_count         not read here
#   label_count + 1     generalized question
#   label_count + 2     original question
with open(path.join(SPIDER_DATA_FOLDER, ANNOTATION_FILENAME), 'r') as annotations_file:
    for line in annotations_file:
        # Drop the line terminator first: otherwise it stays attached to the
        # last field of the row and a question stored there can never be
        # matched against the training data.
        line = line.rstrip('\r\n')
        if not line.strip():
            # Blank lines (e.g. at the end of the file) carry no annotation.
            continue
        line_split = line.split(';')
        labels = line_split[:label_count]
        generalized_query = line_split[label_count+1]
        original_query = line_split[label_count+2]
        # A row whose first flag column is empty reuses the flags of the
        # previous row.
        if labels[0] == "":
            labels = last_labels
        last_labels = labels
        annotations[original_query] = (labels, generalized_query)

Add annotations

In [11]:
# Parallel lists, one item per training entry, consumed by the embedding cell.
original_questions = []
generalized_questions = []

# Empty the per-category buckets so that re-running this cell does not append
# every annotated entry a second time.
for _label, bucket in extracted_entries:
    bucket.clear()

# NOTE: the embedding cell loads its cache files as-is when they exist; delete
# any cache that was written from differently built question lists.
for entry in train_data:
    annotation = annotations.get(entry['question'], None)
    if annotation is not None:
        labels, generalized_query = annotation
        # Indices of the categories flagged with '1' for this question.
        active_indices = [i for i, label in enumerate(labels) if label == '1']
        entry['generalisation_categories'] = [annotation_labels[i] for i in active_indices] # TODO Short name instead of label id?
        entry['generalized_question'] = generalized_query

        # File the entry under every category it belongs to.
        for i in active_indices:
            extracted_entries[i][1].append(entry)
    else:
        # Entries without an annotation keep their question unchanged.
        entry['generalisation_categories'] = [] # TODO Short name instead of label id?
        entry['generalized_question'] = entry['question']

    original_questions.append(entry['question'])
    # The generalized question goes into its own list; appending it to
    # `original_questions` would leave `generalized_questions` empty and mix
    # both kinds of question in one list.
    generalized_questions.append(entry['generalized_question'])

Export annotated data

In [12]:
# Write the complete training set, including the added annotation fields, to a
# single JSON file.
annotated_export_path = path.join(SPIDER_DATA_FOLDER, EXPORT_FILENAME)
with open(annotated_export_path, "w") as out_json:
    json.dump(train_data, out_json, indent=4)

# Write one more JSON file per category, holding only the entries collected in
# that category's bucket.
for (label, extracted_list) in extracted_entries:
    category_export_path = path.join(SPIDER_DATA_FOLDER, f"{EXPORT_PER_CATEGORY_PREFIX}{label}.json")
    with open(category_export_path, "w") as out_json:
        json.dump(extracted_list, out_json, indent=4)

Encode the questions using Sentence-BERT

In [13]:
from sentence_transformers import SentenceTransformer

# Sentence embedding model; its `encode` method is called in the next cell to
# embed the original and the generalized questions.
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
import numpy as np

# Locations of the two embedding cache files.
encoded_file_original_full_path = path.join(SPIDER_DATA_FOLDER, ENCODED_ORIGINAL_FILENAME)
encoded_file_generalized_full_path = path.join(SPIDER_DATA_FOLDER, ENCODED_GENERALIZED_FILENAME)

# The cache is only used when both files are present; if either one is missing
# both sets of embeddings are recomputed and saved again.
embedding_cache_complete = (
    path.isfile(encoded_file_original_full_path)
    and path.isfile(encoded_file_generalized_full_path)
)

if not embedding_cache_complete:
    # Encode and store the original questions first, then the generalized ones.
    original_embeddings = embedder.encode(original_questions)
    np.save(encoded_file_original_full_path, original_embeddings)

    generalized_embeddings = embedder.encode(generalized_questions)
    np.save(encoded_file_generalized_full_path, generalized_embeddings)
else:
    # Reuse the embeddings stored by an earlier run.
    original_embeddings = np.load(encoded_file_original_full_path)
    generalized_embeddings = np.load(encoded_file_generalized_full_path)