# 1. Subject augment

In [1]:
import json
import os
import sys

import spacy
from spacy.matcher import Matcher

# !python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [14]:
def subject_aug(train_path):
    """Prefix each query sentence with its first noun chunk and save the result.

    Reads the JSON file at ``train_path``: a mapping from track id to a record
    whose ``"nl"`` entry is a list of sentences. Every sentence is parsed with
    the module-level spaCy pipeline ``nlp`` and the text of its first noun
    chunk is prepended as a separate sentence (``"<chunk>. <sentence>"``).
    One extra entry that joins all extracted chunks (``"<c1>. <c2>. <c3>."``)
    is appended to each ``"nl"`` list.

    A sentence without any noun chunk is left unchanged and contributes
    nothing to the extra entry.

    Args:
        train_path: Path of the input JSON file. The result is written next
            to it with ``_subjectaug`` inserted before the file extension, so
            the input file itself is never overwritten.
    """
    with open(train_path) as f:
        train = json.load(f)

    for record in train.values():
        sentences = record["nl"]
        subjects = []
        for i, text in enumerate(sentences):
            # First noun chunk of the sentence, or None when spaCy finds none
            # (avoids reusing the chunk of a previous sentence).
            chunk = next(iter(nlp(text).noun_chunks), None)
            if chunk is None:
                continue
            subjects.append(chunk.text + '.')
            sentences[i] = chunk.text + '. ' + text
        # Joining works for any number of sentences, not only three.
        sentences.append(' '.join(subjects))

    # splitext keeps the output distinct from the input even when the path
    # does not contain ".json".
    root, ext = os.path.splitext(train_path)
    with open(root + "_subjectaug" + ext, "w") as f:
        json.dump(train, f, indent=5)

# Run the subject augmentation on the test queries; the result is written next
# to the input as ../data/test_queries_subjectaug.json.
subject_aug('../data/test_queries.json')

# 2. Motion augment

In [4]:
# Motion phrases: one or more verbs followed by a direction word, or a bare
# "stop" / "stops".
matcher = Matcher(nlp.vocab)
pattern = [
    [{"POS": "VERB", "op": "+"}, {"LOWER": direction}]
    for direction in ("straight", "left", "right", "ahead")
] + [
    [{"LOWER": halt}]
    for halt in ("stop", "stops")
]
matcher.add("motion-chunks", pattern)

def motion_aug(train_path):
    """Add subject and motion phrases to each query sentence and save the result.

    Reads the JSON file at ``train_path``: a mapping from track id to a record
    whose ``"nl"`` entry is a list of sentences. Each sentence is parsed with
    the module-level spaCy pipeline ``nlp`` and rewritten as
    ``"<first noun chunk>. <sentence> <match 1>. <match 2>. ..."``, where the
    matches are the spans returned by the module-level ``matcher`` for that
    sentence. One extra entry that joins all extracted noun chunks
    (``"<c1>. <c2>. <c3>."``) is appended to each ``"nl"`` list.

    A sentence without any noun chunk gets no prefix (matched phrases are
    still appended) and contributes nothing to the extra entry.

    Args:
        train_path: Path of the input JSON file. The result is written next
            to it with ``_motionaug`` inserted before the file extension, so
            the input file itself is never overwritten.
    """
    with open(train_path) as f:
        train = json.load(f)

    for record in train.values():
        sentences = record["nl"]
        subjects = []
        for i, text in enumerate(sentences):
            doc = nlp(text)
            augmented = text

            # car aug: first noun chunk, or None when spaCy finds none
            # (avoids reusing the chunk of a previous sentence).
            chunk = next(iter(doc.noun_chunks), None)
            if chunk is not None:
                subjects.append(chunk.text + '.')
                augmented = chunk.text + '. ' + augmented

            # motion aug: append every span the matcher reports, in order.
            for _match_id, start, end in matcher(doc):
                augmented += ' ' + doc[start:end].text + '.'

            sentences[i] = augmented

        # Joining works for any number of sentences, not only three.
        sentences.append(' '.join(subjects))

    # splitext keeps the output distinct from the input even when the path
    # does not contain ".json".
    root, ext = os.path.splitext(train_path)
    with open(root + "_motionaug" + ext, "w") as f:
        json.dump(train, f, indent=4)

# Run the motion augmentation on the test queries; the result is written next
# to the input as ../data/test_queries_motionaug.json. The function reads the
# module-level `matcher`, so it must hold the motion patterns at call time.
motion_aug('../data/test_queries.json')

# 3. Localization augment

In [16]:
# Location phrases: one or more determiners followed by "intersection".
matcher = Matcher(nlp.vocab)
pattern = [[{"POS": "DET", "op": "+"}, {"LOWER": "intersection"}]]
matcher.add("location-chunks", pattern)

# Earlier runs targeted the train and val splits:
# train_path = 'data2021/train.json'
# train_path = 'data2021/val.json'

# Dataset files of the 2022 split (not referenced by the code visible in this notebook).
path = [
    'data2022/train-tracks.json',
    'data2022/train.json',
    'data2022/val.json',
    'data2022/test-queries.json',
]


def localization_aug(train_path):
    """Add subject and location phrases to each query sentence and save the result.

    Reads the JSON file at ``train_path``: a mapping from track id to a record
    whose ``"nl"`` entry is a list of sentences. Each sentence is parsed with
    the module-level spaCy pipeline ``nlp`` and rewritten as
    ``"<first noun chunk>. <sentence> <match 1>. <match 2>. ..."``, where the
    matches are the spans returned by the module-level ``matcher`` for that
    sentence. A matched phrase is only appended to the sentence it occurs in;
    it is not broadcast to the other sentences of the track. One extra entry
    that joins all extracted noun chunks (``"<c1>. <c2>. <c3>."``) is appended
    to each ``"nl"`` list.

    A sentence without any noun chunk gets no prefix (matched phrases are
    still appended) and contributes nothing to the extra entry.

    Args:
        train_path: Path of the input JSON file. The result is written next
            to it with ``_locationaug`` inserted before the file extension, so
            the input file itself is never overwritten.
    """
    with open(train_path) as f:
        train = json.load(f)

    for record in train.values():
        sentences = record["nl"]
        subjects = []
        for i, text in enumerate(sentences):
            doc = nlp(text)
            augmented = text

            # car aug: first noun chunk, or None when spaCy finds none
            # (avoids reusing the chunk of a previous sentence).
            chunk = next(iter(doc.noun_chunks), None)
            if chunk is not None:
                subjects.append(chunk.text + '.')
                augmented = chunk.text + '. ' + augmented

            # location aug: append every span the matcher reports, in order.
            for _match_id, start, end in matcher(doc):
                augmented += ' ' + doc[start:end].text + '.'

            sentences[i] = augmented

        # Joining works for any number of sentences, not only three.
        sentences.append(' '.join(subjects))

    # splitext keeps the output distinct from the input even when the path
    # does not contain ".json".
    root, ext = os.path.splitext(train_path)
    with open(root + "_locationaug" + ext, "w") as f:
        json.dump(train, f, indent=4)

# Run the localization augmentation on the test queries; the result is written
# next to the input as ../data/test_queries_locationaug.json. The function reads
# the module-level `matcher`, so it must hold the location pattern at call time.
localization_aug('../data/test_queries.json')

# 5. Subject + motion + localization augment

In [3]:
# A single matcher that carries both the motion and the location patterns.
matcher = Matcher(nlp.vocab)

# motion: one or more verbs followed by a direction word, or a bare "stop" / "stops"
pattern = [
    [{"POS": "VERB", "op": "+"}, {"LOWER": direction}]
    for direction in ("straight", "left", "right", "ahead")
] + [
    [{"LOWER": halt}]
    for halt in ("stop", "stops")
]
matcher.add("motion-chunks", pattern)

# localization: one or more determiners followed by "intersection"
pattern = [[{"POS": "DET", "op": "+"}, {"LOWER": "intersection"}]]
matcher.add("location-chunks", pattern)


# subject + motion + localization augment
def triple_aug(train_path):
    """Add subject plus all matcher phrases to each query sentence and save the result.

    Reads the JSON file at ``train_path``: a mapping from track id to a record
    whose ``"nl"`` entry is a list of sentences. Each sentence is parsed with
    the module-level spaCy pipeline ``nlp`` and rewritten as
    ``"<first noun chunk>. <sentence> <match 1>. <match 2>. ..."``, where the
    matches are all spans returned by the module-level ``matcher`` for that
    sentence (whatever patterns are registered on it at call time). One extra
    entry that joins all extracted noun chunks (``"<c1>. <c2>. <c3>."``) is
    appended to each ``"nl"`` list.

    A sentence without any noun chunk gets no prefix (matched phrases are
    still appended) and contributes nothing to the extra entry.

    Args:
        train_path: Path of the input JSON file. The result is written next
            to it with ``_tripleaug`` inserted before the file extension, so
            the input file itself is never overwritten.
    """
    with open(train_path) as f:
        train = json.load(f)

    for record in train.values():
        sentences = record["nl"]
        subjects = []
        for i, text in enumerate(sentences):
            doc = nlp(text)
            augmented = text

            # car aug: first noun chunk, or None when spaCy finds none
            # (avoids reusing the chunk of a previous sentence).
            chunk = next(iter(doc.noun_chunks), None)
            if chunk is not None:
                subjects.append(chunk.text + '.')
                augmented = chunk.text + '. ' + augmented

            # matcher aug: append every span the matcher reports, in order.
            for _match_id, start, end in matcher(doc):
                augmented += ' ' + doc[start:end].text + '.'

            sentences[i] = augmented

        # Joining works for any number of sentences, not only three.
        sentences.append(' '.join(subjects))

    # splitext keeps the output distinct from the input even when the path
    # does not contain ".json".
    root, ext = os.path.splitext(train_path)
    with open(root + "_tripleaug" + ext, "w") as f:
        json.dump(train, f, indent=4)

# Run the combined augmentation on the test queries; the result is written next
# to the input as ../data/test_queries_tripleaug.json. The function reads the
# module-level `matcher`, so it must hold both pattern sets at call time.
triple_aug('../data/test_queries.json')

# 4. RoBERTa embedding

In [8]:
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

# Load the pretrained "roberta-base" encoder and its matching tokenizer.
# The bare RobertaModel has no language-model head, so the checkpoint's
# lm_head.* weights are reported as unused in the message printed below.
model = RobertaModel.from_pretrained("roberta-base")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Example query; the raw string is handed straight to the RoBERTa tokenizer.
sentence = 'A blue pickup truck keeps straight at an intersection.'

# encode() returns a flat list of token ids; unsqueeze(0) adds the batch
# dimension (alternative source of the ids: tokenizer(sentence)['input_ids']).
input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
print(input_ids)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    features = model(input_ids)

# Final-layer embedding of every token, followed by its shape
# (batch, tokens, hidden size).
print(features.last_hidden_state, features.last_hidden_state.shape, sep="\n")

tensor([[   0,  250, 2440, 8517, 2484, 4719, 1359,   23,   41, 8088,    4,    2]])
tensor([[[-0.0862,  0.0928, -0.0275,  ..., -0.0733, -0.0166, -0.0590],
         [-0.0826,  0.1099, -0.1222,  ..., -0.1203, -0.0812,  0.1023],
         [-0.1330, -0.0406, -0.1798,  ...,  0.4730, -0.1230,  0.0430],
         ...,
         [-0.0048,  0.1972,  0.0708,  ...,  0.0869,  0.0936, -0.0588],
         [-0.0739,  0.0877, -0.0582,  ..., -0.0979, -0.0127, -0.0943],
         [-0.0942,  0.1347, -0.0880,  ...,  0.2339, -0.0330, -0.0013]]])
torch.Size([1, 12, 768])
