In [1]:
import os
import torch
import numpy as np
import pandas as pd

from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, EvalPrediction

2022-06-09 03:41:40.331775: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2022-06-09 03:41:40.331813: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def load_dataset(filename: str, split: str = "train") -> pd.DataFrame:
    """Load one split of a line-aligned dataset into a single DataFrame.

    Three files are read for the split -- ``seq.in``, ``seq.out`` and
    ``label`` -- and placed side by side, one row per line.

    Args:
        filename: ``%``-style template with two ``%s`` placeholders; it is
            formatted with ``(split, <file part>)`` to build each path.
        split: Value substituted for the first placeholder.

    Returns:
        DataFrame with the columns ``text`` (from ``seq.in``), ``ne`` (from
        ``seq.out``) and ``intent`` (from ``label``). If the files do not have
        the same number of non-empty lines, the shorter columns are padded
        with NaN by ``pd.concat``.

    Raises:
        OSError: If one of the three files cannot be opened.
    """

    def read_column(part: str, column: str) -> pd.Series:
        # Read the raw lines ourselves instead of pd.read_csv(sep="\n"):
        # recent pandas releases reject a newline separator with ValueError,
        # and CSV parsing would also reinterpret quotes and turn tokens such
        # as "null" into NaN, although every line here is one opaque string.
        with open(filename % (split, part), encoding="utf-8") as handle:
            lines = [line.rstrip("\n") for line in handle]
        # Empty lines are dropped, mirroring read_csv's skip_blank_lines default.
        return pd.Series([line for line in lines if line], name=column, dtype=object)

    return pd.concat(
        [
            read_column("seq.in", "text"),
            read_column("seq.out", "ne"),
            read_column("label", "intent"),
        ],
        axis=1,
    )

In [3]:
# Read the train / valid / test splits of the selected corpus.
# The template keeps two %s placeholders that load_dataset fills in
# with the split name and the file part.
usecase = "atis"
filename_template = f"../{usecase}/%s/%s"

train_df, valid_df, test_df = (
    load_dataset(filename_template, split_name)
    for split_name in ("train", "valid", "test")
)

In [4]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained to build the tokenizer.
model_name = "microsoft/deberta-large"

In [5]:
# Tokenizer for `model_name`; align_tokens reads this module-level object
# when it splits a sentence into sub-word tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# Align named-entity tags with sub-word tokens
def align_tokens(
    text: str = "i want to fly from baltimore to dallas round trip",
    ne_text: str = "O O O O O B-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip",
    test: bool = False,
    tokenize=None,
) -> "str | pd.DataFrame":
    """Expand word-level named-entity tags to one tag per sub-word token.

    A token opens a new word when it is the first token or starts with the
    "Ġ" marker; it then takes the next tag from ``ne_text``. Any other token
    continues the current word: it gets "O" if that word's tag is "O",
    otherwise the word's tag with its first character replaced by "I".

    Args:
        text: Sentence to tokenize.
        ne_text: Whitespace-separated tags, one per word of ``text``.
        test: If True, return a debugging table instead of the tag string.
        tokenize: Optional callable mapping ``text`` to a list of token
            strings. Defaults to the module-level ``tokenizer.tokenize``.

    Returns:
        The aligned tags joined by single spaces, or -- when ``test`` is
        True -- a transposed DataFrame with the rows ``tokens``,
        ``token_pos`` (0 = word start, 1 = continuation) and ``ne_tags``.

    Raises:
        IndexError: If the tokens open more words than ``ne_text`` has tags.
    """
    if tokenize is None:
        # Looked up at call time so the notebook-level tokenizer is used by default.
        tokenize = tokenizer.tokenize

    tokens = tokenize(text)
    # split() without an argument ignores doubled or trailing spaces, which
    # would otherwise produce empty-string tags.
    word_tags = ne_text.split()

    word_index = 0
    token_pos = []
    ne_tags = []

    for i, token in enumerate(tokens):
        if i == 0 or token.startswith("Ġ"):
            if word_index >= len(word_tags):
                raise IndexError(
                    f"text has more words than the {len(word_tags)} tag(s) in ne_text: {text!r}"
                )
            token_pos.append(0)
            ne_tags.append(word_tags[word_index])
            word_index += 1
        else:
            token_pos.append(1)
            # word_index already points past the current word, hence the -1.
            current_tag = word_tags[word_index - 1]
            ne_tags.append("O" if current_tag == "O" else "I" + current_tag[1:])

    if test:
        return pd.DataFrame({"tokens": tokens, "token_pos": token_pos, "ne_tags": ne_tags}).transpose()

    return " ".join(ne_tags)

In [9]:
# Test aligned results: with test=True and no other arguments, align_tokens
# runs on its built-in default sentence and returns the transposed debug table.
# NOTE(review): the saved output for this cell shows a "." token that the
# default text does not contain -- the output looks stale; re-run to confirm.
align_tokens(test=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
tokens,i,Ġwant,.,Ġto,Ġfly,Ġfrom,Ġb,alt,imore,Ġto,Ġd,allas,Ġround,Ġtrip
token_pos,0,0,1,0,0,0,0,1,1,0,0,1,0,0
ne_tags,O,O,O,O,O,O,B-fromloc.city_name,I-fromloc.city_name,I-fromloc.city_name,O,B-toloc.city_name,I-toloc.city_name,B-round_trip,I-round_trip


In [292]:
# Test aligned results on real data: entry 10 of the validation split,
# shown as the debug table (tokens / token_pos / ne_tags).
align_tokens(text=valid_df["text"][10], ne_text=valid_df["ne"][10], test=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
tokens,fl,ights,Ġfrom,Ġont,ario,Ġto,Ġfl,or,ida
token_pos,0,1,0,0,1,0,0,1,1
ne_tags,O,O,O,B-fromloc.city_name,I-fromloc.city_name,O,B-toloc.state_name,I-toloc.state_name,I-toloc.state_name


In [293]:
# Add a "ne_tokens" column to every split: the word-level tags in "ne"
# expanded to one tag per sub-word token of "text".
train_df["ne_tokens"] = train_df.apply(
    lambda row: align_tokens(text=row["text"], ne_text=row["ne"]),
    axis=1,
)
valid_df["ne_tokens"] = valid_df.apply(
    lambda row: align_tokens(text=row["text"], ne_text=row["ne"]),
    axis=1,
)
test_df["ne_tokens"] = test_df.apply(
    lambda row: align_tokens(text=row["text"], ne_text=row["ne"]),
    axis=1,
)

In [295]:
# Save dataset: one CSV per split, versioned through `suffix`.
suffix = "1_1"
output = f"../{usecase}/processed/"

# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail when the directory already exists, unlike an exists()/mkdir() pair.
os.makedirs(output, exist_ok=True)

train_df.to_csv(os.path.join(output, f"ner_train_v{suffix}.csv"), index=False)
valid_df.to_csv(os.path.join(output, f"ner_valid_v{suffix}.csv"), index=False)
test_df.to_csv(os.path.join(output, f"ner_test_v{suffix}.csv"), index=False)