In [2]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import re
from datasets import load_dataset
from datasets import Dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, LongformerForTokenClassification, LongformerTokenizerFast
from transformers import DataCollatorForTokenClassification
from datasets import load_metric

In [9]:
# Reference dataset plus the two checkpoint identifiers used in this notebook.
raw_datasets = load_dataset("conll2003")
model_checkpoint = "bert-base-cased"
longformer_checkpoint = "allenai/longformer-base-4096"

# The Longformer tokenizer is the active one. It was being compared against
# the BERT tokenizer, which can be restored with:
#   tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(
    longformer_checkpoint,
    add_prefix_space=True,
)

# Ids read from Error_Ids.csv; the "Unnamed: 0" column is discarded
# (presumably an index column written by an earlier to_csv -- TODO confirm).
df_errorIds = pd.read_csv("Error_Ids.csv").drop(columns=["Unnamed: 0"])

metric = load_metric("seqeval")

# Maps each label name to its integer id.
tokenDict = {
    "Lead": 0,
    "Position": 1,
    "Evidence": 2,
    "Claim": 3,
    "Concluding Statement": 4,
    "Counterclaim": 5,
    "Rebuttal": 6,
}

Reusing dataset conll2003 (C:\Users\Brad\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 49.68it/s]


In [7]:
# helper functions

def fileToArray(file_path):
    """Read a text file and return its words joined into a single string.

    Zero-length lines are filtered out, each remaining line is decoded and
    split on whitespace, and all resulting words are joined with one space.

    Note: despite the name, the return value is a ``str``, not a list/array.

    Args:
        file_path: Path handed to ``tf.data.TextLineDataset``.

    Returns:
        str: The whitespace-normalized text of the file.
    """
    # Keep only lines whose string length is non-zero.
    non_empty_lines = tf.data.TextLineDataset(file_path).filter(
        lambda x: tf.cast(tf.strings.length(x), bool)
    )

    words = []
    for raw_line in non_empty_lines.as_numpy_iterator():
        # Each element is a bytes object; decode before splitting into words.
        words.extend(raw_line.decode().split())

    return ' '.join(words)

def calc_word_indices(full_text, discourse_start, discourse_end):
    """Map a character span of ``full_text`` to whitespace-word indices.

    The first index is the number of whitespace-separated words in
    ``full_text[:discourse_start]``; the number of indices is the number of
    words in ``full_text[discourse_start:discourse_end]``.

    Args:
        full_text: The complete document text.
        discourse_start: Character offset where the span begins.
        discourse_end: Character offset where the span ends (exclusive).

    Returns:
        list[int]: Consecutive word indices for the span. An empty list is
        returned when the span contains no words (e.g. an empty or
        whitespace-only slice); previously this case raised ``IndexError``.
    """
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())

    # Nothing to index: avoid touching output[-1] on an empty list.
    if token_len == 0:
        return []

    output = list(range(start_index, start_index + token_len))

    # If the span starts in the middle of a word, that word is counted both in
    # the prefix and in the span, so the range can run one past the last word
    # of the document; drop the overshooting index.
    # NOTE(review): in that mid-word case the remaining indices are still
    # shifted by one relative to the words actually covered -- confirm whether
    # callers always pass word-aligned offsets.
    if output[-1] >= len(full_text.split()):
        output.pop()
    return output

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer token.

    Args:
        labels: Sequence of integer labels, indexed by word position.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for a special token.

    Returns:
        list[int]: One label per entry of ``word_ids``. Special tokens get
        -100. The first token of a word gets that word's label. Later tokens
        of the same word get the word's label, incremented by one when the
        label is odd.

    NOTE(review): the odd -> +1 step assumes a label scheme in which odd ids
    are "B-" tags and the following even id is the matching "I-" tag; verify
    that the label ids fed to this function follow that convention.
    """
    aligned = []
    previous_word = None

    for word_id in word_ids:
        # Special token: always masked, and it ends any running word.
        if word_id is None:
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]
        if word_id == previous_word and word_label % 2 == 1:
            # Continuation token of a word carrying an odd label.
            word_label += 1

        previous_word = word_id
        aligned.append(word_label)

    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Uses the module-level ``tokenizer`` (with truncation enabled) on
    ``examples["tokens"]`` and aligns ``examples["ner_tags"]`` to the
    resulting tokens via ``align_labels_with_tokens``.

    Args:
        examples: Batch mapping with a "tokens" entry (lists of words) and a
            "ner_tags" entry (lists of word-level labels).

    Returns:
        The tokenizer output with an added "labels" entry holding one aligned
        label list per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    # word_ids(idx) gives the token -> word mapping for the idx-th example.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(idx))
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

def dfTags_to_ints(df):
    """Convert every entry of the "ner_tags" column to a list of ints.

    The column is read by name rather than by position (the previous
    ``row[2]`` lookup depended on column order and on positional indexing of
    a label-indexed Series, which pandas has deprecated).

    Args:
        df: DataFrame whose "ner_tags" column holds iterables of values that
            ``int()`` accepts (e.g. lists of digit strings).

    Returns:
        The same DataFrame object, modified in place, with "ner_tags"
        replaced by lists of ints.

    Raises:
        ValueError: If a tag cannot be converted with ``int()``.
    """
    df["ner_tags"] = df["ner_tags"].apply(lambda tags: [int(tag) for tag in tags])
    return df

def csv_to_df(fileName):
    """Placeholder that is not implemented yet.

    Args:
        fileName: Currently ignored.

    Returns:
        None, always.
    """
    # TODO: implement; nothing in the visible notebook calls this helper.

In [5]:
# Ids that must be excluded from the training data.
errorList = list(df_errorIds["id"])
fileName = "trainHugging4.csv"

# `error_bad_lines=False` was removed in pandas 2.0. `on_bad_lines="warn"`
# (pandas >= 1.3) keeps the old behavior: malformed lines are skipped with a
# warning instead of raising.
df = pd.read_csv(fileName, on_bad_lines="warn")

# Both columns are stored as whitespace-separated strings; split into lists.
df['tokens'] = df['tokens'].apply(lambda a: a.split())
df['ner_tags'] = df['ner_tags'].apply(lambda a: a.split())
df = df.drop(columns=['Unnamed: 0'])

# Index labels of the rows whose id is in the error list. Ids that do not
# occur in the frame are simply ignored, so no try/except is needed.
errorList_idx = df.index[df["id"].isin(errorList)].tolist()
# Keyword form: the positional axis argument was removed in pandas 2.0.
df = df.drop(index=errorList_idx)
df = dfTags_to_ints(df)

newDataset = Dataset.from_pandas(df)

In [20]:
# Collect the ids of examples whose labels cannot be aligned with their
# tokens (e.g. the label list is shorter than the word list).
longError = []
for i in newDataset:
    inputs = tokenizer(i["tokens"], is_split_into_words=True)
    labels = i["ner_tags"]
    word_ids = inputs.word_ids()
    try:
        align_labels_with_tokens(labels, word_ids)
    except Exception:
        # Best-effort scan: record the failing id and keep going. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so the loop can still be interrupted.
        longError.append(i["id"])

In [21]:
# Combine the ids that failed alignment with the previously known error ids
# and write them out as a single CSV.
bigError = [*longError, *errorList]
# NOTE: `df` is rebound here to the error-id frame, replacing the training frame.
df = pd.DataFrame(bigError)
df.to_csv("bigError.csv")

In [None]:
# Tokenize the whole dataset in batches. All original columns are removed so
# that only the fields returned by tokenize_and_align_labels remain.
original_columns = newDataset.column_names
tokenized_datasets = newDataset.map(
    tokenize_and_align_labels,
    remove_columns=original_columns,
    batched=True,
)
tokenized_datasets