In [5]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import re
from datasets import load_dataset
from datasets import Dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric

In [6]:
raw_datasets  = load_dataset("conll2003")
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
df_errorIds = pd.read_csv("Error_Ids.csv")
df_errorIds = df_errorIds.drop(['Unnamed: 0'], axis = 1)
metric = load_metric("seqeval")

tokenDict = {
    "Lead" : 0,
    "Position" : 1,
    "Evidence" : 2,
    "Claim" : 3,
    "Concluding Statement" : 4,
    "Counterclaim" : 5,
    "Rebuttal": 6
}

Reusing dataset conll2003 (C:\Users\Brad\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 27.77it/s]


In [3]:
# helper functions

def fileToArray(file_path):
    # turn text into array of words
    text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    text_ds = text_ds.enumerate()
    docTxt = []
    for i in text_ds.as_numpy_iterator():
        line = i[1].decode().split()
        for l in line:
            docTxt.append(l)
    
    docTxt = ' '.join(docTxt)
    return docTxt

def calc_word_indices(full_text, discourse_start, discourse_end):
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    output = list(range(start_index, start_index + token_len))
    if output[-1] >= len(full_text.split()):
        output = list(range(start_index, start_index + token_len-1))
    return output

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

def dfTags_to_ints(df):
    for index, row in df.iterrows():
        intArray = []
        tokenArray = []
        for i in row[2]:
            intArray.append(int(i))
        df.at[index, "ner_tags"] = intArray.copy()
        # for j in row[1]:
        #     if j != '.' and j != ',':
        #         tokenArray.append(j)
        # df.at[index, "tokens"] = tokenArray.copy()
    return df

def csv_to_df(fileName):
    return None

In [4]:
#takes in a csv file name and returns a tokenized dataset
#NEEDS to be updated in occurance with the createTokenTrainingSet() function
def createTokenDataset(fileName):

    #maybe create a helper function???
    df = csv_to_df(fileName)
    essayNames = df["id"].unique()
    
    data = {
        "id": [],
        "tokens": [],
        "ner_tags": []
    }

    token_df = pd.DataFrame(data)
    for fileName in tqdm(essayNames):
        df_file = df[df.id.str.contains(fileName,case=False)]
        df_file = df_file.reset_index()
        labels = []
        num_labels = []
        fullText = []
        for i in range(len(df_file.index)):
            df_string = df_file["predictionstring"][i]
            stringArray = df_string.split()
            df_textString = df_file["discourse_text"][i].split()
            for df in df_textString:
                if df != '.' and df != ',':
                    fullText.append(df)
            for s in stringArray:
                labels.append(df_file["discourse_type"][i])
                num_labels.append(tokenDict[str(df_file["discourse_type"][i])])

        token_df.loc[len(token_df.index)] = [fileName, fullText, num_labels]
    
    #find a better way to do this
    token_df['tokens'] = token_df['tokens'].apply(lambda a: ' '.join(map(str, a)))
    token_df['ner_tags'] = token_df['ner_tags'].apply(lambda a: ' '.join(map(str, a)))
    token_df['tokens'] = token_df['tokens'].apply(lambda a: a.split())
    token_df['ner_tags'] = token_df['ner_tags'].apply(lambda a: a.split())

    tokenDataset = Dataset.from_pandas(token_df)

    tokenized_datasets = tokenDataset.map(
        tokenize_and_align_labels,
        batched=True,
    )
    return tokenized_datasets

def createTokenTrainingSet(fileName):
    errorList = list(df_errorIds["id"])
    df = pd.read_csv(fileName)

    df['tokens'] = df['tokens'].apply(lambda a: a.split())
    df['ner_tags'] = df['ner_tags'].apply(lambda a: a.split())
    df = df.drop(['Unnamed: 0'], axis = 1)

    errorList_idx = list(map(lambda e: df.loc[df["id"] == e].index[0], errorList))
    df = df.drop(errorList_idx, 0)
    df = dfTags_to_ints(df)

    newDataset = Dataset.from_pandas(df)

    tokenized_datasets = newDataset.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=newDataset.column_names
    )
    return tokenized_datasets

    

In [None]:
tokenTrain = createTokenTrainingSet("trainHugging4.csv")

In [None]:
tokenTrain = tokenTrain.train_test_split(train_size=0.9, test_size=0.1)
tokenTrain

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 13974
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 1553
    })
})

In [6]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [7]:
label_names = ["Lead", "Position", "Evidence", "Claim", "Concluding Statement", "Counterclaim", "Rebuttal"]

In [None]:
labels = ["B-Lead", "B-Lead"]
predictions = ["B-Lead", "B-Claim"]
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'Claim': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'Lead': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.5}

In [12]:
df = pd.read_csv("train.csv")
df = df.loc[df['id'] == "423A1CA112E2"]
df

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
5,423A1CA112E2,1622628000000.0,887.0,1150.0,That's why there's a thing that's called no te...,Evidence,Evidence 3,163 164 165 166 167 168 169 170 171 172 173 17...
6,423A1CA112E2,1622628000000.0,1151.0,1533.0,Sometimes on the news there is either an accid...,Evidence,Evidence 4,211 212 213 214 215 216 217 218 219 220 221 22...
7,423A1CA112E2,1622628000000.0,1534.0,1602.0,Phones are fine to use and it's also the best ...,Claim,Claim 2,282 283 284 285 286 287 288 289 290 291 292 29...
8,423A1CA112E2,1622628000000.0,1603.0,1890.0,If you go through a problem and you can't find...,Evidence,Evidence 5,297 298 299 300 301 302 303 304 305 306 307 30...
9,423A1CA112E2,1622628000000.0,1891.0,2027.0,The news always updated when people do somethi...,Concluding Statement,Concluding Statement 1,355 356 357 358 359 360 361 362 363 364 365 36...


In [55]:
def myEval(true_labels, true_predictions):
    label_dict = []
    pred_dict = []
    curr_type = true_labels[0][0]
    
    seqArray = []   
    for idx, i in enumerate(true_labels[0]):
        if i == curr_type:
            seqArray.append(idx)
        else:
            seqArray.append(idx)
            label_dict.append([curr_type, seqArray])
            curr_type = i
            seqArray = []
    
    seqArray = []
    for idx, i in enumerate(true_predictions[0]):
        if i == curr_type:
            seqArray.append(idx)
        else:
            seqArray.append(idx)
            pred_dict.append([curr_type, seqArray])
            curr_type = i
            seqArray = []
    
    #loop through predictions to find true & false positives
    truth_table = {
        "fp": 0,
        "tp": 0,
        "fn": 0
    }
    for p in pred_dict:
        foundMatch = False
        for l in label_dict:
            if p[0] == l[0] and len(list(set(p[1]) & set(l[1]))) > len(p) / 2:
                foundMatch = True
        
        if foundMatch:
            truth_table["tp"] = truth_table["tp"] + 1
        else:
            truth_table["fp"] = truth_table["fp"] + 1

    #loop through labels to find false negatives
    for l in label_dict:
        foundMatch = False
        for p in pred_dict:
            if l[0] == p[0] and len(list(set(l[1]) & set(p[1]))) >= len(l) / 2:
                foundMatch = True
        
        if not foundMatch:
            truth_table["fn"] = truth_table["fn"] + 1

    precision = truth_table["tp"] / (truth_table["tp"] + truth_table["fp"])
    recall = truth_table["tp"] / (truth_table["tp"] + truth_table["fn"])
    f1_score = 2 * precision * recall / (precision + recall)

    return precision, recall, f1_score

    

In [54]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    precision, recall, f1_score = myEval(true_labels, true_predictions)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1_score,
        "accuracy": all_metrics["overall_accuracy"],
    }

In [56]:
labelsTest = [['Position', 'Position', 'Position', 'Position', 'Position', 'Position', 'Position', 'Position', 'Position', 'Position', 'Position', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Counterclaim', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Rebuttal', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Concluding Statement', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Concluding Statement', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Claim', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement', 'Concluding Statement']]

In [57]:
label_dict = []
curr_type = labelsTest[0][0]
seqArray = []   
for idx, i in enumerate(labelsTest[0]):
    if i == curr_type:
        seqArray.append(idx)
    else:
        seqArray.append(idx)
        label_dict.append([curr_type, seqArray])
        curr_type = i
        seqArray = []
        
print("TRUE LABELS")
for i in label_dict:
    print(i)

TRUE LABELS
['Position', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
['Evidence', [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]]
['Counterclaim', [72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97]]
['Rebuttal', [98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116]]
['Claim', [117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137]]
['Concluding Statement', [138]]
['Claim', [139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155]]
['Concluding Statement', [156]]
['Claim', [157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167]]
['Evidence', [168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 

In [35]:
pred_dict = label_dict.copy()
test1 = [1, 2, 3]
test2 = [2, 3, 4, 5]
# list(set(test1) & set(test2))
int(len(test1) / 2)

1

In [52]:
#loop through predictions to find true & false positives
truth_table = {
    "fp": 0,
    "tp": 0,
    "fn": 0
}
for p in pred_dict:
    foundMatch = False
    for l in label_dict:
        if p[0] == l[0] and len(list(set(p[1]) & set(l[1]))) > len(p) / 2:
            foundMatch = True
    
    if foundMatch:
        truth_table["tp"] = truth_table["tp"] + 1
    else:
        truth_table["fp"] = truth_table["fp"] + 1

#loop through labels to find false negatives
for l in label_dict:
    foundMatch = False
    for p in pred_dict:
        if l[0] == p[0] and len(list(set(l[1]) & set(p[1]))) >= len(l) / 2:
            foundMatch = True
    
    if not foundMatch:
        truth_table["fn"] = truth_table["fn"] + 1

precision = truth_table["tp"] / (truth_table["tp"] + truth_table["fp"])
recall = truth_table["tp"] / (truth_table["tp"] + truth_table["fn"])
f1_score = 2 * precision * recall / (precision + recall)


0
8


In [7]:
trainingCSV = pd.read_csv("train.csv")
trainingCSV.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [19]:
true_labels = []
true_predictions = []

In [16]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [17]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [18]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [14]:
smallToken = tokenTrain["train"][:10]
smallToken = Dataset.from_dict(smallToken)
smallToken = smallToken.train_test_split(train_size=0.9, test_size=0.1)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=smallToken["train"],
    eval_dataset=smallToken["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# EVERYTHING BELOW IS NOT USEFUL, CANNOT INSTALL GIT-LFS ON WINDOWS, USE COLAB

In [None]:
data = ""
with open('test/0FB0700DAF44.txt', 'r') as file:
    data = file.read().replace('\n', '')

In [58]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "brad1141/bert-finetuned-ner"
longformer_checkpoint = "allenai/longformer-base-4096"
token_classifier = pipeline(
    "token-classification", model=longformer_checkpoint, aggregation_strategy="simple"
)
predicts = token_classifier(data)

Downloading:   9%|▉         | 52.5M/570M [00:05<00:51, 10.5MB/s]

KeyboardInterrupt: 

Downloading:   9%|▉         | 52.7M/570M [00:19<00:51, 10.5MB/s]

In [24]:
def calc_word_indices(full_text, discourse_start, discourse_end):
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    output = list(range(start_index, start_index + token_len))
    if output[-1] >= len(full_text.split()):
        output = list(range(start_index, start_index + token_len-1))
    return output

outputt = calc_word_indices(data, 0, 13)
outputt = " ".join(str(x) for x in outputt)
print(outputt)

0 1 2


In [29]:
dict = {'id':[],
        'class': [],
        'predictionstring':[]}
sub_df = pd.DataFrame(dict)
sub_df.head()


Unnamed: 0,id,class,predictionstring


In [54]:
fileNames = os.listdir('train')
fileNames = fileNames[:10000]
fileNames[0]

'0000D23A521A.txt'

In [57]:
def predict_and_format(fileName):
    data = ""
    with open('train/' + fileName, 'r') as file:
        data = file.read().replace('\n', '')
    
    predicts = token_classifier(data)

    for p in predicts:
        word_Indices_Array = calc_word_indices(data, p["start"], p["end"])
        word_Indices_String = " ".join(str(x) for x in word_Indices_Array)
        word_class = p["entity_group"]
        word_id = fileName[:-4]
        sub_df.loc[len(sub_df.index)] = [word_id, word_class, word_Indices_String]

In [None]:
for file in tqdm(fileNames):
    predict_and_format(file)

In [None]:
sub_df

In [49]:
sub_df.to_csv('submission.csv',index=False)