In [91]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import re
from datasets import load_dataset
from datasets import Dataset

In [90]:
raw_datasets  = load_dataset("conll2003")

Reusing dataset conll2003 (C:\Users\Brad\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 95.53it/s]


In [139]:
def fileToArray(file_path):
    # turn text into array of words
    text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    text_ds = text_ds.enumerate()
    docTxt = []
    for i in text_ds.as_numpy_iterator():
        line = i[1].decode().split()
        for l in line:
            docTxt.append(l)
    
    docTxt = ' '.join(docTxt)
    return docTxt

def calc_word_indices(full_text, discourse_start, discourse_end):
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    output = list(range(start_index, start_index + token_len))
    if output[-1] >= len(full_text.split()):
        output = list(range(start_index, start_index + token_len-1))
    return output

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def stringToArray(mystr):
    wordList = re.sub("[^\w]", " ",  mystr).split()
    return wordList

def getIntArray(strArray):
    intArray = []
    for s in strArray:
        intArray.append(int(s))
    return intArray

In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [4]:
train["predictionstring"][100]

'80 81 82 83 84 85 86 87 88 89 90 91 92'

In [5]:
sampleTxt = fileToArray("train/" + str(train["id"][0]) + ".txt")
ftest = calc_word_indices(sampleTxt, int(train["discourse_start"][0]), int(train["discourse_end"][0]))
print(ftest)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]


In [6]:
#make a dummy submission
data = {'id': [], 'class': [], 'predictionstring': []}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,class,predictionstring


In [7]:
for t in os.walk('test'):
    print(t)

('test', [], ['0FB0700DAF44.txt', '18409261F5C2.txt', 'D46BCB48440A.txt', 'D72CB1C11673.txt', 'DF920E0A7337.txt'])


In [8]:
train["discourse_type"].unique()

array(['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
       'Counterclaim', 'Rebuttal'], dtype=object)

In [9]:
#make dictionary to convert discourse types to integers
tokenDict = {
    "Lead" : 0,
    "Position" : 1,
    "Evidence" : 2,
    "Claim" : 3,
    "Concluding Statement" : 4,
    "Counterclaim" : 5,
    "Rebuttal": 6
}

In [117]:
df_file = train[train.id.str.contains(str(train["id"][0]),case=False)]
labels = []
num_labels = []
fullText = []
for i in range(len(df_file.index)):
    df_string = df_file["predictionstring"][i]
    stringArray = df_string.split()
    df_textString = df_file["discourse_text"][i].split()
    for df in df_textString:
        # try not appending word if it is a '.' or a ','
        fullText.append(df)
    for s in stringArray:
        labels.append(df_file["discourse_type"][i])
        num_labels.append(tokenDict[str(df_file["discourse_type"][i])])
        

print(fullText)
print(num_labels)

['Modern', 'humans', 'today', 'are', 'always', 'on', 'their', 'phone.', 'They', 'are', 'always', 'on', 'their', 'phone', 'more', 'than', '5', 'hours', 'a', 'day', 'no', 'stop', '.All', 'they', 'do', 'is', 'text', 'back', 'and', 'forward', 'and', 'just', 'have', 'group', 'Chats', 'on', 'social', 'media.', 'They', 'even', 'do', 'it', 'while', 'driving.', 'They', 'are', 'some', 'really', 'bad', 'consequences', 'when', 'stuff', 'happens', 'when', 'it', 'comes', 'to', 'a', 'phone.', 'Some', 'certain', 'areas', 'in', 'the', 'United', 'States', 'ban', 'phones', 'from', 'class', 'rooms', 'just', 'because', 'of', 'it.', 'When', 'people', 'have', 'phones,', 'they', 'know', 'about', 'certain', 'apps', 'that', 'they', 'have', '.Apps', 'like', 'Facebook', 'Twitter', 'Instagram', 'and', 'Snapchat.', 'So', 'like', 'if', 'a', 'friend', 'moves', 'away', 'and', 'you', 'want', 'to', 'be', 'in', 'contact', 'you', 'can', 'still', 'be', 'in', 'contact', 'by', 'posting', 'videos', 'or', 'text', 'messages.', 

In [118]:
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [120]:
inputs = tokenizer(fullText, is_split_into_words=True)
word_ids = inputs.word_ids()
print(len(num_labels))
print(len(word_ids))
print(align_labels_with_tokens(num_labels, word_ids))

378
452
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

In [87]:
# tokenize kaggle dataset through mapping
kaggleTrain = load_dataset('csv', data_files="trainHugging1.csv")

Using custom data configuration default-8508840b1601b217
Reusing dataset csv (C:\Users\Brad\.cache\huggingface\datasets\csv\default-8508840b1601b217\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
100%|██████████| 1/1 [00:00<00:00, 143.21it/s]


In [257]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        try: 
            word_ids = tokenized_inputs.word_ids(i)
            new_labels.append(align_labels_with_tokens(labels, word_ids))
        except:
            new_labels.append([-100])

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [249]:
# run this after trainHugging4
kaggleTrain = pd.read_csv("trainHugging4.csv")
kaggleTrain['tokens'] = kaggleTrain['tokens'].apply(lambda a: a.split())
kaggleTrain['ner_tags'] = kaggleTrain['ner_tags'].apply(lambda a: a.split())
kaggleTrain = kaggleTrain.drop(['Unnamed: 0'], axis = 1)
kaggleTrain.head()

Unnamed: 0,id,tokens,ner_tags
0,423A1CA112E2,"[Modern, humans, today, are, always, on, their...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A8445CABFECE,"[Drivers, should, not, be, able, to, use, phon...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, ..."
2,6B4F7A0165B9,"[The, ability, to, stay, connected, to, people...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,E05C7F5C1156,"[People, are, debating, whether, if, drivers, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,50B3435E475B,"[Over, half, of, drivers, in, today's, society...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [250]:
for i, row in kaggleTrain.iterrows():
    numTags = []
    for r in row["ner_tags"]:
        numTags.append(int(r))
    
    kaggleTrain.at[i, "ner_tags"] = numTags

In [244]:
kaggleTrain["ner_tags"][0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [309]:
kaggleData = Dataset.from_pandas(kaggleTrain)

In [331]:
inputs = tokenizer(kaggleData["tokens"][1], is_split_into_words=True)
inputs.tokens()
word_ids = inputs.word_ids()
taggs = getIntArray(kaggleData["ner_tags"][1])
print(len(kaggleData["tokens"][1]))

209


In [None]:
print(align_labels_with_tokens(kaggleData["ner_tags"][0], word_ids))

In [258]:
tokenized_datasets = kaggleData.map(
    tokenize_and_align_labels,
    batched=True,
)

100%|██████████| 16/16 [00:48<00:00,  3.04s/ba]


In [264]:
tokenized_datasets

Dataset({
    features: ['attention_mask', 'id', 'input_ids', 'labels', 'ner_tags', 'token_type_ids', 'tokens'],
    num_rows: 15594
})

In [169]:
# #errors
# C3811E7F1750
# 9B7494278BAE

In [265]:
df_token = tokenized_datasets.to_pandas()
df_token.head()

Unnamed: 0,attention_mask,id,input_ids,labels,ner_tags,token_type_ids,tokens
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",423A1CA112E2,"[101, 4825, 3612, 2052, 1132, 1579, 1113, 1147...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Modern, humans, today, are, always, on, their..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",A8445CABFECE,"[101, 23004, 1431, 1136, 1129, 1682, 1106, 132...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Drivers, should, not, be, able, to, use, phon..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",6B4F7A0165B9,"[101, 1109, 2912, 1106, 2215, 3387, 1106, 1234...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, ability, to, stay, connected, to, people..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",E05C7F5C1156,"[101, 2563, 1132, 27066, 2480, 1191, 7016, 143...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[People, are, debating, whether, if, drivers, ..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",50B3435E475B,"[101, 3278, 1544, 1104, 7016, 1107, 2052, 112,...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Over, half, of, drivers, in, today's, society..."


In [335]:
df_tokenError = pd.DataFrame(columns=["id", "labels", "ner_tags", "tokens"])

In [336]:
for index, row in df_token.iterrows():
    if len(row["labels"]) < 5:
        df_tokenError.loc[len(df_tokenError)] = row

In [337]:
df_tokenError.head()

Unnamed: 0,id,labels,ner_tags,tokens
0,C3811E7F1750,[-100],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[BOOM!!, You're, on, I-75, on, the, ground, bl..."
1,9B7494278BAE,[-100],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[Even, though, majority, of, humans, own, and,..."
2,93B9C33FF16D,[-100],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Their, are, so, many, things, you, can, do, t..."
3,ABE35EAFEC00,[-100],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[more, people, die, in, car, related, accidnts..."
4,F5E4D811501C,[-100],"[1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, ...","[Limiting, car, usage, is, a, very, good, idea..."


In [333]:
# ner_tags < tokens by 1
inputs = tokenizer(list(df_tokenError["tokens"][0]), is_split_into_words=True)
inputs.tokens()
word_ids = inputs.word_ids()
taggs = getIntArray(df_tokenError["ner_tags"][0])
print(len(df_tokenError["ner_tags"][3]))

205


In [323]:
print(word_ids)

[None, 0, 0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 18, 18, 18, 19, 20, 21, 21, 22, 22, 23, 24, 25, 26, 27, 27, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 41, 42, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 57, 58, 58, 59, 60, 61, 62, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90, 91, 92, 92, 93, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 132, 133, 134, 135, 136, 137, 137, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 146, 147, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 178, 179, 179, 179, 180, 181, 182, 183, 183, 184, 185, 185

In [306]:
print(align_labels_with_tokens(taggs, word_ids))

IndexError: list index out of range

In [180]:
errorRow = kaggleTrain.loc[kaggleTrain['id'] == "C3811E7F1750"]

In [181]:
errorRow["tokens"][12]

['BOOM!!',
 "You're",
 'on',
 'I-75',
 'on',
 'the',
 'ground',
 'bleeding',
 'out',
 'watching',
 'everyone',
 'surrounding',
 'you,',
 'calling',
 '911.',
 'At',
 'this',
 'point',
 "you're",
 'thinking',
 'to',
 'yourself,',
 '"why',
 'the',
 'freak',
 'did',
 'this',
 'happen?"',
 'A',
 'couple',
 'hours',
 'later',
 'you',
 'found',
 'out',
 'an',
 'adult',
 'was',
 'on',
 'his',
 'phone',
 'texting',
 'and',
 'driving',
 'before',
 'the',
 'wreck',
 'happened,',
 'which',
 'is',
 'the',
 'most',
 'selfish',
 'thing',
 'to',
 'do',
 'while',
 'driving.',
 'Putting',
 'your',
 'life',
 'and',
 "other's",
 'lives',
 'in',
 'danger',
 'just',
 'so',
 'you',
 'can',
 'send',
 'a',
 'stupid',
 'text',
 'to',
 'your',
 'girl/boyfriend',
 'is',
 'completely',
 'selfish',
 'I',
 'firmly',
 'believe',
 'that',
 'we',
 'need',
 'stricter',
 'phone',
 'laws',
 'to',
 'in',
 'jail',
 'people.',
 'Texting',
 'while',
 'driving',
 'is',
 'in',
 'the',
 'top',
 'five',
 'causes',
 'for',
 'death

In [184]:
tok = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
wro = [None, 0, 0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 18, 18, 18, 19, 20, 21, 21, 22, 22, 23, 24, 25, 26, 27, 27, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 41, 42, 43, 44, 45, 46, 47, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 57, 58, 58, 59, 60, 61, 62, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90, 91, 92, 92, 93, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 132, 133, 134, 135, 136, 137, 137, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145, 146, 146, 147, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 178, 179, 179, 179, 180, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 187, 188, 188, 189, 190, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 226, 227, 228, 228, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 243, 244, 245, 245, 246, 247, 247, 248, 249, 250, 250, 250, 251, 251, 252, 253, 254, 254, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 279, 280, 281, 282, 283, 284, 285, 286, 287, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 308, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 321, None]


In [190]:
inputs = tokenizer(kaggleTrain['ner_tags'][0], is_split_into_words=True)
word_ids = inputs.word_ids()
print(len(num_labels))
print(len(word_ids))
print(align_labels_with_tokens(kaggleTrain['ner_tags'][0], word_ids))

378
380
[-100, '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', 

In [339]:
df_errorTest = pd.read_csv("train.csv")
singleErr = df_errorTest[df_errorTest["id"] == "C3811E7F1750"]
singleErr.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
99,C3811E7F1750,1622473000000.0,0.0,461.0,BOOM!! You're on I-75 on the ground bleeding o...,Lead,Lead 1,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
100,C3811E7F1750,1622473000000.0,462.0,531.0,I firmly believe that we need stricter phone l...,Position,Position 1,80 81 82 83 84 85 86 87 88 89 90 91 92
101,C3811E7F1750,1622473000000.0,532.0,641.0,Texting while driving is in the top five cause...,Claim,Claim 1,93 94 95 96 97 98 99 100 101 102 103 104 105 1...
102,C3811E7F1750,1622473000000.0,641.0,837.0,We need to stand together united and encourage...,Evidence,Evidence 1,114 115 116 117 118 119 120 121 122 123 124 12...
103,C3811E7F1750,1622473000000.0,838.0,998.0,Cellphones are brain washing us to think we al...,Claim,Claim 2,147 148 149 150 151 152 153 154 155 156 157 15...


In [341]:
train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [373]:
df_file = train[train.id.str.contains("F5E4D811501C")]
df_file = df_file.reset_index()
labels = []
num_labels = []
fullText = []
for i in range(len(df_file.index)):
    ps = df_file["predictionstring"][i].split()
    dt = df_file["discourse_text"][i].split() 
    print(len(ps))
    print(len(dt))
    df_string = df_file["predictionstring"][i]
    stringArray = df_string.split()
    df_textString = df_file["discourse_text"][i].split()
    for df in df_textString:
        fullText.append(df)
    for s in stringArray:
        labels.append(df_file["discourse_type"][i])
        num_labels.append(tokenDict[str(df_file["discourse_type"][i])])
        

print(fullText)
print(num_labels)

8
8
7
7
4
4
3
3
16
16
14
14
59
59
17
17
96
96
39
40
['Limiting', 'car', 'usage', 'is', 'a', 'very', 'good', 'idea', 'It', 'can', 'save', 'a', 'lot', 'of', 'money,', 'keep', 'pollution', 'from', 'happening', 'keep', 'people', 'safe.', 'If', 'more', 'people', 'tend', 'to', 'save', 'money', 'because', 'of', 'cars,', 'then', 'they', 'can', 'buy', 'more', 'stuff.', 'If', 'more', 'people', 'save', 'the', 'enviornment', 'people', 'can', 'have', 'fresh', 'air', 'to', 'breath', 'in.', '"In', 'German', 'Suburb,', 'Life', 'Goes', 'On', 'Without', 'Cars"', 'by', 'Elisabeth', 'Rosenthal,', 'it', 'states', 'that', '"As', 'a', 'result', 'of', 'buying', 'a', 'parking', 'space', 'for', '$40,000,', '70', 'percent', 'of', "Vauban's", 'families', 'do', 'not', 'own', 'cars,', 'and', '57', 'percent', 'sold', 'a', 'car', 'to', 'move', 'here.', 'When', 'I', 'had', 'a', 'car', 'I', 'was', 'always', 'tense.', "I'm", 'much', 'happier', 'this', 'way,"', 'said', 'Heidrun', 'Walter', 'This', 'quote', 'explains', 't

In [382]:
df_file = train[train.id.str.contains("423A1CA112E2")]
df_file = df_file.reset_index()
idx = len(df_file) - 1
ps = df_file["predictionstring"][idx].split()
dt = df_file["discourse_text"][idx].split()
print(ps)

['355', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '370', '371', '372', '373', '374', '375', '376', '377', '378']


In [383]:
print(dt)

['The', 'news', 'always', 'updated', 'when', 'people', 'do', 'something', 'stupid', 'around', 'that', 'involves', 'their', 'phones.', 'The', 'safest', 'way', 'is', 'the', 'best', 'way', 'to', 'stay', 'safe.']


In [91]:
fileString = fileToArray("train/" + str(train["id"][0]) + ".txt")
fileArray = stringToArray(fileString)
df_file = train[train.id.str.contains(str(train["id"][0]),case=False)]
print(len(fileString))


2020
