In [91]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import re
from datasets import load_dataset
from datasets import Dataset

In [90]:
raw_datasets  = load_dataset("conll2003")

Reusing dataset conll2003 (C:\Users\Brad\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 95.53it/s]


In [2]:
def fileToArray(file_path):
    # turn text into array of words
    text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    text_ds = text_ds.enumerate()
    docTxt = []
    for i in text_ds.as_numpy_iterator():
        line = i[1].decode().split()
        for l in line:
            docTxt.append(l)
    
    docTxt = ' '.join(docTxt)
    return docTxt

def calc_word_indices(full_text, discourse_start, discourse_end):
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    output = list(range(start_index, start_index + token_len))
    if output[-1] >= len(full_text.split()):
        output = list(range(start_index, start_index + token_len-1))
    return output

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def stringToArray(mystr):
    wordList = re.sub("[^\w]", " ",  mystr).split()
    return wordList

In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [4]:
train["predictionstring"][100]

'80 81 82 83 84 85 86 87 88 89 90 91 92'

In [5]:
sampleTxt = fileToArray("train/" + str(train["id"][0]) + ".txt")
ftest = calc_word_indices(sampleTxt, int(train["discourse_start"][0]), int(train["discourse_end"][0]))
print(ftest)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]


In [6]:
#make a dummy submission
data = {'id': [], 'class': [], 'predictionstring': []}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,class,predictionstring


In [7]:
for t in os.walk('test'):
    print(t)

('test', [], ['0FB0700DAF44.txt', '18409261F5C2.txt', 'D46BCB48440A.txt', 'D72CB1C11673.txt', 'DF920E0A7337.txt'])


In [8]:
train["discourse_type"].unique()

array(['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
       'Counterclaim', 'Rebuttal'], dtype=object)

In [9]:
#make dictionary to convert discourse types to integers
tokenDict = {
    "Lead" : 0,
    "Position" : 1,
    "Evidence" : 2,
    "Claim" : 3,
    "Concluding Statement" : 4,
    "Counterclaim" : 5,
    "Rebuttal": 6
}

In [10]:
df_file = train[train.id.str.contains(str(train["id"][0]),case=False)]
labels = []
num_labels = []
fullText = []
for i in range(len(df_file.index)):
    df_string = df_file["predictionstring"][i]
    stringArray = df_string.split()
    df_textString = df_file["discourse_text"][i].split()
    for df in df_textString:
        fullText.append(df)
    for s in stringArray:
        labels.append(df_file["discourse_type"][i])
        num_labels.append(tokenDict[str(df_file["discourse_type"][i])])
        

print(fullText)
print(num_labels)

['Modern', 'humans', 'today', 'are', 'always', 'on', 'their', 'phone.', 'They', 'are', 'always', 'on', 'their', 'phone', 'more', 'than', '5', 'hours', 'a', 'day', 'no', 'stop', '.All', 'they', 'do', 'is', 'text', 'back', 'and', 'forward', 'and', 'just', 'have', 'group', 'Chats', 'on', 'social', 'media.', 'They', 'even', 'do', 'it', 'while', 'driving.', 'They', 'are', 'some', 'really', 'bad', 'consequences', 'when', 'stuff', 'happens', 'when', 'it', 'comes', 'to', 'a', 'phone.', 'Some', 'certain', 'areas', 'in', 'the', 'United', 'States', 'ban', 'phones', 'from', 'class', 'rooms', 'just', 'because', 'of', 'it.', 'When', 'people', 'have', 'phones,', 'they', 'know', 'about', 'certain', 'apps', 'that', 'they', 'have', '.Apps', 'like', 'Facebook', 'Twitter', 'Instagram', 'and', 'Snapchat.', 'So', 'like', 'if', 'a', 'friend', 'moves', 'away', 'and', 'you', 'want', 'to', 'be', 'in', 'contact', 'you', 'can', 'still', 'be', 'in', 'contact', 'by', 'posting', 'videos', 'or', 'text', 'messages.', 

In [107]:
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
inputs = tokenizer(fullText, is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Modern',
 'humans',
 'today',
 'are',
 'always',
 'on',
 'their',
 'phone',
 '.',
 'They',
 'are',
 'always',
 'on',
 'their',
 'phone',
 'more',
 'than',
 '5',
 'hours',
 'a',
 'day',
 'no',
 'stop',
 '.',
 'All',
 'they',
 'do',
 'is',
 'text',
 'back',
 'and',
 'forward',
 'and',
 'just',
 'have',
 'group',
 'Cha',
 '##ts',
 'on',
 'social',
 'media',
 '.',
 'They',
 'even',
 'do',
 'it',
 'while',
 'driving',
 '.',
 'They',
 'are',
 'some',
 'really',
 'bad',
 'consequences',
 'when',
 'stuff',
 'happens',
 'when',
 'it',
 'comes',
 'to',
 'a',
 'phone',
 '.',
 'Some',
 'certain',
 'areas',
 'in',
 'the',
 'United',
 'States',
 'ban',
 'phones',
 'from',
 'class',
 'rooms',
 'just',
 'because',
 'of',
 'it',
 '.',
 'When',
 'people',
 'have',
 'phones',
 ',',
 'they',
 'know',
 'about',
 'certain',
 'apps',
 'that',
 'they',
 'have',
 '.',
 'A',
 '##pps',
 'like',
 'Facebook',
 'Twitter',
 'In',
 '##sta',
 '##gram',
 'and',
 'S',
 '##nap',
 '##cha',
 '##t',
 '.',
 'So',

In [13]:
word_ids = inputs.word_ids()
print(align_labels_with_tokens(num_labels, word_ids))

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [87]:
# tokenize kaggle dataset through mapping
kaggleTrain = load_dataset('csv', data_files="trainHugging1.csv")

Using custom data configuration default-8508840b1601b217
Reusing dataset csv (C:\Users\Brad\.cache\huggingface\datasets\csv\default-8508840b1601b217\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
100%|██████████| 1/1 [00:00<00:00, 143.21it/s]


In [105]:
def tokenize_and_align_labels(examples):
    print(examples["ner_tags"])
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [96]:
# run this after trainHugging3
kaggleTrain = pd.read_csv("trainHugging3.csv")
kaggleTrain['tokens'] = kaggleTrain['tokens'].apply(lambda a: a.split())
kaggleTrain['ner_tags'] = kaggleTrain['ner_tags'].apply(lambda a: a.split())
kaggleTrain = kaggleTrain.drop(['Unnamed: 0'], axis = 1)
kaggleTrain.head()

Unnamed: 0,id,tokens,ner_tags
0,423A1CA112E2,"[Modern, humans, today, are, always, on, their...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A8445CABFECE,"[Drivers, should, not, be, able, to, use, phon...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, ..."
2,6B4F7A0165B9,"[The, ability, to, stay, connected, to, people...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,E05C7F5C1156,"[People, are, debating, whether, if, drivers, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,50B3435E475B,"[Over, half, of, drivers, in, today's, society...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [97]:
kaggleData = Dataset.from_pandas(kaggleTrain)

In [114]:
print(len(kaggleData["ner_tags"][0]))
print(len(kaggleData["tokens"][0]))

378
378


In [116]:
inputs = tokenizer(kaggleData["tokens"][0], is_split_into_words=True)
inputs.tokens()
word_ids = inputs.word_ids()
print(len(word_ids))
print(align_labels_with_tokens(kaggleData["ner_tags"][0], word_ids))

452


TypeError: not all arguments converted during string formatting

In [106]:
tokenized_datasets = kaggleData.map(
    tokenize_and_align_labels,
    batched=True,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

[['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2'

  0%|          | 0/1 [00:03<?, ?ba/s]


TypeError: not all arguments converted during string formatting

In [91]:
fileString = fileToArray("train/" + str(train["id"][0]) + ".txt")
fileArray = stringToArray(fileString)
df_file = train[train.id.str.contains(str(train["id"][0]),case=False)]
print(len(fileString))


2020
