In [429]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import re
from datasets import load_dataset
from datasets import Dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

In [434]:
raw_datasets  = load_dataset("conll2003")
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenDict = {
    "Lead" : 0,
    "Position" : 1,
    "Evidence" : 2,
    "Claim" : 3,
    "Concluding Statement" : 4,
    "Counterclaim" : 5,
    "Rebuttal": 6
}

Reusing dataset conll2003 (C:\Users\Brad\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 264.34it/s]


In [500]:
def fileToArray(file_path):
    # turn text into array of words
    text_ds = tf.data.TextLineDataset(file_path).filter(lambda x: tf.cast(tf.strings.length(x), bool))
    text_ds = text_ds.enumerate()
    docTxt = []
    for i in text_ds.as_numpy_iterator():
        line = i[1].decode().split()
        for l in line:
            docTxt.append(l)
    
    docTxt = ' '.join(docTxt)
    return docTxt

def calc_word_indices(full_text, discourse_start, discourse_end):
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    output = list(range(start_index, start_index + token_len))
    if output[-1] >= len(full_text.split()):
        output = list(range(start_index, start_index + token_len-1))
    return output

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def stringToArray(mystr):
    wordList = re.sub("[^\w]", " ",  mystr).split()
    return wordList

def getIntArray(strArrays):
    intArrays = []
    for strArray in strArrays:
        intArray = []
        for s in strArray:
            intArray.append(int(s))
        intArrays.append(intArray)
    return intArrays

def csv_to_df(fileName):
    df = pd.read_csv(fileName)
    df['tokens'] = df['tokens'].apply(lambda a: a.split())
    df['ner_tags'] = df['ner_tags'].apply(lambda a: a.split())
    df = df.drop(['Unnamed: 0'], axis = 1)
    return df

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    all_labels = getIntArray(examples["ner_tags"])
    new_labels = []
    for i, labels in enumerate(all_labels):
        # try: 
        #     word_ids = tokenized_inputs.word_ids(i)
        #     new_labels.append(align_labels_with_tokens(labels, word_ids))
        # except:
        #     new_labels.append([-100])
        
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [432]:
#takes in a csv file name and returns a tokenized dataset
def createTokenDataset(fileName):
    df = csv_to_df(fileName)
    essayNames = df["id"].unique()
    
    data = {
        "id": [],
        "tokens": [],
        "ner_tags": []
    }

    token_df = pd.DataFrame(data)
    for fileName in tqdm(essayNames):
        df_file = df[df.id.str.contains(fileName,case=False)]
        df_file = df_file.reset_index()
        labels = []
        num_labels = []
        fullText = []
        for i in range(len(df_file.index)):
            df_string = df_file["predictionstring"][i]
            stringArray = df_string.split()
            df_textString = df_file["discourse_text"][i].split()
            for df in df_textString:
                if df != '.' and df != ',':
                    fullText.append(df)
            for s in stringArray:
                labels.append(df_file["discourse_type"][i])
                num_labels.append(tokenDict[str(df_file["discourse_type"][i])])

        token_df.loc[len(token_df.index)] = [fileName, fullText, num_labels]
    
    #find a better way to do this
    token_df['tokens'] = token_df['tokens'].apply(lambda a: ' '.join(map(str, a)))
    token_df['ner_tags'] = token_df['ner_tags'].apply(lambda a: ' '.join(map(str, a)))
    token_df['tokens'] = token_df['tokens'].apply(lambda a: a.split())
    token_df['ner_tags'] = token_df['ner_tags'].apply(lambda a: a.split())

    tokenDataset = Dataset.from_pandas(token_df)

    tokenized_datasets = tokenDataset.map(
        tokenize_and_align_labels,
        batched=True,
    )
    return tokenized_datasets
    

In [436]:
train = csv_to_df("trainHugging5.csv")
newDataset = Dataset.from_pandas(train)

tokenized_datasets = newDataset.map(
    tokenize_and_align_labels,
    batched=True,
)

  0%|          | 0/16 [00:04<?, ?ba/s]


TypeError: not all arguments converted during string formatting

In [339]:
df_errorTest = pd.read_csv("train.csv")
singleErr = df_errorTest[df_errorTest["id"] == "C3811E7F1750"]
singleErr.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
99,C3811E7F1750,1622473000000.0,0.0,461.0,BOOM!! You're on I-75 on the ground bleeding o...,Lead,Lead 1,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
100,C3811E7F1750,1622473000000.0,462.0,531.0,I firmly believe that we need stricter phone l...,Position,Position 1,80 81 82 83 84 85 86 87 88 89 90 91 92
101,C3811E7F1750,1622473000000.0,532.0,641.0,Texting while driving is in the top five cause...,Claim,Claim 1,93 94 95 96 97 98 99 100 101 102 103 104 105 1...
102,C3811E7F1750,1622473000000.0,641.0,837.0,We need to stand together united and encourage...,Evidence,Evidence 1,114 115 116 117 118 119 120 121 122 123 124 12...
103,C3811E7F1750,1622473000000.0,838.0,998.0,Cellphones are brain washing us to think we al...,Claim,Claim 2,147 148 149 150 151 152 153 154 155 156 157 15...


In [341]:
train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [373]:
df_file = train[train.id.str.contains("F5E4D811501C")]
df_file = df_file.reset_index()
labels = []
num_labels = []
fullText = []
for i in range(len(df_file.index)):
    ps = df_file["predictionstring"][i].split()
    dt = df_file["discourse_text"][i].split() 
    print(len(ps))
    print(len(dt))
    df_string = df_file["predictionstring"][i]
    stringArray = df_string.split()
    df_textString = df_file["discourse_text"][i].split()
    for df in df_textString:
        fullText.append(df)
    for s in stringArray:
        labels.append(df_file["discourse_type"][i])
        num_labels.append(tokenDict[str(df_file["discourse_type"][i])])
        

print(fullText)
print(num_labels)

8
8
7
7
4
4
3
3
16
16
14
14
59
59
17
17
96
96
39
40
['Limiting', 'car', 'usage', 'is', 'a', 'very', 'good', 'idea', 'It', 'can', 'save', 'a', 'lot', 'of', 'money,', 'keep', 'pollution', 'from', 'happening', 'keep', 'people', 'safe.', 'If', 'more', 'people', 'tend', 'to', 'save', 'money', 'because', 'of', 'cars,', 'then', 'they', 'can', 'buy', 'more', 'stuff.', 'If', 'more', 'people', 'save', 'the', 'enviornment', 'people', 'can', 'have', 'fresh', 'air', 'to', 'breath', 'in.', '"In', 'German', 'Suburb,', 'Life', 'Goes', 'On', 'Without', 'Cars"', 'by', 'Elisabeth', 'Rosenthal,', 'it', 'states', 'that', '"As', 'a', 'result', 'of', 'buying', 'a', 'parking', 'space', 'for', '$40,000,', '70', 'percent', 'of', "Vauban's", 'families', 'do', 'not', 'own', 'cars,', 'and', '57', 'percent', 'sold', 'a', 'car', 'to', 'move', 'here.', 'When', 'I', 'had', 'a', 'car', 'I', 'was', 'always', 'tense.', "I'm", 'much', 'happier', 'this', 'way,"', 'said', 'Heidrun', 'Walter', 'This', 'quote', 'explains', 't

In [382]:
df_file = train[train.id.str.contains("423A1CA112E2")]
df_file = df_file.reset_index()
idx = len(df_file) - 1
ps = df_file["predictionstring"][idx].split()
dt = df_file["discourse_text"][idx].split()
print(ps)

['355', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '370', '371', '372', '373', '374', '375', '376', '377', '378']


In [383]:
print(dt)

['The', 'news', 'always', 'updated', 'when', 'people', 'do', 'something', 'stupid', 'around', 'that', 'involves', 'their', 'phones.', 'The', 'safest', 'way', 'is', 'the', 'best', 'way', 'to', 'stay', 'safe.']


In [388]:
fixTrain = csv_to_df("trainHugging4.csv")
fixTrain.head()

Unnamed: 0,id,tokens,ner_tags
0,423A1CA112E2,"[Modern, humans, today, are, always, on, their...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A8445CABFECE,"[Drivers, should, not, be, able, to, use, phon...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, ..."
2,6B4F7A0165B9,"[The, ability, to, stay, connected, to, people...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,E05C7F5C1156,"[People, are, debating, whether, if, drivers, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,50B3435E475B,"[Over, half, of, drivers, in, today's, society...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [423]:
fixTrainSample = fixTrain[fixTrain["id"] == 'F5E4D811501C']
print(list(fixTrainSample.index)[0])

1070


In [424]:
longTokens = list(fixTrainSample["tokens"])[0]
for t in longTokens:
    if t == '.' or t == ',':
        print(t)

In [502]:
errorIds = list(df_tokenError["id"])
for e in errorIds:
    row = fixTrain[fixTrain["id"] == e]
    tokens = list(row["tokens"])[0]
    for t in tokens:
        if t == '.' or t == ',':
            print("r")
            tokens.remove(t)
    fixTrain.at[list(row.index)[0], "tokens"] = tokens.copy()


In [510]:
data = {
    "id": errorIds
}

token_df = pd.DataFrame(data)
token_df.to_csv("Error_Ids.csv")

In [503]:
fixData = Dataset.from_pandas(fixTrain)
fixData

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 15594
})

In [509]:
inputs = tokenizer(fixData[1070]["tokens"], is_split_into_words=True)

labels = fixData[0]["ner_tags"]
word_ids = inputs.word_ids()
print(len(labels))
print(len(word_ids))
print(align_labels_with_tokens(labels, word_ids))

378
334


TypeError: not all arguments converted during string formatting

In [495]:
fixTrain.to_csv("trainHugging5.csv")
fixTrain.

In [484]:
newTrain = csv_to_df("trainHugging4.csv")
# newTrain = newTrain.drop(['Unnamed: 0'], axis = 1)
# for index, row in newTrain.iterrows():
#     tags = []
#     for c in row[2]:
#         try:
#             tags.append(int(c))
#         except:
#             continue
#     newTrain.at[index, "ner_tags"] = tags


In [485]:
newTrain.head()

Unnamed: 0,id,tokens,ner_tags
0,423A1CA112E2,"[Modern, humans, today, are, always, on, their...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A8445CABFECE,"[Drivers, should, not, be, able, to, use, phon...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, ..."
2,6B4F7A0165B9,"[The, ability, to, stay, connected, to, people...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,E05C7F5C1156,"[People, are, debating, whether, if, drivers, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,50B3435E475B,"[Over, half, of, drivers, in, today's, society...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [486]:
newDataset = Dataset.from_pandas(newTrain)

In [501]:
tokenized_datasets = fixData.map(
    tokenize_and_align_labels,
    batched=True,
)

  0%|          | 0/16 [00:03<?, ?ba/s]


IndexError: list index out of range

In [91]:
fileString = fileToArray("train/" + str(train["id"][0]) + ".txt")
fileArray = stringToArray(fileString)
df_file = train[train.id.str.contains(str(train["id"][0]),case=False)]
print(len(fileString))


2020
