In [127]:
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm

In [6]:
raw_datasets  = load_dataset("conll2003")
raw_datasets

Reusing dataset conll2003 (C:\Users\Brad\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)
100%|██████████| 3/3 [00:00<00:00, 429.63it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [38]:
raw_datasets['train'][0]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [39]:
# O means the word doesn’t correspond to any entity.
# B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.
# B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.
# B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.
# B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [40]:
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [41]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [42]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [43]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [44]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

100%|██████████| 15/15 [00:04<00:00,  3.37ba/s]
100%|██████████| 4/4 [00:00<00:00,  4.68ba/s]
100%|██████████| 4/4 [00:00<00:00,  4.13ba/s]


In [45]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 3453
    })
})

In [52]:
kaggleTrain = load_dataset('csv', data_files="train.csv")

Using custom data configuration default-5d614e759366f6b8


Downloading and preparing dataset csv/default to C:\Users\Brad\.cache\huggingface\datasets\csv\default-5d614e759366f6b8\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 1002.22it/s]


Dataset csv downloaded and prepared to C:\Users\Brad\.cache\huggingface\datasets\csv\default-5d614e759366f6b8\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 28.89it/s]


In [53]:
kaggleTrain

DatasetDict({
    train: Dataset({
        features: ['id', 'discourse_id', 'discourse_start', 'discourse_end', 'discourse_text', 'discourse_type', 'discourse_type_num', 'predictionstring'],
        num_rows: 144293
    })
})

In [40]:
train = pd.read_csv("train.csv")

In [41]:
train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [42]:
essayNames = train["id"].unique()

In [181]:
tokenDict = {
    "Lead" : 0,
    "Position" : 1,
    "Evidence" : 2,
    "Claim" : 3,
    "Concluding Statement" : 4,
    "Counterclaim" : 5,
    "Rebuttal": 6
}

data = {
    "id": [],
    "tokens": [],
    "ner_tags": []
}

train_new = pd.DataFrame(data)

In [39]:
df_file = train[train.id.str.contains("A8445CABFECE",case=False)]
df_file = df_file.reset_index()
df_file.head()

Unnamed: 0,index,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,10,A8445CABFECE,1622576000000.0,18.0,85.0,Drivers should not be able to use phones while...,Position,Position 1,3 4 5 6 7 8 9 10 11 12 13 14
1,11,A8445CABFECE,1622576000000.0,86.0,202.0,Drivers who used their phone while operating a...,Claim,Claim 1,15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 3...
2,12,A8445CABFECE,1622576000000.0,203.0,1030.0,According to an article by the Edgar Snyder Fi...,Evidence,Evidence 1,36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 5...
3,13,A8445CABFECE,1622576000000.0,1031.0,1243.0,"In conclusion, drivers should not able to work...",Concluding Statement,Concluding Statement 1,177 178 179 180 181 182 183 184 185 186 187 18...


In [182]:
#csv data returns a string not a list
def createRow(fileName):
    df_file = train[train.id.str.contains(fileName,case=False)]
    df_file = df_file.reset_index()
    labels = []
    num_labels = []
    fullText = []
    for i in range(len(df_file.index)):
        df_string = df_file["predictionstring"][i]
        stringArray = df_string.split()
        df_textString = df_file["discourse_text"][i].split()
        for df in df_textString:
            fullText.append(df)
        for s in stringArray:
            labels.append(df_file["discourse_type"][i])
            num_labels.append(tokenDict[str(df_file["discourse_type"][i])])

    train_new.loc[len(train_new.index)] = [fileName, fullText, num_labels]

for fileName in tqdm(essayNames):
    createRow(fileName)


  0%|          | 0/15594 [00:00<?, ?it/s]

  return array(a, dtype, copy=False, order=order)


In [183]:
train_new['tokens'] = train_new['tokens'].apply(lambda a: ' '.join(map(str, a)))
train_new['ner_tags'] = train_new['ner_tags'].apply(lambda a: ' '.join(map(str, a)))

In [184]:
train_new.head()

Unnamed: 0,id,tokens,ner_tags
0,423A1CA112E2,Modern humans today are always on their phone....,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,A8445CABFECE,Drivers should not be able to use phones while...,1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 ...
2,6B4F7A0165B9,The ability to stay connected to people we kno...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,E05C7F5C1156,People are debating whether if drivers should ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,50B3435E475B,Over half of drivers in today's society have t...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [185]:
train_new.to_csv("trainHugging4.csv")

In [186]:
pdTest = pd.read_csv("trainHugging4.csv")
pdTest['tokens'] = pdTest['tokens'].apply(lambda a: a.split())
pdTest['ner_tags'] = pdTest['ner_tags'].apply(lambda a: a.split())
pdTest.head()

Unnamed: 0.1,Unnamed: 0,id,tokens,ner_tags
0,0,423A1CA112E2,"[Modern, humans, today, are, always, on, their...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,A8445CABFECE,"[Drivers, should, not, be, able, to, use, phon...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, ..."
2,2,6B4F7A0165B9,"[The, ability, to, stay, connected, to, people...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,E05C7F5C1156,"[People, are, debating, whether, if, drivers, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,50B3435E475B,"[Over, half, of, drivers, in, today's, society...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [177]:
def intss(a):
    for i in a:
        i = int(i)
    return a
pdTest['ner_tags'] = pdTest['ner_tags'].apply(lambda a: intss(a))

In [180]:
for i in pdTest['ner_tags'][0]:
    print(int(i))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4


In [122]:
pdTest.to_csv("test3.csv")

In [123]:
pdTest = pd.read_csv("test3.csv")
pdTest.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,tokens,ner_tags
0,0,0,423A1CA112E2,"['Modern', 'humans', 'today', 'are', 'always',...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
1,1,1,A8445CABFECE,"['Drivers', 'should', 'not', 'be', 'able', 'to...","['1', '1', '1', '1', '1', '1', '1', '1', '1', ..."
2,2,2,6B4F7A0165B9,"['The', 'ability', 'to', 'stay', 'connected', ...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
3,3,3,E05C7F5C1156,"['People', 'are', 'debating', 'whether', 'if',...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."
4,4,4,50B3435E475B,"['Over', 'half', 'of', 'drivers', 'in', ""today...","['0', '0', '0', '0', '0', '0', '0', '0', '0', ..."


In [124]:
pdTest["ner_tags"][0]

"['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2'