In [41]:
from transformers import AutoTokenizer
import pandas as pd
import csv
import os
import numpy as np
from glob import glob
from transformers import AutoModelForTokenClassification, DistilBertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
from source.eval import evaluate_indices, score
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score 
from sklearn.preprocessing import MultiLabelBinarizer
from torch import nn

In [42]:
! CUDA_VISIBLE_DEVICES=0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [43]:
#os.environ["CUDA_VISIBLE_DEVICES"]='1'

In [44]:
name_model="distilbert/distilbert-base-cased"

In [45]:
tokenizer = AutoTokenizer.from_pretrained(name_model)

In [46]:
sent = "This is, a real; sentence. Not really Legal...\nBut it is\nok."
tok_sent = tokenizer(sent)
print(tok_sent)
tokens = tokenizer.convert_ids_to_tokens(tok_sent["input_ids"])
print(tokens)

{'input_ids': [101, 1188, 1110, 117, 170, 1842, 132, 5650, 119, 1753, 1541, 10800, 119, 119, 119, 1252, 1122, 1110, 21534, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'This', 'is', ',', 'a', 'real', ';', 'sentence', '.', 'Not', 'really', 'Legal', '.', '.', '.', 'But', 'it', 'is', 'ok', '.', '[SEP]']


In [47]:
files = glob("./documents/lines/train/*.txt")
dataset_train = []
for file in files: 
    text = open(file, "r")
    original = text.readlines()
    for i in range(len(original)):
        original[i]= original[i].strip('\n')
        dataset_train += [original[i]]

In [48]:
files = glob("./documents/lines/dev/*.txt")
dataset_dev = []
for file in files: 
    text = open(file, "r")
    original = text.readlines()
    for i in range(len(original)):
        original[i]= original[i].strip('\n')
        dataset_dev += [original[i]]

In [49]:
files = glob("./documents/lines/test/*.txt")
dataset_test = []
for file in files: 
    text = open(file, "r")
    original = text.readlines()
    for i in range(len(original)):
        original[i]= original[i].strip('\n')
        dataset_test += [original[i]]

In [50]:
df = pd.read_csv("./dataset_v20230110.tsv", delimiter="	", quoting=csv.QUOTE_NONE, encoding='utf-8')
df.head()

Unnamed: 0,document,content,tag
0,ROSENBLATT v. BAER_MCL,MR. JUSTICE BRENNAN delivered the opinion of t...,Announcing function
1,ROSENBLATT v. BAER_MCL,A jury in New Hampshire Superior Court awarded...,Describing the adjudicated facts
2,ROSENBLATT v. BAER_MCL,Respondent alleged that the column contained d...,Describing procedural events
3,ROSENBLATT v. BAER_MCL,In the interval between the trial and the deci...,Describing the adjudicated facts
4,ROSENBLATT v. BAER_MCL,We there held that consistent with the First a...,Recalling a SCOTUS decision


In [51]:
def add_classes(ner_tags, nb_spans):
    indexes = np.where(ner_tags == 0)[0]

    #print("indexes : ", indexes)
    #print("ner_tags : ", ner_tags)
    start = 0
    #print(ner_tags)
    for i in indexes:
        #print(start, i)
        ner_tags[start:i] = make_span(ner_tags[start:i], nb_spans)
        start = i + 1
    return ner_tags

def make_span(span, nb_spans):
    start = 0
    for i in range(1,nb_spans+1):
        offset = (int(len(span)/nb_spans))+start
        
        if len(span) - offset < nb_spans:
            offset = len(span)
        span[start:offset+1] = i
        #print(offset)
        start = offset
        #print("span : ",span)
    
    return span


In [52]:
def make_dataset(dataset, nb_spans,return_tensors=None):
    sentences = []
    ner_tags = []
    fin_tags = []
    tokenizeds = []

    for i in range(len(dataset)):
        tok_sent = tokenizer(dataset[i])
        #print(type(tok_sent))
        tokens = tokenizer.convert_ids_to_tokens(tok_sent["input_ids"])
        sentences += tokens[1:-1]
        n_tag = np.zeros(len(tokens), dtype=int)
        n_tag[-1] = 1
        ner_tags += n_tag.tolist()
        tok_sent["labels"] = n_tag
        #tokenizeds_dev+= [tok_sent]
    ner_tags = (np.array(ner_tags) - 1) * (-1) 
    ner_tags = add_classes(np.array(ner_tags), nb_spans)
    ner_tags = [t for t in ner_tags]
    for i in range(0, len(sentences), (507)):
        if i + 507 > len(sentences):
            tokens = ['CLS'] + sentences[i:len(sentences)] + ['SEP']
            fin_tags = [-100] + ner_tags[i:len(sentences)] + [-100]
        else : 
            tokens = ['CLS'] + sentences[i:i + 507] + ['SEP']
            fin_tags = [-100] + ner_tags[i:i + 507] + [-100]
        #print(tokenizer.convert_tokens_to_string(tokens))
        sent = tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens[1:-1]))
        #print(type(sent))
        tokeniz = tokenizer(sent, return_tensors=return_tensors)
        #print(tokenizer.convert_tokens_to_string(tokeniz))
        #print(tokeniz)
        tokeniz["labels"] = fin_tags
        #print(fin_tags)
        #print('input_ids : ', len(tokeniz['input_ids']))
        #print('input_ids : ', tokeniz['input_ids'])
        #print(sent)
        #print('labels :', len(tokeniz['labels']))
        #print('am : ', len(tokeniz['attention_mask']))
        tokenizeds += [tokeniz]
    
    return tokenizeds
nb_spans = 10
tokenizeds_dev = make_dataset(dataset_dev, nb_spans)
tokenizeds_train = make_dataset(dataset_train, nb_spans)
tokenizeds_test = make_dataset(dataset_test, nb_spans, return_tensors="pt")

In [53]:
label_list = [str(s) for s in range(0,nb_spans+1)]

In [54]:
id2label = {int(s):s for s in label_list}
print(id2label)
label2id = {s:int(s) for s in label_list}
print(label2id)

{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '10'}
{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10': 10}


In [55]:
"""
id2label = {
    0: "0",
    1: "1"
}

label2id = {
    "0": 0,
    "1": 1
}
"""

'\nid2label = {\n    0: "0",\n    1: "1"\n}\n\nlabel2id = {\n    "0": 0,\n    "1": 1\n}\n'

In [56]:
model = AutoModelForTokenClassification.from_pretrained(
        name_model, 
        num_labels=nb_spans+1, 
        id2label=id2label, 
        label2id=label2id)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
import evaluate

seqeval = evaluate.load("seqeval")

In [58]:
def compute_metrics(p):

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []

        

    for prediction, label in zip(predictions, labels):
        for (p, l) in zip(prediction, label) :
            if l != -100:
                true_predictions += [label_list[p]]
                true_labels += [label_list[l]]
    
    #for prediction, label in zip(predictions, labels):
    #    for (p, l) in zip(prediction, label) :
    #        if l != -100:
    #            true_labels += [label_list[p]]
    """
    true_labels = [

        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]

        for prediction, label in zip(predictions, labels)

    ]


    true_labels = [ l for l in true_labels]
    true_predictions = [ l for l in true_predictions]
    #print(len(true_predictions))
    #print(len(true_labels))
    """
    """
    l_pre = []
    l_rec = []
    l_f1 = []

    for p, l in zip(true_predictions, true_labels):
        p = np.array(p, dtype=int)
        l = np.array(l, dtype=int)
        tp = len(np.where((p == l) and (p == 0))[0])
        tn = len(np.where((p == l) and (p != 0))[0])
        fp = len(np.where((p != l) and (p == 0))[0])
        fn = len(np.where((p != l) and (p != 0)[0]))
        pre, rec, f1 = score(tp, fp, fn)
        l_pre += [pre] 
        l_rec += [rec] 
        l_f1 += [f1] 
    """
    
    MultiLabelBinarizer()

    return {

        "precision": precision_score(true_labels, true_predictions, average='macro', zero_division=0),

        "recall": recall_score(true_labels, true_predictions, average='macro', zero_division=0),

        "f1": f1_score(true_labels, true_predictions, average='macro', zero_division=0),

        "accuracy": accuracy_score(true_labels, true_predictions),

    }

    

    '''
    for p, l in zip(true_predictions, true_labels):
        pred = list(np.where(np.array(p) == '1')[0])
        lab = list(np.where(np.array(l) == '1')[0])
        pre, rec, f1 = evaluate_indices(lab, pred)
        #acc = 
        if len(np.unique(l)) == 2:
            l_pre += [pre]
            l_rec += [rec]
            l_f1 += [f1]
           # l_acc += [acc]
        else : 
            l_pre += [1]
            l_rec += [1]
            l_f1 += [1]
           # l_acc += [acc]

    return {

        "precision": np.mean(l_pre),

        "recall": np.mean(l_rec),

        "f1": np.mean(l_f1),

        "accuracy": np.mean(l_pre),

    }
    '''
    #print(evaluate_indices)
    results = seqeval.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    return {

        "precision": results["overall_precision"],

        "recall": results["overall_recall"],

        "f1": results["overall_f1"],

        "accuracy": results["overall_accuracy"],

    }

In [59]:
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 2 labels with different weights)
        #loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.05, 3])).to('cuda'))
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([3,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05]).to('cuda'))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [60]:
training_args = TrainingArguments(
    output_dir="./models/distilbert-v0",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,#.to('cuda'),
    args=training_args,
    train_dataset=tokenizeds_train,
    eval_dataset=tokenizeds_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.364378,0.065739,0.091682,0.024132,0.119346
2,No log,2.365086,0.097945,0.092482,0.033825,0.119496
3,No log,2.368558,0.074244,0.092265,0.049805,0.11477
4,No log,2.371766,0.090417,0.091823,0.061858,0.113195
5,No log,2.38,0.089268,0.091264,0.075799,0.108244
6,No log,2.394571,0.087314,0.091522,0.077178,0.106969
7,No log,2.40638,0.086448,0.089712,0.078044,0.104343
8,No log,2.416458,0.087166,0.088845,0.07979,0.101943
9,No log,2.42326,0.08603,0.088477,0.077801,0.102393
10,No log,2.424878,0.086013,0.088309,0.079159,0.101268


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
Checkpoint destination directory ./models/distilbert-v0/checkpoint-29 already exists and is non-empty.Saving will proceed but saved results may be invalid.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
Checkpoint destination directory ./models/distilbert-v0/checkpoint-58 already exists and is non-empty.Saving will proceed but saved results may be invalid.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do

TrainOutput(global_step=290, training_loss=2.3037612388873923, metrics={'train_runtime': 173.3106, 'train_samples_per_second': 6.52, 'train_steps_per_second': 1.673, 'total_flos': 147071696682414.0, 'train_loss': 2.3037612388873923, 'epoch': 10.0})

In [61]:
#import huggingface_hub
#huggingface_hub.login(token='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')
#trainer.push_to_hub("distilber for tok classif")


In [62]:
print(tokenizeds_dev[0].keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [63]:
model = DistilBertForTokenClassification.from_pretrained("./models/distilbert-v0/checkpoint-203")
labels = []
tok_labels = []
for x in tokenizeds_test:
    #print(x['input_ids'])
    inputs = {}
    #print(x.keys())
    inputs['input_ids'] = x['input_ids']
    inputs['attention_mask'] = x['attention_mask']
    with torch.no_grad():
        logits = model(**inputs).logits
    #print(logits)
    predicted_token_class_ids = logits.argmax(-1)
    print(np.unique(predicted_token_class_ids))
    predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
    labels += predicted_token_class_ids.tolist()
    print(labels[-1])
    tok_labels += [x['labels']]



[ 1  2  3  4  5  6  7  8  9 10]
[6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 8, 8, 8, 9, 8, 8, 4, 4, 8, 3, 6, 5, 4, 9, 9, 6, 6, 2, 5, 1, 6, 4, 4, 4, 8, 6, 4, 8, 8, 9, 8, 5, 9, 5, 9, 5, 9, 9, 9, 9, 9, 9, 9, 8, 8, 5, 5, 4, 4, 1, 5, 5, 7, 4, 4, 4, 8, 6, 6, 4, 4, 1, 4, 4, 4, 8, 4, 4, 1, 1, 1, 4, 2, 2, 8, 8, 8, 8, 8, 8, 6, 8, 8, 4, 4, 4, 4, 4, 3, 8, 8, 8, 8, 8, 8, 7, 2, 2, 2, 2, 7, 8, 7, 7, 7, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 9, 8, 4, 4, 8, 7, 8, 8, 8, 8, 7, 7, 8, 7, 8, 9, 6, 7, 7, 7, 6, 4, 9, 8, 7, 6, 3, 3, 7, 9, 7, 9, 7, 7, 7, 7, 7, 9, 1, 8, 8, 8, 1, 2, 4, 8, 7, 7, 7, 7, 7, 7, 9, 8, 6, 8, 9, 8, 8, 8, 8, 8, 8, 7, 8, 7, 3, 8, 6, 6, 6, 6, 6, 4, 9, 6, 9, 1, 6, 6, 4, 9, 9, 9, 9, 9, 6, 6, 10, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 8, 4, 9, 7, 8, 8, 6, 8, 8, 8, 6, 6, 1, 8, 8, 8, 8, 8, 4, 4, 6, 9, 8, 4, 7, 8, 4, 8, 4, 8, 8, 8, 8, 7, 8, 8, 9, 8, 9, 9, 9, 9, 9, 9, 8, 9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 9, 4, 9, 4, 8, 1, 8, 8, 4, 4, 9, 8, 4, 8, 

In [64]:
print(len(tok_labels))
print(len(tokenizeds_test))
print(len(labels))

22
22
22


In [65]:
f_labels = [] 
for l in labels:
    f_labels += [l]
print(f_labels)
classification_report(labels, tok_labels)
#for i,j in zip(tok_labels, labels):
    
    #lab = np.where(j == 1)[0]
    #toks_lab = np.where(i == 1)[0]
    #compute_metrics((j, i))
    #evaluate_indices(lab, toks_lab)

[[6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 8, 8, 8, 9, 8, 8, 4, 4, 8, 3, 6, 5, 4, 9, 9, 6, 6, 2, 5, 1, 6, 4, 4, 4, 8, 6, 4, 8, 8, 9, 8, 5, 9, 5, 9, 5, 9, 9, 9, 9, 9, 9, 9, 8, 8, 5, 5, 4, 4, 1, 5, 5, 7, 4, 4, 4, 8, 6, 6, 4, 4, 1, 4, 4, 4, 8, 4, 4, 1, 1, 1, 4, 2, 2, 8, 8, 8, 8, 8, 8, 6, 8, 8, 4, 4, 4, 4, 4, 3, 8, 8, 8, 8, 8, 8, 7, 2, 2, 2, 2, 7, 8, 7, 7, 7, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 9, 8, 4, 4, 8, 7, 8, 8, 8, 8, 7, 7, 8, 7, 8, 9, 6, 7, 7, 7, 6, 4, 9, 8, 7, 6, 3, 3, 7, 9, 7, 9, 7, 7, 7, 7, 7, 9, 1, 8, 8, 8, 1, 2, 4, 8, 7, 7, 7, 7, 7, 7, 9, 8, 6, 8, 9, 8, 8, 8, 8, 8, 8, 7, 8, 7, 3, 8, 6, 6, 6, 6, 6, 4, 9, 6, 9, 1, 6, 6, 4, 9, 9, 9, 9, 9, 6, 6, 10, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 8, 4, 9, 7, 8, 8, 6, 8, 8, 8, 6, 6, 1, 8, 8, 8, 8, 8, 4, 4, 6, 9, 8, 4, 7, 8, 4, 8, 4, 8, 8, 8, 8, 7, 8, 8, 9, 8, 9, 9, 9, 9, 9, 9, 8, 9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 9, 4, 9, 4, 8, 1, 8, 8, 4, 4, 9, 8, 4, 8, 4, 2, 2, 4, 8, 2, 6, 6, 8, 1, 4

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.