In [None]:
from google.colab import drive
!pip install -q accelerate -U
!pip install -q transformers[torch]
!pip install -q evaluate

# Mount Google Drive
drive.mount('/content/drive')
!huggingface-cli login --token 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

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import json

def merge_consecutive_entities(data):
    """Merge runs of neighbouring entities that share a label and touch each other.

    Entities are visited in list order. The entity currently being built is
    merged with the next one when both have the same "label" and their spans
    are adjacent: they share a boundary, or are separated by exactly one
    character, in either direction.

    Args:
        data: list of entity dicts, each with "label", "start_offset" and
            "end_offset" keys. The list and its dicts are not modified.

    Returns:
        A new list of entity dicts (shallow copies of the inputs). A merged
        entity keeps the other keys of the first entity of its run, and its
        offsets span the whole run. An empty input yields an empty list.
    """
    if not data:
        return []

    merged_entities = []
    # Work on copies so that widening a span never rewrites the caller's data.
    current_entity = dict(data[0])

    for next_entity in data[1:]:
        same_label = current_entity["label"] == next_entity["label"]
        touching = (
            # next starts exactly where current ends / ends where current starts
            current_entity["end_offset"] == next_entity["start_offset"]
            or current_entity["start_offset"] == next_entity["end_offset"]
            # one-character gap, next after current
            or current_entity["end_offset"] + 1 == next_entity["start_offset"]
            # one-character gap, next before current (mirror of the case above;
            # the previous `start + 1 == next end` test matched an overlap instead)
            or next_entity["end_offset"] + 1 == current_entity["start_offset"]
        )

        if same_label and touching:
            current_entity["start_offset"] = min(
                current_entity["start_offset"], next_entity["start_offset"]
            )
            current_entity["end_offset"] = max(
                current_entity["end_offset"], next_entity["end_offset"]
            )
        else:
            merged_entities.append(current_entity)
            current_entity = dict(next_entity)

    merged_entities.append(current_entity)
    return merged_entities


input_file_path = "/content/drive/MyDrive/a.jsonl"

# One dict per JSONL record that carries at least one entity annotation.
merged_data = []

# Entity offsets are later used as string indices into "text", so decode the
# file explicitly as UTF-8 instead of relying on the platform default.
with open(input_file_path, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        #data.pop("relations", None)  # Remove "relations" key if present
        #data.pop("Comments", None)
        entities = data.get("entities") or []
        if len(entities) == 0:
            # Records without annotations contribute nothing to training.
            continue
        # Sorts the record's own list in place, ordered by annotation id.
        entities.sort(key=lambda x: x["id"])
        #data["entities"]= merge_consecutive_entities(data["entities"])
        merged_data.append(data)


In [None]:
# Every distinct entity label that occurs in the loaded records.
unique_labels = set(entity["label"] for sample in merged_data for entity in sample["entities"])

# Enumerate the labels in sorted order: iterating the set directly follows hash
# order, which can differ from one kernel session to the next and would make
# the label ids irreproducible.
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}

# Annotate every entity with its id. These ids start at 0 and are not the ids
# returned by tokenize_and_add_labels, which reserves 0 for "[PAD]" and whose
# result later rebinds `label2id`.
for sample in merged_data:
    for entity in sample["entities"]:
        entity["label_id"] = label2id.get(entity["label"], -1)  # -1 for unknown labels



In [None]:
from transformers import BertTokenizer

def tokenize_and_add_labels(data, tokenizer_name="bert-base-uncased", max_length=512, max_chars=510):
    """Tokenize annotated samples and build one label id per token position.

    Args:
        data: sequence of sample dicts with a "text" string and an "entities"
            list; each entity has "label", "start_offset" and "end_offset".
            Offsets are character indices into "text"; "end_offset" is treated
            as exclusive (assumption based on how the spans are sliced here --
            TODO confirm against the annotation tool's export format).
            `data` is iterated twice, so it must not be a one-shot iterator.
        tokenizer_name: checkpoint passed to BertTokenizer.from_pretrained.
        max_length: padded/truncated length of every encoded sample.
        max_chars: the text is cut to this many characters before tokenizing;
            None disables the cut and leaves truncation to the tokenizer.
            NOTE(review): the default of 510 counts characters, not tokens, so
            most of the `max_length` positions end up as padding.

    Returns:
        (tokenized_data, label2id). tokenized_data is a list of dicts with
        "input_ids", "labels" and "attention_mask", each a list of
        `max_length` ints. label2id maps label names to ids, with 0 reserved
        for "[PAD]". Every position not covered by an entity -- including
        [CLS], [SEP] and padding -- keeps label 0.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    unique_labels = set(entity["label"] for sample in data for entity in sample.get("entities", []))

    # Sort before numbering: plain set iteration order can change between
    # sessions, which would silently change the meaning of every label id.
    label2id = {label: idx + 1 for idx, label in enumerate(sorted(unique_labels))}
    label2id["[PAD]"] = 0
    tokenized_data = []

    for sample in data:
        text = sample.get("text", "")
        entities = sample.get("entities", [])
        if max_chars is not None:
            text = text[:max_chars]

        # Without return_tensors the tokenizer returns plain Python lists for a
        # single string, which is what is stored below.
        tokenized_inputs = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        input_ids = tokenized_inputs["input_ids"]
        attention_mask = tokenized_inputs["attention_mask"]
        labels = [0] * len(input_ids)

        # Layout is [CLS] tokens... [SEP] [PAD]...; the last attended position is [SEP].
        sep_index = sum(attention_mask) - 1

        for entity in entities:
            label_id = label2id.get(entity.get("label", ""), 0)

            # Keep the span inside the (possibly shortened) text.
            start_offset = max(0, entity.get("start_offset", 0))
            end_offset = min(entity.get("end_offset", 0), len(text))
            if start_offset >= end_offset:
                # Empty span, or an entity lying entirely beyond the cut text:
                # there is nothing to label (previously this mislabelled the
                # last token of the sample).
                continue

            # Count the word pieces in the text up to each boundary. The +1
            # accounts for [CLS] at position 0. An entity starting at offset 0
            # has an empty prefix and maps to position 1 (previously such
            # entities were skipped altogether).
            first = 1 + len(tokenizer.encode(text[:start_offset], add_special_tokens=False))
            stop = 1 + len(tokenizer.encode(text[:end_offset], add_special_tokens=False))

            # Never write onto [SEP]/padding and never grow the list beyond
            # max_length (a slice assignment past the end would extend it).
            stop = min(stop, sep_index)
            for position in range(first, stop):
                labels[position] = label_id

        tokenized_data.append({
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask
        })

    return tokenized_data, label2id


# Encode every loaded record and obtain the label-name -> id mapping.
tokenized_data, label2id = tokenize_and_add_labels(merged_data)

# Preview only the first encoded sample as a quick sanity check.
for sample in tokenized_data[:1]:
    for heading, field in (
        ("Input IDs:", "input_ids"),
        ("Labels:", "labels"),
        ("Attention Mask:", "attention_mask"),
    ):
        print(heading, sample[field])
    print()

# Reverse lookup: label id -> label name.
id2label = dict((idx, name) for name, idx in label2id.items())


Input IDs: [101, 8327, 2184, 1012, 2410, 4101, 6957, 3820, 8145, 7028, 8474, 2177, 1010, 4297, 1012, 1006, 1000, 10507, 5856, 1000, 1007, 1998, 20369, 2969, 2326, 6627, 1010, 4297, 1012, 1006, 1000, 26189, 4757, 1000, 1007, 1010, 1006, 1996, 1000, 4243, 1000, 2030, 1000, 4101, 6957, 2869, 1000, 2065, 3615, 2000, 13643, 1010, 2030, 1996, 1000, 2283, 1000, 2030, 4101, 6957, 2099, 1000, 2065, 3615, 2000, 13048, 2135, 1007, 1010, 2011, 2023, 3820, 5482, 3209, 2004, 2449, 9228, 1010, 1998, 2025, 2004, 5826, 1010, 1999, 1996, 4195, 1997, 1037, 4101, 6957, 1006, 1996, 1000, 4101, 6957, 1000, 1007, 1010, 2005, 1996, 3800, 1997, 11973, 3227, 1999, 1996, 2449, 3024, 2005, 2011, 3408, 1998, 8910, 1997, 2023, 3820, 1012, 1015, 1012, 1050, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Every label id that actually occurs in the encoded samples.
unique_labels = {label for sample in tokenized_data for label in sample["labels"]}
# Inverse of label2id: id -> label name.
id2label = dict(zip(label2id.values(), label2id.keys()))



In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
# Tokenizer and token-classification model are both loaded from the same checkpoint.
contracts_checkpoint = "nlpaueb/bert-base-uncased-contracts"
tokenizer = AutoTokenizer.from_pretrained(contracts_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(contracts_checkpoint)
# nlp = pipeline("ner", model=model, tokenizer=tokenizer)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/bert-base-uncased-contracts and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch
# Split the data into training and validation sets.
# A fixed seed makes the split (and therefore the reported validation metrics)
# reproducible across kernel restarts.
SPLIT_SEED = 42
train_data, valid_data = train_test_split(
    tokenized_data, test_size=0.2, random_state=SPLIT_SEED
)

# Number of classes, including the reserved "[PAD]" id.
num_labels = len(label2id)

# NOTE: the Trainer call later in this notebook is given train_data/valid_data
# directly; these loaders are kept for manual inspection or custom loops.
train_loader = DataLoader(train_data, batch_size=4, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=4, shuffle=False)


In [None]:
# Sanity check: print the label-sequence length of any sample that is not 512 long.
unexpected_lengths = [
    len(encoded["labels"])
    for encoded in tokenized_data
    if len(encoded["labels"]) != 512
]
for length in unexpected_lengths:
    print(length)

In [None]:
# !pip install -q evaluate
# !pip install -q accelerate -U
# !pip install -q transformers[torch]

In [None]:
model_name = 'nlpaueb/bert-base-uncased-contracts'

from transformers import AutoModelForTokenClassification

# Size the classification head from the label map itself. A hard-coded count
# goes stale as soon as the annotation set gains or loses a label, and then
# disagrees with the id2label/label2id maps passed alongside it.
# ignore_mismatched_sizes lets the freshly sized classifier replace any head
# stored in the checkpoint.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpaueb/bert-base-uncased-contracts and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class over every position.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    arg-max over the last axis of the logits. Predictions and labels are then
    flattened to 1-D, so all positions of all sequences are scored together --
    no position is masked out.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids.flatten(),
        references=references.flatten(),
    )


In [None]:
from torch.optim import AdamW

# Samples per optimisation step. This name was used without ever being defined,
# so the cell raised NameError on a fresh kernel.
# NOTE(review): 8 is assumed because the TrainingArguments in this notebook do
# not override the per-device batch size and the recorded training log shows
# roughly 8 samples per step -- adjust if the batch size is changed.
BATCH_SIZES = 8

learning_rate = 0.0000005
# Peak learning rate, scaled linearly with the batch size.
lr_max = learning_rate * BATCH_SIZES
weight_decay = 0.05

# AdamW over all model parameters (weight decay is applied uniformly).
optimizer = AdamW(
    model.parameters(),
    lr=lr_max,
    weight_decay=weight_decay)

In [None]:
import math

from transformers import get_cosine_schedule_with_warmup

# Number of passes over the training set. This name was used without being
# defined; 10 mirrors num_train_epochs in this notebook's TrainingArguments.
# Keep the two in sync.
EPOCHS = 10

num_train_samples = len(train_data)
warmup_ratio = 0.2 # Percentage of total steps to go from zero to max learning rate
# Number of cosine waves over the schedule.
# NOTE(review): with a value above 0.5 the learning rate reaches zero before
# the last step and then rises again -- confirm that this is intended.
num_cycles=0.8

# The scheduler advances once per optimizer step, so count whole batches per
# epoch (the last, partial batch still costs a step) and keep the counts
# integral. BATCH_SIZES must already be defined (the optimizer cell needs it too).
num_training_steps = math.ceil(num_train_samples / BATCH_SIZES) * EPOCHS
num_warmup_steps = int(num_training_steps * warmup_ratio)

lr_sched = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                           num_warmup_steps=num_warmup_steps,
                                           num_training_steps = num_training_steps,
                                           num_cycles=num_cycles)

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluate and checkpoint once per epoch; reload the best checkpoint when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version, since the install
# cell does not pin one.
# NOTE(review): learning_rate / weight_decay below are presumably not used to
# build an optimizer when one is supplied to the Trainer explicitly -- confirm.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    load_best_model_at_end=True,
)

# Trainer takes a custom optimizer through `optimizers`, an
# (optimizer, lr_scheduler) pair; it has no `optimizer=` keyword. Passing the
# pair is what puts the AdamW instance and the cosine schedule defined in the
# preceding cells to use.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_sched),
)

In [None]:
# Drop the notebook-level name for the full encoded list. The Trainer was given
# train_data / valid_data, which the split cell derived from this list.
# After this line, earlier cells that read `tokenized_data` cannot be re-run
# without first re-running the tokenization cell.
del tokenized_data

In [None]:
# Start fine-tuning with the Trainer configured above. The training arguments
# request evaluation and a checkpoint at the end of every epoch.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.123323,0.97463
2,No log,0.105203,0.977668
3,0.184300,0.109177,0.978665
4,0.184300,0.102771,0.97734
5,0.069200,0.110868,0.977277
6,0.069200,0.120152,0.97618
7,0.035200,0.118066,0.979276
8,0.035200,0.117799,0.977826
9,0.021000,0.12435,0.977863
10,0.021000,0.124291,0.978644


TrainOutput(global_step=2350, training_loss=0.06835059754391934, metrics={'train_runtime': 2194.5996, 'train_samples_per_second': 8.562, 'train_steps_per_second': 1.071, 'total_flos': 4910831392542720.0, 'train_loss': 0.06835059754391934, 'epoch': 10.0})

In [None]:
# Write the fine-tuned model to a local directory, once through the Trainer
# helper and once through the model's own save_pretrained, both to the same path.
export_dir = "Lber2_full_data"
trainer.save_model(export_dir)
trainer.model.save_pretrained(export_dir)



In [None]:
import gc
import os

# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

# Force a garbage-collection pass (memory may not be released immediately);
# as the last expression of the cell, its return value is displayed.
gc.collect()


132

In [None]:
!huggingface-cli login --token=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
new_model ="Lber2_full_data"
model.push_to_hub(new_model, max_shard_size='2GB')
tokenizer.push_to_hub(new_model)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Cognitus-Stuti/CommercialLBERT/commit/fe68e781979c41c4114fe37893ca9470038c5ca8', commit_message='Upload tokenizer', commit_description='', oid='fe68e781979c41c4114fe37893ca9470038c5ca8', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the Trainer's model to the Hugging Face Hub.
# NOTE(review): the argument looks like a repository id, but the first
# positional parameter of Trainer.push_to_hub appears to be the commit message;
# the target repository would presumably come from the training arguments
# (hub_model_id, or the output_dir name) -- confirm against the installed
# transformers version before relying on where this lands.
trainer.push_to_hub("LambdaX-AI/Lber2_full_data")
