In [2]:
import jsonlines
from collections import Counter

train_file_path = r"C:\Users\akilarasan.p\Downloads\DATASET\NLP-data\FindVehicle_train.jsonl"

# Read the whole JSONL file into memory, one dict per line.
with jsonlines.open(train_file_path) as reader:
    records = list(reader)

# Tally how often each entity type occurs. The first element of every
# `ner_label` entry is the label type; records without the key add nothing.
label_counter = Counter()
for record in records:
    label_counter.update(annotation[0] for annotation in record.get('ner_label', []))

# Report the counts in first-seen order.
print("NER label counts:")
for label_type in label_counter:
    print(f"{label_type}: {label_counter[label_type]}")

NER label counts:
vehicle_type: 1521
vehicle_location: 11394
vehicle_color: 19312
vehicle_range: 188
vehicle_orientation: 8286
vehicle_velocity: 8141
vehicle_brand: 17872
vehicle_model: 17872
vehicle_type-suv: 2564
vehicle_type-sedan: 3909
vehicle_type-hatchback: 1615
vehicle_type-sports_car: 1646
vehicle_type-coupe: 945
vehicle_type-bus: 462
vehicle_type-vintage_car: 1412
vehicle_type-motorcycle: 2592
vehicle_type-truck: 859
vehicle_type-roadster: 373
vehicle_type-van: 556
vehicle_type-mpv: 608
vehicle_type-estate_car: 521


In [15]:
import json
from transformers import BertTokenizerFast
from datasets import Dataset
from pathlib import Path

# === Setup ===
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
input_file = r"C:\Users\akilarasan.p\Downloads\DATASET\NLP-data\FindVehicle_train.jsonl"
output_file = r"C:\Users\akilarasan.p\Downloads\DATASET\NLP-data\bio_tagged_dataset.json"

# === Collect unique labels for tag mapping ===
# One pass over the JSONL file; the first element of each `ner_label`
# entry is the entity type.
with open(input_file, "r", encoding="utf-8") as f:
    all_labels = {
        annotation[0]
        for raw_line in f
        for annotation in json.loads(raw_line)["ner_label"]
    }

# Build the BIO tag inventory: "O" first, then a B-/I- pair per entity type
# in alphabetical order, so the tag ids are stable between runs.
labels_list = sorted(all_labels)
bio_labels = ["O"]
for entity_type in labels_list:
    bio_labels.extend((f"B-{entity_type}", f"I-{entity_type}"))

label2id = {tag: index for index, tag in enumerate(bio_labels)}
id2label = dict(enumerate(bio_labels))

# === Conversion ===
def convert_sample(sample):
    """Convert one annotated sample into tokens and BIO tag ids.

    Args:
        sample: dict with the raw text under ``"data"`` and the annotations
            under ``"ner_label"``. Each annotation is indexed as
            ``[label, start_char, end_char, ...]``; the span is treated as
            half-open, ``[start_char, end_char)``.

    Returns:
        dict with ``"tokens"`` (the tokenizer's tokens for the text) and
        ``"ner_tags"`` (one id from ``label2id`` per token).

    Raises:
        KeyError: if a produced tag is missing from ``label2id``.

    Only tokens lying entirely inside an entity span are tagged. The first
    such token opens the entity with ``B-`` even when its start offset does
    not coincide with ``start_char`` (e.g. the annotated span begins on
    whitespace), so an entity can never consist of ``I-`` tags only. When
    spans overlap, the annotation listed first keeps its tags.
    """
    text = sample["data"]
    entities = sample["ner_label"]

    # Tokenize with offset mapping so tokens can be matched to character spans.
    encoding = tokenizer(text, return_offsets_mapping=True, truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    offsets = encoding["offset_mapping"]

    tags = ["O"] * len(tokens)

    for entity in entities:
        label, start_char, end_char = entity[0], entity[1], entity[2]
        opens_entity = True  # the next token found inside the span is its first

        for i, (token_start, token_end) in enumerate(offsets):
            if token_start == token_end:
                # Zero-width offsets cover no text (presumably special tokens
                # such as [CLS]/[SEP]); they never belong to an entity.
                continue
            if token_start >= end_char:
                break  # offsets are in text order: nothing further can match
            if token_end <= start_char:
                continue  # token ends before the entity begins

            if token_start >= start_char and token_end <= end_char:
                if tags[i] == "O":
                    tags[i] = f"B-{label}" if opens_entity else f"I-{label}"
                # Consumed even if the token was already tagged by an earlier
                # annotation, so aligned spans are labelled exactly as before.
                opens_entity = False

    tag_ids = [label2id[tag] for tag in tags]
    return {
        "tokens": tokens,
        "ner_tags": tag_ids
    }

# === Process All Samples ===
# Convert every JSONL line into a {"tokens", "ner_tags"} record.
with open(input_file, "r", encoding="utf-8") as source:
    converted = [convert_sample(json.loads(raw_line)) for raw_line in source]

# === Save to File (optional) ===
with open(output_file, "w", encoding="utf-8") as sink:
    json.dump(converted, sink, indent=2)

# === Optional: Convert to Hugging Face Dataset ===
dataset = Dataset.from_list(converted)
dataset.save_to_disk("bio_tagged_hf_dataset")

# Summary of what was produced.
print(f"✅ Done. Total samples: {len(converted)}")
print(f"Labels: {bio_labels}")

Saving the dataset (0/1 shards):   0%|          | 0/21565 [00:00<?, ? examples/s]

✅ Done. Total samples: 21565
Labels: ['O', 'B-vehicle_brand', 'I-vehicle_brand', 'B-vehicle_color', 'I-vehicle_color', 'B-vehicle_location', 'I-vehicle_location', 'B-vehicle_model', 'I-vehicle_model', 'B-vehicle_orientation', 'I-vehicle_orientation', 'B-vehicle_range', 'I-vehicle_range', 'B-vehicle_type', 'I-vehicle_type', 'B-vehicle_type-bus', 'I-vehicle_type-bus', 'B-vehicle_type-coupe', 'I-vehicle_type-coupe', 'B-vehicle_type-estate_car', 'I-vehicle_type-estate_car', 'B-vehicle_type-hatchback', 'I-vehicle_type-hatchback', 'B-vehicle_type-motorcycle', 'I-vehicle_type-motorcycle', 'B-vehicle_type-mpv', 'I-vehicle_type-mpv', 'B-vehicle_type-roadster', 'I-vehicle_type-roadster', 'B-vehicle_type-sedan', 'I-vehicle_type-sedan', 'B-vehicle_type-sports_car', 'I-vehicle_type-sports_car', 'B-vehicle_type-suv', 'I-vehicle_type-suv', 'B-vehicle_type-truck', 'I-vehicle_type-truck', 'B-vehicle_type-van', 'I-vehicle_type-van', 'B-vehicle_type-vintage_car', 'I-vehicle_type-vintage_car', 'B-vehicle_

In [16]:
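# Prerequisite for the training cell below (it imports seqeval); run once in the kernel's environment:
# %pip install transformers datasets seqeval scikit-learn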

In [17]:
import json

import numpy as np
import torch
from datasets import ClassLabel, Dataset, load_dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# === Load Data ===
# Read the records from the path the conversion cell wrote them to
# (`output_file`), not from a bare filename resolved against the working
# directory.
with open(output_file, "r", encoding="utf-8") as f:
    data = json.load(f)

dataset = Dataset.from_list(data)

# Use the full BIO inventory built by the conversion cell (`bio_labels`),
# where a tag's id is its index in that list. Deriving the label space from
# the ids that happen to occur would break `num_labels` whenever a tag is
# unused, and placeholder names like "LABEL_3" cannot be parsed as B-/I-
# tags by seqeval.
label_list = list(range(len(bio_labels)))
id2label = dict(enumerate(bio_labels))
label2id = {name: idx for idx, name in id2label.items()}

# === Tokenizer & Model ===
model_checkpoint = "bert-base-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)
model = BertForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# === Compute Class Weights ===
all_labels = [tag for row in data for tag in row["ner_tags"]]
observed_ids = np.unique(all_labels)
if observed_ids.size and (observed_ids.min() < 0 or observed_ids.max() >= len(label_list)):
    raise ValueError("ner_tags contains ids outside the BIO label inventory")

# 'balanced' weights can only be computed for classes that occur in the data;
# tags that never occur keep a neutral weight of 1.0 so the weight vector
# still has exactly one entry per model output.
weights = np.ones(len(label_list), dtype=np.float64)
if observed_ids.size:
    weights[observed_ids] = compute_class_weight('balanced', classes=observed_ids, y=all_labels)
class_weights = torch.tensor(weights, dtype=torch.float)

# Inject class weights into loss
def compute_loss_with_weights(model, inputs, return_outputs=False):
    """Class-weighted cross-entropy over every token position.

    Args:
        model: token-classification model; called as ``model(**inputs)`` and
            expected to expose ``num_labels`` and return an object with
            ``.logits``.
        inputs: batch mapping that includes the target ids under ``"labels"``.
        return_outputs: when true, also return the raw model outputs.

    Returns:
        ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
    """
    targets = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.logits

    # Weights live on the CPU; move them next to the logits before use.
    weighted_ce = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
    # Flatten the logits to (positions, classes) and the targets to (positions,).
    loss = weighted_ce(logits.view(-1, model.num_labels), targets.view(-1))

    if return_outputs:
        return loss, outputs
    return loss

# NOTE(review): this only sets an attribute on the model object. Trainer
# appears to call its own `compute_loss` method rather than the model's —
# confirm that this assignment actually changes the training loss.
model.compute_loss = compute_loss_with_weights

# === Tokenize Inputs (they’re already tokenized) ===
def tokenize_and_align_labels(example):
    """Turn a pre-tokenized record into model inputs with aligned labels.

    Args:
        example: record with ``"tokens"`` (token strings) and ``"ner_tags"``
            (one tag id per token).

    Returns:
        dict with ``"input_ids"``, an all-ones ``"attention_mask"`` (records
        are unpadded at this point) and ``"labels"``. Without a ``"labels"``
        column the model receives no targets and cannot produce a loss.
    """
    input_ids = tokenizer.convert_tokens_to_ids(example["tokens"])

    # [CLS]/[SEP]/[PAD] are not real words: label them -100, the value that
    # torch's CrossEntropyLoss ignores by default and that compute_metrics
    # filters out. [UNK] is deliberately kept, since it stands for a real word.
    ignored_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id}
    labels = [
        -100 if token_id in ignored_ids else tag
        for token_id, tag in zip(input_ids, example["ner_tags"])
    ]

    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }

dataset = dataset.map(tokenize_and_align_labels)

# === Train/Test Split ===
# Hold out 20% of the samples for evaluation. The seed is fixed so the same
# records land in each split on every run and scores stay comparable.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# === Metrics ===
def compute_metrics(p):
    """Entity-level precision, recall and F1 for one evaluation pass.

    Args:
        p: pair ``(predictions, labels)``; the predictions are arg-maxed over
            axis 2 to obtain one label id per token position.

    Returns:
        dict with ``"precision"``, ``"recall"`` and ``"f1"`` from seqeval.

    Positions whose gold label is -100 are dropped from both sequences; ids
    missing from ``id2label`` are rendered as ``"O"``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    def _name(label_id):
        return id2label.get(label_id, "O")

    # Gold tag sequences, with ignored positions removed.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([_name(gold) for gold in gold_row if gold != -100])

    # Predicted tag sequences, restricted to the same positions.
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [_name(pred) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append(kept)

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

# === Training Arguments ===
args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./ner_model",
    logging_dir="./logs",
    # Optimisation hyper-parameters.
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation and checkpointing both use the "epoch" schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Log every 10 steps; no external reporting integration.
    logging_steps=10,
    report_to="none",
)

# === Trainer ===
class WeightedLossTrainer(Trainer):
    """Trainer that optimises the class-weighted loss.

    Trainer computes the loss through its own ``compute_loss`` method, so the
    weighted loss has to be supplied by overriding that method; an attribute
    set on the model is not consulted.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs any extra keyword arguments a Trainer version may
        # pass; the weighted loss does not use them.
        return compute_loss_with_weights(model, inputs, return_outputs=return_outputs)


trainer = WeightedLossTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Sequences differ in length: this collator pads the inputs and the
    # per-token labels together, so every batch is rectangular.
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

# === Train ===
trainer.train()

# === Save Model ===
# Model and tokenizer go to the same directory so it can be reloaded as a unit.
trainer.save_model("ner_model")
tokenizer.save_pretrained("ner_model")





ModuleNotFoundError: No module named 'seqeval'