In [3]:
import jsonlines
import random
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
import jsonlines
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example  # Import the Example class

In [4]:
# Path to the original training file
train_file_path = r'/content/FindVehicle_train.jsonl'
with jsonlines.open(train_file_path) as reader:
    records = list(reader)

# Shuffle with a dedicated, seeded generator so that Restart & Run All always
# produces the same train/dev partition; an unseeded shuffle makes the scores
# reported further down impossible to reproduce or compare between runs.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(records)

# The first 80% of the shuffled records are used for training,
# the remaining 20% for validation.
split_index = int(0.8 * len(records))
train_records = records[:split_index]
dev_records = records[split_index:]

# Save the training and validation datasets
with jsonlines.open(r"findvehicle_train_split.jsonl", mode="w") as writer:
    writer.write_all(train_records)

with jsonlines.open(r"findvehicle_dev_split.jsonl", mode="w") as writer:
    writer.write_all(dev_records)

In [7]:
def spacy_vali_tra(dev_file_path, name):
    """Convert a JSONL annotation file into a serialized spaCy ``DocBin``.

    Every record must provide the raw text under ``"data"`` and a list of
    annotations under ``"ner_label"``; each annotation is indexed as
    ``[label, start_char, end_char]``.

    Args:
        dev_file_path: Path of the JSONL file to convert (despite the
            parameter name, any split can be passed).
        name: Stem of the output file; the result is written to
            ``f"{name}.spacy"`` in the current working directory.

    Returns:
        The path of the ``.spacy`` file that was written, so callers can keep
        a usable reference instead of ``None``.
    """
    # A blank English pipeline is enough: only the tokenizer is needed here.
    nlp = spacy.blank("en")
    doc_bin = DocBin()

    with jsonlines.open(dev_file_path) as records:
        for record in records:
            # make_doc only tokenizes, which is all gold-data creation requires.
            doc = nlp.make_doc(record["data"])
            entities = []
            for ner in record["ner_label"]:
                label, start_char, end_char = ner[0], ner[1], ner[2]
                # char_span yields no usable span when the character offsets
                # do not line up with token boundaries.
                span = doc.char_span(start_char, end_char, label=label)
                if span:
                    entities.append(span)
                else:
                    # The annotation is dropped, but reported so it can be inspected.
                    print(f"Misaligned entity: {ner}")

            # doc.ents cannot hold overlapping spans, so resolve overlaps first.
            doc.ents = filter_spans(entities)
            doc_bin.add(doc)

    output_path = f"{name}.spacy"
    doc_bin.to_disk(output_path)
    return output_path

# Convert both splits, dev first and then train; each call writes
# "<name>.spacy" into the current working directory.
dev, train = (
    spacy_vali_tra(split_path, output_stem)
    for split_path, output_stem in (
        (r"findvehicle_dev_split.jsonl", 'dev01'),
        (r"findvehicle_train_split.jsonl", 'train01'),
    )
)

In [10]:
# Generate a config.cfg for an English ("en") pipeline with "ner" as the only requested component
!python -m spacy init config config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
# Train from config.cfg using train01.spacy for training and dev01.spacy for validation; results are saved under ./output
!python -m spacy train config.cfg --output ./output --paths.train ./train01.spacy --paths.dev ./dev01.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     57.87    0.00    0.00    0.00    0.00
  0     200        850.38   4388.68   35.33   43.79   29.60    0.35
  0     400        270.51   3618.66   26.58   48.18   18.35    0.27
  0     600        200.89   3923.73   46.48   51.15   42.60    0.46
  0     800        249.69   4791.18   41.85   51.85   35.08    0.42
  0    1000        585.90   5809.85   38.46   50.41   31.09    0.38
  0    1200        307.77   6994.49   38.37   51.80   30.47    0.38
  0    1400        580.38   8577.39   35.76   51.89   27.28    0.36
  0    1600        396.82  10206.67   40.48   51.45 

In [25]:

# Load the pipeline stored under ./output/model-best
model_path = "./output/model-best"
nlp = spacy.load(model_path)

# Hand-written sentences for a quick qualitative look at the predictions
test_sentences = [
    "I am looking for a red Tesla Model S., hyundai",
    "The blue Audi Q7 and the black BMW X5 are parked outside.",
    "I saw a silver Toyota Camry and a white Ford Mustang.",
    "In the parking lot, there is a yellow Chevrolet Trailblazer and a gray Nissan Rogue.",
    "Suzuki",
]

for sentence in test_sentences:
    doc = nlp(sentence)
    print(f"Input: {sentence}")
    # Guard first: report explicitly when the model found nothing
    if not doc.ents:
        print("  No entities detected.")
    # Iterating an empty doc.ents is a no-op, so no else-branch is needed
    for ent in doc.ents:
        print(f"  Entity: {ent.text} → Label: {ent.label_}")
    print()

Input: I am looking for a red Tesla Model S., hyundai
  Entity: red → Label: vehicle_color
  Entity: Tesla → Label: vehicle_brand
  Entity: Model S., hyundai → Label: vehicle_model

Input: The blue Audi Q7 and the black BMW X5 are parked outside.
  Entity: blue → Label: vehicle_color
  Entity: Audi → Label: vehicle_brand
  Entity: Q7 → Label: vehicle_model
  Entity: black → Label: vehicle_color
  Entity: BMW → Label: vehicle_brand
  Entity: X5 → Label: vehicle_model
  Entity: parked outside → Label: vehicle_color

Input: I saw a silver Toyota Camry and a white Ford Mustang.
  Entity: silver → Label: vehicle_color
  Entity: Toyota → Label: vehicle_brand
  Entity: Camry → Label: vehicle_model
  Entity: white → Label: vehicle_color
  Entity: Ford → Label: vehicle_brand

Input: In the parking lot, there is a yellow Chevrolet Trailblazer and a gray Nissan Rogue.
  Entity: yellow → Label: vehicle_color
  Entity: Chevrolet → Label: vehicle_brand
  Entity: Trailblazer → Label: vehicle_model
  

In [26]:
# Path to the test dataset
test_file_path = r'/content/FindVehicle_test.jsonl'

# Reuse the JSONL -> DocBin conversion already defined for the train/dev splits
# instead of keeping a second copy of the same loop. The helper reports
# misaligned annotations the same way and writes "test.spacy" to disk.
# NOTE(review): the helper tokenizes with spacy.blank("en") rather than the
# loaded pipeline; this assumes the trained pipeline kept the default English
# tokenizer - confirm if the config's tokenizer is ever customized.
spacy_vali_tra(test_file_path, "test")

# Read the serialized docs back so the evaluation cell can iterate over them
test_doc_bin = DocBin().from_disk("test.spacy")

In [27]:
# Score the loaded pipeline against the gold-annotated test documents
test_docs = list(test_doc_bin.get_docs(nlp.vocab))
scorer = Scorer()

# Each Example pairs a fresh prediction on the raw text (first argument)
# with the gold document it was derived from (second argument)
examples = [Example(nlp(gold_doc.text), gold_doc) for gold_doc in test_docs]

# Evaluate using the Scorer and capture the results
results = scorer.score(examples)

In [28]:
# Report the overall entity metrics first, then the per-label breakdown
for heading, metric_key in (
    ("Precision:", "ents_p"),
    ("Recall:", "ents_r"),
    ("F1-score:", "ents_f"),
):
    print(heading, results[metric_key])

print()
print("Per-entity scores:", results["ents_per_type"])

Precision: 0.5116991964916836
Recall: 0.7289288466478294
F1-score: 0.6012959439243624

Per-entity scores: {'vehicle_color': {'p': 0.5212104386451971, 'r': 0.9981391886862672, 'f': 0.6848200769665688}, 'vehicle_type': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'vehicle_location': {'p': 0.5107361963190185, 'r': 0.9975659988766149, 'f': 0.6755848602041463}, 'vehicle_orientation': {'p': 0.5342825406442492, 'r': 0.8809523809523809, 'f': 0.6651580866176378}, 'vehicle_velocity': {'p': 0.5148075534000619, 'r': 0.6252663241007645, 'f': 0.5646859083191851}, 'vehicle_range': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'vehicle_brand': {'p': 0.49818945538818077, 'r': 0.7874763894453666, 'f': 0.610286778894138}, 'vehicle_model': {'p': 0.49542351846106114, 'r': 0.7311544845744377, 'f': 0.5906369205872153}, 'vehicle_type-suv': {'p': 0.3696969696969697, 'r': 0.024053627760252366, 'f': 0.04516845612736023}, 'vehicle_type-sedan': {'p': 0.40350877192982454, 'r': 0.012554585152838428, 'f': 0.02435150873478031}, 'vehicle_type-ha

In [29]:
# Evaluate the pipeline saved at ./output/model-best on test.spacy using spaCy's command-line evaluator
!python -m spacy evaluate ./output/model-best ./test.spacy

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   51.17 
NER R   72.89 
NER F   60.13 
SPEED   28676 

[1m

                                P        R        F
vehicle_color               52.12    99.81    68.48
vehicle_type               100.00   100.00   100.00
vehicle_location            51.07    99.76    67.56
vehicle_orientation         53.43    88.10    66.52
vehicle_velocity            51.48    62.53    56.47
vehicle_range              100.00   100.00   100.00
vehicle_brand               49.82    78.75    61.03
vehicle_model               49.54    73.12    59.06
vehicle_type-suv            36.97     2.41     4.52
vehicle_type-sedan          40.35     1.26     2.44
vehicle_type-hatchback      41.38     5.63     9.91
vehicle_type-sports_car     43.75     0.45     0.88
vehicle_type-coupe          43.22    29.49    35.06
vehicle_type-bus            37.16    12.33    18.52
vehicle_type-vintage_car     0.00     0.00   