In [None]:
# Download the pre-trained GloVe 6B word-vector archive (glove.6B.zip).
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# Unpack the archive; a later cell expects glove.6B.200d.txt to be among the extracted files.
!unzip glove.6B.zip

In [1]:
import json
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

In [None]:
# Read the annotated records. The labelling cell further down reads a "text"
# string and an "entities" list from every record.
with open("./license_data_with_missing_parts.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [None]:
import spacy

# Blank English pipeline: used here only to split text into tokens and to get
# each token's character offset.
nlp = spacy.blank("en")

sentences = []      # one list of token strings per record
labels = []         # one list of BIO tags per record, parallel to `sentences`
label_set = {"O"}   # every tag seen so far ("O" is always present)

for item in data:
    text = item["text"]
    entities = item["entities"]

    doc = nlp(text)
    words = [token.text for token in doc]
    # Character span [start, end) of every token, computed once per record
    # instead of once per entity.
    token_spans = [(token.idx, token.idx + len(token)) for token in doc]

    label_seq = ["O"] * len(words)

    for entity in entities:
        ent_start = entity["start"]
        ent_end = entity["end"]
        ent_label = entity["label"]

        # The first token that lies completely inside the annotated span opens
        # the entity with "B-", even when the annotation does not start exactly
        # on a token boundary (e.g. it includes leading whitespace). Requiring
        # token_start == ent_start would leave such entities as "I-" tags with
        # no opening "B-".
        started = False
        for i, (token_start, token_end) in enumerate(token_spans):
            if token_start >= ent_start and token_end <= ent_end:
                label_seq[i] = f"I-{ent_label}" if started else f"B-{ent_label}"
                started = True
                label_set.add(label_seq[i])

    sentences.append(words)
    labels.append(label_seq)


In [None]:
# Spot-check: tokens of the record at index 1.
sentences[1]

In [None]:
# Spot-check: BIO tags of the record at index 1 (parallel to sentences[1]).
labels[1]

In [None]:
# Vocabulary: id 0 is reserved for padding, real tokens are numbered from 1.
# A set of strings has no stable iteration order between interpreter runs, so
# the tokens are sorted to give every token the same id on every run.
word_set = {word for sent in sentences for word in sent}
word2idx = {word: idx + 1 for idx, word in enumerate(sorted(word_set))}
# NOTE(review): a corpus token that is literally "PAD" would be remapped to 0 here.
word2idx["PAD"] = 0

In [None]:
# Tag -> integer id. The tags are sorted because set iteration order is not
# stable between interpreter runs; sorting makes the ids reproducible.
label2idx = {label: idx for idx, label in enumerate(sorted(label_set))}

In [None]:
# Replace every token by its vocabulary id ...
X = []
for sent in sentences:
    X.append([word2idx[word] for word in sent])

# ... and every BIO tag by its label id, keeping both lists parallel.
y = []
for label_seq in labels:
    y.append([label2idx[label] for label in label_seq])

In [None]:
# Length of the longest encoded record; used as the padded sequence length.
max_length = max(map(len, X))
max_length

In [None]:
# Right-pad every sequence to max_length. Token ids are padded with 0, the id
# reserved for "PAD" in word2idx.
X = pad_sequences(X, maxlen=max_length, padding="post")
# Pad the tags with the id of "O" explicitly: the default fill value 0 is
# whatever tag happens to have id 0 in label2idx, which would label padding
# positions as part of an entity.
y = pad_sequences(y, maxlen=max_length, padding="post", value=label2idx["O"])

In [None]:
# One-hot encode the tag ids of every sequence (adds a trailing class axis).
# This overwrites y with its one-hot form, so the cell must run exactly once
# per freshly padded y.
num_classes = len(label2idx)
y = np.array([to_categorical(seq, num_classes=num_classes) for seq in y])

In [None]:
# Hold out 20% of the records; the fixed random_state keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
embedding_dim = 200
embedding_file = "glove.6B.200d.txt"

# Row i holds the vector of the token with id i; rows without a GloVe match
# (and the padding row 0) stay all-zero.
embedding_matrix = np.zeros((len(word2idx), embedding_dim))
embeddings_index = {}

# Only keep vectors that can actually be looked up below: each vocabulary token
# and its lowercase form. This avoids holding the whole GloVe table in memory.
wanted_keys = set(word2idx) | {word.lower() for word in word2idx}

with open(embedding_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines
        word = values[0]
        if word not in wanted_keys:
            continue
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

for word, i in word2idx.items():
    if i == 0:
        # Padding row must remain zero; without this guard the lowercase
        # fallback would give "PAD" the vector of the ordinary word "pad".
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # The glove.6B vectors are uncased, so tokens containing capitals can
        # only match through their lowercase form.
        embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Network: frozen pre-trained embedding -> three stacked bidirectional LSTMs
# (128, 64, 32 units) -> softmax over the tag set at every position.
embedding_layer = Embedding(
    input_dim=len(word2idx),
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)
recurrent_layers = [
    Bidirectional(LSTM(units=n_units, return_sequences=True, dropout=0.5, recurrent_dropout=0.2))
    for n_units in (128, 64, 32)
]
output_layer = Dense(len(label2idx), activation="softmax")

model = Sequential([embedding_layer, *recurrent_layers, output_layer])

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

# The held-out split is also what is reported as validation data during training.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=35,
    validation_data=(X_test, y_test),
)

# Persist the trained model in three formats: HDF5, native Keras, and an
# exported directory.
model.save("ner_model.h5")
model.save("ner_model.keras", save_format="keras")
model.export("ner_model_savedmodel")

# Store both lookup tables next to the model; the inference cell reloads them.
for filename, mapping in (("word2idx.json", word2idx), ("label2idx.json", label2idx)):
    with open(filename, "w") as f:
        json.dump(mapping, f)

print("Model training complete. Saved as 'ner_model.h5'")


In [None]:
# Bundle the ner_model_savedmodel/ directory (written by model.export) into a single zip archive.
!zip -r ner_model_savedmodel.zip ner_model_savedmodel/

In [None]:
import json
import numpy as np
import spacy
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Blank English pipeline, used only for tokenization (same call as in the
# labelling cell).
nlp = spacy.blank("en")

# Reload the lookup tables written by the training cell.
with open("word2idx.json", "r") as f:
    word2idx = json.load(f)

with open("label2idx.json", "r") as f:
    label2idx = json.load(f)

# Invert tag -> id into id -> tag for decoding model output.
idx2label = {int(v): k for k, v in label2idx.items()}

model = load_model("ner_model.h5")

# Padded sequence length fed to the model. The previous loop here measured the
# character length of the longest vocabulary word and was immediately
# overwritten by this constant, so it has been removed.
# NOTE(review): 100 is hard-coded and independent of the `max_length` computed
# from the training data — confirm it matches the length the model was trained with.
max_length = 100

def predict_entities_dict(text):
    """Tag `text` with the loaded NER model and collect the entities it finds.

    The text is lowercased and tokenized with `nlp`; tokens are mapped to ids
    through `word2idx` (tokens missing from it become 0, the padding id) and
    the highest-scoring tag of every token is decoded through `idx2label`.
    A "B-" tag opens an entity, following "I-" tags extend the open entity
    (whatever their type), and any other tag closes it.

    Only the first `max_length` tokens are tagged; later tokens are ignored.

    Args:
        text: Raw input string.

    Returns:
        dict mapping entity type (the tag without its "B-" prefix) to the
        entity's tokens joined by single spaces. If a type occurs more than
        once, the last occurrence wins. Empty dict when there are no tokens.
    """
    # NOTE(review): the vocabulary is built from case-preserved tokens in the
    # training cell, so lowercasing here may turn cased training tokens into
    # unknowns — confirm against the training data.
    text = text.lower()
    doc = nlp(text)
    # Keep only what fits into the model input so that words[i] always lines up
    # with predictions[i]. pad_sequences truncates from the front by default,
    # which would pair the first words with predictions for the last ones.
    words = [token.text for token in doc][:max_length]
    if not words:
        return {}

    X_test = pad_sequences(
        [[word2idx.get(w, 0) for w in words]],
        maxlen=max_length,
        padding="post",
        truncating="post",
    )

    predictions = model.predict(X_test)[0]
    predicted_labels = [idx2label[int(np.argmax(p))] for p in predictions[:len(words)]]

    entity_dict = {}
    current_entity = []
    current_entity_type = None

    for word, label in zip(words, predicted_labels):
        if label.startswith("B-"):
            # A new entity starts: store the one that was open, if any.
            if current_entity:
                entity_dict[current_entity_type] = " ".join(current_entity)
            current_entity_type = label[2:]
            current_entity = [word]
        elif label.startswith("I-") and current_entity:
            current_entity.append(word)
        else:
            # "O" (or an "I-" with nothing open) ends the current entity.
            if current_entity:
                entity_dict[current_entity_type] = " ".join(current_entity)
                current_entity = []
                current_entity_type = None

    # Flush an entity that runs up to the last token.
    if current_entity:
        entity_dict[current_entity_type] = " ".join(current_entity)

    return entity_dict


# Example input: the text of one licence as a single multi-line string.
sample_text = """DRIVING LICENCE DEMOCRATIC SOCIALIST REPUBLIC OF SRI LANKA 200035304389 4829786
VINNATH NINURA SATHARASINGHE 279/5 RATHMALDENIYA ROAD PANNIPITIYA
18.2000 18.2020 18.2028 Blood Group B+ J.M.U.K Jayasekera Commissioner General Motor Traffic"""

predictions_dict = predict_entities_dict(sample_text)

# Print one "type: value" line per extracted entity.
print("Extracted Entities:\n")
for entity, value in predictions_dict.items():
    print(entity, value, sep=": ")
