In [2]:
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, TimeDistributed, Dropout, Bidirectional
from sklearn.model_selection import train_test_split

# Function to read data from the file
def read_training_data(file_path, delimiter='\t', encoding='utf-8-sig'):
    """Read sentence/label pairs from a delimited text file.

    Each usable line has exactly two fields separated by ``delimiter``:
    the sentence text, then its whitespace-separated labels. Lines that do
    not split into exactly two fields (blank lines, lines without the
    delimiter, lines with extra delimiters) are skipped.

    Args:
        file_path: Path of the training file.
        delimiter: Separator between the sentence and its labels.
            Defaults to a tab.
        encoding: Text encoding used to decode the file. The default
            ``'utf-8-sig'`` reads plain UTF-8 and also drops a leading
            byte-order mark so it does not end up glued to the first word.

    Returns:
        A ``(sentences, labels)`` tuple: ``sentences`` is a list of raw
        sentence strings and ``labels`` a parallel list of label lists.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    sentences = []
    labels = []
    # An explicit encoding keeps decoding independent of the platform locale.
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            parts = line.strip().split(delimiter)
            if len(parts) != 2:
                continue
            sentence, label = parts
            sentences.append(sentence)
            labels.append(label.split())
    return sentences, labels

# Read data from "Identity trainer.txt"
file_path = "Identity trainer.txt"
sentences, labels = read_training_data(file_path)

# Ensure we have sentences and labels
if not sentences or not labels:
    raise ValueError("No sentences or labels found in the file.")

# Build a vocabulary (lower-cased words) and the tag inventory
words = set(word.lower() for sentence in sentences for word in sentence.split())
n_words = len(words)
tags = set(tag for label in labels for tag in label)
n_tags = len(tags)

# Enumerate in sorted order: iterating a raw set of strings depends on hash
# randomization, so the word/tag indices would differ between interpreter runs.
# Index 0 is reserved for padding and index 1 for unknown words.
word2idx = {w: i + 2 for i, w in enumerate(sorted(words))}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

tag2idx = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2idx["PAD"] = 0

idx2tag = {i: w for w, i in tag2idx.items()}

# Convert sentences and labels to sequences of indices
X = [[word2idx.get(w.lower(), 1) for w in s.split()] for s in sentences]
y = [[tag2idx[t] for t in tag] for tag in labels]

# Ensure sequences are not empty
if any(len(seq) == 0 for seq in X) or any(len(seq) == 0 for seq in y):
    raise ValueError("Found empty sequences in the data.")

# Pad sequences to one shared length. Padding X and y independently yields
# different widths whenever a sentence's token count differs from its label
# count, and the model then receives inputs and targets of mismatched shape.
max_len = max(max(len(seq) for seq in X), max(len(seq) for seq in y))
X = pad_sequences(X, maxlen=max_len, padding="post")
y = pad_sequences(y, maxlen=max_len, padding="post")

# Convert labels to categorical (one-hot over n_tags real tags + the PAD class)
y = [to_categorical(i, num_classes=n_tags + 1) for i in y]

# Model definition: embedding -> bidirectional LSTM -> per-token softmax
model = Sequential()
model.add(Embedding(input_dim=n_words + 2, output_dim=50, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(n_tags + 1, activation="softmax")))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
# NOTE(review): X_test / y_test are held out here but not used in the visible
# code — confirm whether an evaluation step exists elsewhere.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=5, validation_split=0.1)

# NER prediction function
def ner_predict(sentence):
    """Tag the tokens of ``sentence`` with the trained model.

    The sentence is lower-cased and split on whitespace; each token is
    mapped through ``word2idx`` (unseen words fall back to index 1) and the
    sequence is padded to the training width ``X.shape[1]``.

    Args:
        sentence: Raw input text.

    Returns:
        A list of ``(token, tag)`` tuples for the lower-cased tokens whose
        predicted class is not 0 (the padding class). Tokens beyond the
        training width are not tagged. An empty or whitespace-only sentence
        yields an empty list.
    """
    tokens = sentence.lower().split()
    if not tokens:
        # Nothing to tag; skip the model call entirely.
        return []

    token_ids = [word2idx.get(w, 1) for w in tokens]
    # Truncate at the END for over-long input. The default ("pre") drops the
    # leading tokens, while the zip below pairs predictions with the leading
    # tokens, which would attach every tag to the wrong word.
    input_data = pad_sequences(
        [token_ids], padding="post", truncating="post", maxlen=X.shape[1]
    )
    predictions = model.predict(input_data)
    predicted_tags = np.argmax(predictions, axis=-1)
    result = [(token, idx2tag[tag]) for token, tag in zip(tokens, predicted_tags[0]) if tag != 0]
    return result

# Smoke-test the tagger on a sample sentence
test_sentence = "Alice will visit Paris in July"
tagged_entities = ner_predict(test_sentence)
print(tagged_entities)


ValueError: No sentences or labels found in the file.