# In [2]:
import numpy as np
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences

# Load and preprocess data
def load_data(filename):
    """Read a four-column, whitespace-separated tagged file into sentences.

    Every non-blank line must contain exactly four whitespace-separated
    fields: the first is used as the word and the fourth as its NER tag
    (the two middle fields are ignored). Blank or whitespace-only lines
    separate sentences.

    Args:
        filename: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists. ``sentences[i]``
        is the list of words of the i-th sentence and ``labels[i]`` the list
        of tags for those words. Empty sentences are never emitted.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields.
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Sentence boundary. Only flush when something was collected
                # so that repeated blank lines do not create empty sentences.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            if len(fields) != 4:
                raise ValueError(
                    f"{filename}:{line_number}: expected 4 whitespace-separated "
                    f"fields, got {len(fields)}: {line!r}"
                )
            word, _, _, ner_tag = fields
            sentence.append(word)
            label.append(ner_tag)
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

train_sentences, train_labels = load_data('en_ewt-ud-train.iob2')
dev_sentences, dev_labels = load_data('en_ewt-ud-dev.iob2')

# Build the word vocabulary from the training set only.
# Index 0 is reserved for padding and index 1 for words unseen in training.
PAD_INDEX = 0
UNK_INDEX = 1
word_index = {}
for sentence in train_sentences:
    for word in sentence:
        word_index.setdefault(word, len(word_index) + 2)

# Encode words as integer ids (pad_sequences requires numeric input)
train_sentences_encoded = [
    [word_index.get(word, UNK_INDEX) for word in sentence]
    for sentence in train_sentences
]
dev_sentences_encoded = [
    [word_index.get(word, UNK_INDEX) for word in sentence]
    for sentence in dev_sentences
]

# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit([tag for seq in train_labels for tag in seq])
train_labels_encoded = [label_encoder.transform(seq) for seq in train_labels]
dev_labels_encoded = [label_encoder.transform(seq) for seq in dev_labels]

# Pad sequences
# Words and labels are padded/truncated identically ('post' for both) so that
# position i of a label row always belongs to position i of its word row.
max_seq_length = 100  # Adjust according to your data
train_sentences_padded = pad_sequences(
    train_sentences_encoded, maxlen=max_seq_length,
    padding='post', truncating='post', value=PAD_INDEX,
)
dev_sentences_padded = pad_sequences(
    dev_sentences_encoded, maxlen=max_seq_length,
    padding='post', truncating='post', value=PAD_INDEX,
)
# The label fill value only has to be a valid class id: padded time steps are
# masked by the Embedding layer (mask_zero=True) and do not contribute to the loss.
train_labels_padded = pad_sequences(
    train_labels_encoded, maxlen=max_seq_length,
    padding='post', truncating='post', value=0,
)
dev_labels_padded = pad_sequences(
    dev_labels_encoded, maxlen=max_seq_length,
    padding='post', truncating='post', value=0,
)

# Define model architecture
vocab_size = len(word_index) + 2  # +2 for the padding and unknown-word ids
embedding_dim = 100  # Adjust according to your embedding dimension
num_labels = len(label_encoder.classes_)
model = models.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    layers.Bidirectional(layers.LSTM(units=128, return_sequences=True)),
    layers.TimeDistributed(layers.Dense(num_labels, activation='softmax')),
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(
    train_sentences_padded,
    train_labels_padded,
    validation_data=(dev_sentences_padded, dev_labels_padded),
    epochs=10,
    batch_size=32,
)

# Predict on dev set
dev_predictions = model.predict(dev_sentences_padded)
dev_predicted_ids = np.argmax(dev_predictions, axis=-1)

# Keep only the positions that correspond to real tokens: predictions always
# have max_seq_length entries per sentence, true labels have the original length.
dev_true_labels_kept = []
dev_predictions_labels = []
for true_seq, predicted_ids in zip(dev_labels, dev_predicted_ids):
    kept = min(len(true_seq), max_seq_length)
    if kept == 0:
        continue
    dev_true_labels_kept.append(true_seq[:kept])
    dev_predictions_labels.append(label_encoder.inverse_transform(predicted_ids[:kept]))

# Flatten the predictions and true labels for evaluation
dev_true_labels_flattened = [label for sublist in dev_true_labels_kept for label in sublist]
dev_predictions_flattened = [label for sublist in dev_predictions_labels for label in sublist]

# Evaluate the model
print(classification_report(dev_true_labels_flattened, dev_predictions_flattened))


# (empty notebook cells)