<a href="https://colab.research.google.com/github/Ali-Hasan-Khan28/Natural-Language-Processing/blob/main/Lab11b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Bidirectional

In [2]:
data = pd.DataFrame({
    'word': ["John", "Smith", "works", "at", "Google", "in", "New", "York"],
    'pos_tag': ["NNP", "NNP", "VBZ", "IN", "NNP", "IN", "NNP", "NNP"],
    'label': ["B-PER", "I-PER", "O", "O", "B-ORG", "O", "B-LOC", "I-LOC"]
})

In [3]:
words = data['word'].tolist()
labels = data['label'].tolist()
num_words = len(set(words))
num_labels = len(set(labels))
word2idx = {word: idx + 1 for idx, word in enumerate(set(words))}
label2idx = {label: idx for idx, label in enumerate(set(labels))}
X = [[word2idx[word] for word in words]]
y = [[label2idx[label] for label in labels]]


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X * 100, y * 100, test_size=0.2, random_state=42)

In [5]:
def create_lstm_model(max_len, num_words, num_labels):
    input_layer = Input(shape=(max_len,))
    embedding_layer = Embedding(input_dim=num_words, output_dim=50, input_length=max_len)(input_layer)
    lstm_layer = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(embedding_layer)
    output_layer = TimeDistributed(Dense(num_labels, activation='softmax'))(lstm_layer)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [10]:
def train_lstm_model(X_train, y_train, X_test, y_test, num_words, num_labels):
    max_len = max(len(seq) for seq in X_train + X_test)
    X_train_pad = pad_sequences(X_train, maxlen=max_len, padding='post', value=0)  # Add 'value=0' for padding token
    y_train_pad = pad_sequences(y_train, maxlen=max_len, padding='post', value=0)  # Add 'value=0' for padding token
    X_test_pad = pad_sequences(X_test, maxlen=max_len, padding='post', value=0)    # Add 'value=0' for padding token
    y_test_pad = pad_sequences(y_test, maxlen=max_len, padding='post', value=0)    # Add 'value=0' for padding token

    model = create_lstm_model(max_len, num_words, num_labels)
    model.fit(X_train_pad, to_categorical(y_train_pad, num_classes=num_labels), validation_data=(X_test_pad, to_categorical(y_test_pad, num_classes=num_labels)), batch_size=32, epochs=10)
    return model

In [11]:
# Step 5: Inference
# Evaluate the model's performance using standard NER metrics such as precision, recall, and F1-score
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_labels = np.argmax(y_pred, axis=-1)
    y_test_labels = np.array(y_test)
    # Flatten the sequences and ignore padding
    y_pred_labels_flat = [label for pred_seq, true_seq in zip(y_pred_labels, y_test_labels) for label in pred_seq[:len(true_seq)]]
    y_test_labels_flat = [label for seq in y_test_labels for label in seq]
    print(classification_report(y_test_labels_flat, y_pred_labels_flat))

In [15]:
# Train and evaluate the LSTM model
model = train_lstm_model(X_train, y_train, X_test, y_test, num_words + 1, num_labels + 1)  # Increment num_words and num_labels to account for padding token

# Calculate max_len
max_len = max(len(seq) for seq in X_train + X_test)

# Retrieve padded test data from the train_lstm_model function
X_test_pad = pad_sequences(X_test, maxlen=max_len, padding='post', value=0)    # Add 'value=0' for padding token
y_test_pad = pad_sequences(y_test, maxlen=max_len, padding='post', value=0)    # Add 'value=0' for padding token

# Evaluate the model
evaluate_model(model, X_test_pad, y_test_pad)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.50      1.00      0.67        60
           2       1.00      1.00      1.00        20
           3       0.00      0.00      0.00        20
           4       0.00      0.00      0.00        20
           5       1.00      1.00      1.00        20

    accuracy                           0.62       160
   macro avg       0.42      0.50      0.44       160
weighted avg       0.44      0.62      0.50       160



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
