In [14]:
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, TimeDistributed, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import f1_score

In [7]:
!ls /kaggle/input/glove6b100dtxt

glove.6B.100d.txt


In [4]:
def load_data(file_path):
    with open(file_path, "r") as f:
        lines = f.readlines()

    sentences, labels = [], []
    sentence, label = [], []

    for line in lines:
        if line == "\n" or line.startswith("-DOCSTART-"):
            if sentence and label:
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []
        else:
            word, _, _, tag = line.strip().split()
            sentence.append(word.lower())  # Normalize the case
            label.append(tag)

    return sentences, labels


In [8]:
train_file_path = "/kaggle/input/conll-2003/data/conllpp_train.txt"
test_file_path = "/kaggle/input/conll-2003/data/conllpp_test.txt"

train_sentences, train_labels = load_data(train_file_path)
test_sentences, test_labels = load_data(test_file_path)

In [9]:
# Load pre-trained word embeddings
embeddings_index = {}
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [15]:
# Create word-to-index and tag-to-index dictionaries
words = list(set([word for sentence in train_sentences + test_sentences for word in sentence]))
words.append('ENDPAD')
n_words = len(words)
tags = list(set([tag for label in train_labels + test_labels for tag in label]))
n_tags = len(tags)

word_index = {w: i for i, w in enumerate(words)}
label_index = {t: i for i, t in enumerate(tags)}

# Convert words and tags to sequences of indices
X_train = [[word_index[w] for w in sentence] for sentence in train_sentences]
X_train = pad_sequences(maxlen=50, sequences=X_train, padding='post', value=n_words-1)
y_train = [[label_index[t] for t in label] for label in train_labels]
y_train = pad_sequences(maxlen=50, sequences=y_train, padding='post', value=label_index['O'])
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train]

X_test = [[word_index[w] for w in sentence] for sentence in test_sentences]
X_test = pad_sequences(maxlen=50, sequences=X_test, padding='post', value=n_words-1)
y_test = [[label_index[t] for t in label] for label in test_labels]
y_test = pad_sequences(maxlen=50, sequences=y_test, padding='post', value=label_index['O'])
y_test = [to_categorical(i, num_classes=n_tags) for i in y_test]

max_len  = 50

In [16]:
# Create embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
# Define model architecture
input_layer = Input(shape=(50,))
embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)
dropout_layer = Dropout(0.5)(embedding_layer)
bidirectional_layer = Bidirectional(LSTM(128, return_sequences=True))(dropout_layer)
output_layer = TimeDistributed(Dense(len(label_index), activation='softmax'))(bidirectional_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])




ValueError: Unrecognized keyword arguments passed to Embedding: {'weights': [array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.11514   ,  0.16726001, -0.23509   , ..., -0.35883   ,
         0.38626999, -0.61556   ],
       [ 0.18971001, -0.017413  ,  0.62576002, ..., -0.35029   ,
         0.034271  ,  0.82238001],
       ...,
       [-0.075529  , -0.27406001, -0.34755999, ...,  0.19689   ,
        -0.22531   , -0.92199999],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])]}

In [None]:
# Train model
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train, np.array(y_train), validation_split=0.1, batch_size=32, epochs=10, callbacks=[early_stopping])

In [None]:
# # Evaluate model
# y_pred = model.predict(X_test)
# y_pred = np.argmax(y_pred, axis=-1)
# y_test_labels = [[tags[i] for i in row] for row in np.argmax(y_test, axis=-1)]
# y_pred_labels = [[tags[i] for i in row] for row in y_pred]
# f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')
# print("F1-score: {:.2f}".format(f1))

In [None]:
from sklearn.metrics import classification_report
# Evaluate model
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_labels = [[tags[i] for i in row] for row in np.argmax(y_test, axis=-1)]
y_pred_labels = [[tags[i] for i in row] for row in y_pred]


# Print classification report
report = classification_report(y_test_labels_flat, y_pred_labels_flat)
print(report)
