In [None]:
!pip install pandas numpy scikit-learn keras



In [None]:
# !pip install pandas numpy scikit-learn keras tensorflow
# Used GPU T4 (Colab Kernel)

# Import necessary libraries
import logging
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from tensorflow.keras.utils import plot_model
import traceback
import spacy
from spacy import displacy


# Configure logging: write to <parent of the working directory>/logs/app.log
log_dir = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
    'logs',
)
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(log_dir, 'app.log'),
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Pick the TensorFlow device string: first GPU when one is visible, CPU otherwise
gpu_found = bool(tf.config.list_physical_devices('GPU'))
print("GPU is available!!" if gpu_found else "GPU not available, using CPU instead!")
device = '/gpu:0' if gpu_found else '/cpu:0'

# Locate the dataset next to the notebook (current working directory)
data_directory = os.getcwd()
dataset_path = os.path.join(data_directory, 'ner_dataset.csv')
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at: {dataset_path}")

# Load the CSV and discard the part-of-speech column, which is not used below
df_ner = pd.read_csv(dataset_path, encoding="latin1").drop(columns=["POS"])

# Function to preprocess the data into sentences and corresponding labels
def preprocess_data(data):
    """
    Groups the token rows of the NER dataset into sentences and label sequences.

    A new sentence starts at every row whose 'Sentence #' value is non-null and
    differs from the id of the sentence currently being built. Rows with a null
    'Sentence #' are continuation rows and belong to the current sentence. This
    handles both the sparse layout (id only on the first token of a sentence)
    and a forward-filled layout (id repeated on every token).

    Args:
    data: pandas DataFrame with the columns 'Sentence #', 'Word' and 'Tag',
          one row per token, in reading order

    Returns:
    sentences: list of lists containing words for each sentence
    labels: list of lists containing NER labels for each sentence
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []
    current_id = None

    # Iterate over the three columns in lockstep; cheaper than DataFrame.iterrows()
    rows = zip(data['Sentence #'], data['Word'], data['Tag'])
    for sentence_id, word, tag in rows:
        if not pd.isnull(sentence_id) and sentence_id != current_id:
            # A new sentence id: close the sentence collected so far (if any)
            if current_sentence:
                sentences.append(current_sentence)
                labels.append(current_labels)
            current_sentence = []
            current_labels = []
            current_id = sentence_id

        # Every row carries a token, including the row that opens a sentence
        current_sentence.append(word)
        current_labels.append(tag)

    # Flush the last sentence, which has no following row to trigger the flush
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

# Preprocess the data
sentences, labels = preprocess_data(df_ner)

# Split the data into train and test sets (80% train, 20% test)
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

# Further split the train set into train and validation sets (80% train, 20% validation)
# NOTE(review): val_sentences / val_labels are not used by the code visible in this
# section; train_model() validates via validation_split instead.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_sentences, train_labels, test_size=0.2, random_state=42
)

# Build the vocabulary using only the training data to avoid data leakage.
# dict.fromkeys de-duplicates while keeping first-seen order, so word ids are
# reproducible from run to run (set iteration order of strings is not).
words = list(dict.fromkeys(word for sentence in train_sentences for word in sentence))
# Reserve a dedicated last entry for padding. The code in this file uses index
# n_words - 1 as the pad value and as the fallback id for unknown words, so that
# index must not also belong to a real vocabulary word.
words.append("ENDPAD")
n_words = len(words)

# Tag inventory in order of first appearance (deterministic for a given dataset)
tags = list(dict.fromkeys(df_ner["Tag"]))
n_tags = len(tags)

# Create mappings for words and tags
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

# Padding sequences: every sentence becomes exactly max_len ids
max_len = 50
X_train = [[word2idx[w] for w in s] for s in train_sentences]
X_train = pad_sequences(maxlen=max_len, sequences=X_train, padding="post", value=n_words-1)

# Label sequences are padded with the "O" (outside) tag, then one-hot encoded
y_train = [[tag2idx[t] for t in l] for l in train_labels]
y_train = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx["O"])
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train]

# Function to train the model
def train_model(X_train, y_train, n_words, n_tags, device, max_len=None, epochs=5, batch_size=256):
    """
    Builds, compiles and trains the Embedding -> LSTM -> Dropout -> Dense tagger.

    The best model (lowest validation loss) is also written to 'model.keras'
    in the current working directory by the checkpoint callback.

    Args:
    X_train: Padded training data, 2-D array-like of word ids (samples x sequence length)
    y_train: One-hot labels for the training data (samples x sequence length x n_tags)
    n_words: Size of the word-id space (used as the Embedding input_dim)
    n_tags: Total number of unique tags (size of the softmax output)
    device: Device to run the model on ('/cpu:0' or '/gpu:0')
    max_len: Sequence length fed to the Embedding layer; defaults to the
             second dimension of X_train
    epochs: Maximum number of training epochs
    batch_size: Number of samples per gradient update

    Returns:
    model: Trained model
    """
    X_train = np.asarray(X_train)
    if max_len is None:
        # Derive the sequence length from the data instead of a module-level global
        max_len = X_train.shape[1]

    with tf.device(device):

        # Define the model architecture
        model = Sequential()
        model.add(Embedding(input_dim=n_words, output_dim=10, input_length=max_len))
        # return_sequences=True: one output vector per token, as needed for tagging
        model.add(LSTM(units=20, return_sequences=True, recurrent_dropout=0.1))
        model.add(Dropout(0.5))
        model.add(Dense(n_tags, activation="softmax"))
        # Compile model
        model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
        # Stop after 2 epochs without val_loss improvement and hand back the
        # best weights rather than the ones from the last epoch run
        early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
        # Define model checkpointing
        checkpoint = ModelCheckpoint('model.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')

        # Train the model; the last 10% of the samples are held out for validation
        model.fit(
            X_train,
            np.array(y_train),
            batch_size=batch_size,
            epochs=epochs,
            validation_split=0.1,
            verbose=1,
            callbacks=[early_stopping, checkpoint],
        )

    return model

# Train the model
model = train_model(X_train, y_train, n_words, n_tags, device)

# Show the model summary. summary() prints the table itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
model.summary()
plot_model(model)

# Prepare the test data: words not seen in training fall back to index n_words-1,
# the same index that is used as the padding value
X_test = [[word2idx.get(w, n_words-1) for w in s] for s in test_sentences]
X_test = pad_sequences(maxlen=max_len, sequences=X_test, padding="post", value=n_words-1)

# Encode and pad the test labels the same way as the training labels
y_test = [[tag2idx.get(t, tag2idx["O"]) for t in l] for l in test_labels]
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx["O"])
y_test = [to_categorical(i, num_classes=n_tags) for i in y_test]

# Predict on the test data (per-token class probabilities)
y_pred = model.predict(X_test)

# Convert the index to tag
idx2tag = {i: w for w, i in tag2idx.items()}

# Function to convert predictions to labels
def pred2label(pred):
    """
    Maps per-token score vectors to tag strings.

    For every token the index of the highest score is looked up in the
    module-level idx2tag mapping; any "PAD" substring in the resulting tag
    is rewritten to "O".

    Args:
    pred: Iterable of sequences, each a sequence of per-class score vectors
          (model predictions or one-hot encoded labels)

    Returns:
    out: List of lists of tag strings, one inner list per input sequence
    """
    return [
        [idx2tag[np.argmax(scores)].replace("PAD", "O") for scores in sequence]
        for sequence in pred
    ]

# Convert predictions to labels
pred_labels = pred2label(y_pred)
test_labels = pred2label(y_test)

# Flatten the labels and predictions, keeping only real token positions.
# Sequences are padded at the end ("post") up to max_len with the "O" tag, so
# scoring the padded positions as well would inflate the F1 score with
# trivially correct "O" predictions.
flat_test_labels = []
flat_pred_labels = []
for sentence, true_seq, pred_seq in zip(test_sentences, test_labels, pred_labels):
    n_real = min(len(sentence), max_len)  # sentences longer than max_len were truncated
    flat_test_labels.extend(true_seq[:n_real])
    flat_pred_labels.extend(pred_seq[:n_real])

# Calculate the F1 score (weighted by the number of true instances per tag)
f1 = f1_score(flat_test_labels, flat_pred_labels, average='weighted')

print(f'F1 Score: {f1}')

GPU is available!!




Epoch 1/5
Epoch 1: val_loss improved from inf to 0.17058, saving model to model.keras
Epoch 2/5
Epoch 2: val_loss improved from 0.17058 to 0.07376, saving model to model.keras
Epoch 3/5
Epoch 3: val_loss improved from 0.07376 to 0.04574, saving model to model.keras
Epoch 4/5
Epoch 4: val_loss improved from 0.04574 to 0.03402, saving model to model.keras
Epoch 5/5
Epoch 5: val_loss improved from 0.03402 to 0.02828, saving model to model.keras
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 10)            29150     
                                                                 
 lstm_2 (LSTM)               (None, 50, 20)            2480      
                                                                 
 dropout_2 (Dropout)         (None, 50, 20)            0         
                                                                 
 den