In [3]:
!pip install pandas numpy scikit-learn keras




In [4]:
#!pip install pandas numpy scikit-learn keras

import logging
import os
import random
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, Dropout, BatchNormalization
from sklearn.metrics import classification_report, f1_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
import traceback
import spacy
from spacy import displacy


# Configure logging
log_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'logs')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(log_dir, 'app.log'),
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Suppress deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
'''
# Load the dataset
data_directory = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data')
dataset_path = os.path.join(data_directory, 'ner_dataset.csv')
if os.path.exists(dataset_path):
    df_ner = pd.read_csv(dataset_path, encoding="latin1")
else:
    logging.error(f"Dataset not found at: {dataset_path}")
'''
df_ner = pd.read_csv('ner_dataset.csv', encoding="latin1")

# Drop unnecessary columns
df_ner = df_ner.drop(columns=["POS"])

print(df_ner.head())

# we'll preprocess the data by creating a function to transform the dataset into sentences and corresponding labels:

def preprocess_data(data):
    """
    Preprocesses the NER dataset into sentences and corresponding labels.

    Args:
    data: pandas DataFrame containing the NER dataset

    Returns:
    sentences: list of lists containing words for each sentence
    labels: list of lists containing NER labels for each sentence
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    for index, row in data.iterrows():
        # If it's the start of a new sentence
        if pd.isnull(row['Sentence #']):
            sentences.append(current_sentence)
            labels.append(current_labels)
            current_sentence = []
            current_labels = []
        else:
            current_sentence.append(row['Word'])
            current_labels.append(row['Tag'])

    return sentences, labels

sentences, labels = preprocess_data(df_ner)

# Now, let's split the dataset into train, validation, and test sets:

# Split the data into train and test sets (80% train, 20% test)
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Further split the train set into train and validation sets (80% train, 20% validation)
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_sentences, train_labels, test_size=0.2, random_state=42)

# With the data preprocessed and split, we can now move on to building the baseline model. We'll use a simple LSTM-based model for this:

# Tokenize words and labels
words = list(set(df_ner["Word"].values))
n_words = len(words)

tags = list(set(df_ner["Tag"].values))
n_tags = len(tags)

# Create mappings for words and tags
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

# Padding sequences
max_len = 50
X_train = [[word2idx[w] for w in s] for s in train_sentences]
X_train = pad_sequences(maxlen=max_len, sequences=X_train, padding="post", value=n_words-1)

y_train = [[tag2idx[t] for t in l] for l in train_labels]
y_train = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx["O"])
y_train = [to_categorical(i, num_classes=n_tags) for i in y_train]

try:
    # Define the model architecture
    model = Sequential()
    model.add(Embedding(input_dim=n_words, output_dim=10, input_length=max_len))
    model.add(LSTM(units=20, return_sequences=True, recurrent_dropout=0.1))
    model.add(Dropout(0.5))  # Added dropout for regularization
    model.add(Dense(n_tags, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Define early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)

    # Define model checkpointing
    checkpoint = ModelCheckpoint('model.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')

    # Train the model
    model.fit(X_train, np.array(y_train), batch_size=32, epochs=5, validation_split=0.1, verbose=1, callbacks=[early_stopping, checkpoint])

except Exception as e:
    logging.error("Exception occurred", exc_info=True)
    traceback.print_exc()

#This code sets up a basic LSTM-based model for NER, tokenizes words and labels, pads sequences, defines the model architecture, compiles the model, and finally trains it.

# Prepare the test data
X_test = [[word2idx.get(w, n_words-1) for w in s] for s in test_sentences]
X_test = pad_sequences(maxlen=max_len, sequences=X_test, padding="post", value=n_words-1)

y_test = [[tag2idx.get(t, tag2idx["O"]) for t in l] for l in test_labels]
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx["O"])
y_test = [to_categorical(i, num_classes=n_tags) for i in y_test]

# Predict on the test data
y_pred = model.predict(X_test)

# Convert the index to tag
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out

pred_labels = pred2label(y_pred)
test_labels = pred2label(y_test)

# Flatten the lists of labels and predictions
flat_test_labels = [label for sublist in test_labels for label in sublist]
flat_pred_labels = [label for sublist in pred_labels for label in sublist]

# Print the classification report
#print(classification_report(test_labels, pred_labels))

# Calculate the F1 score
f1 = f1_score(flat_test_labels, flat_pred_labels, average='weighted')

print(f'F1 Score: {f1}')

    Sentence #           Word Tag
0  Sentence: 1      Thousands   O
1          NaN             of   O
2          NaN  demonstrators   O
3          NaN           have   O
4          NaN        marched   O
Epoch 1/5
Epoch 1: val_loss improved from inf to 0.00034, saving model to model.h5
Epoch 2/5
    1/18012 [..............................] - ETA: 8:26 - loss: 4.9405e-06 - accuracy: 1.0000

  saving_api.save_model(


Epoch 2: val_loss improved from 0.00034 to 0.00026, saving model to model.h5
Epoch 3/5
Epoch 3: val_loss improved from 0.00026 to 0.00026, saving model to model.h5
Epoch 4/5
Epoch 4: val_loss did not improve from 0.00026
Epoch 5/5
Epoch 5: val_loss improved from 0.00026 to 0.00025, saving model to model.h5
F1 Score: 0.9999191413875568
