<a href="https://colab.research.google.com/github/Adithyan-mp/Sequence_Model/blob/main/LLMNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torch.utils.data import Dataset,DataLoader

In [27]:
# Read the NER corpus and pull out the raw sentence / tag columns.
# Both columns are passed through clean_dataset() further down, so they are
# expected to hold plain strings.
df = pd.read_csv('/content/ner.csv')
x, y = df['Sentence'], df['Tag']

# Preprocess sentences
def clean_dataset(sentence):
    """Normalise one raw string into a list of tokens terminated by '<EOS>'.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is neither a word character nor
         whitespace (so 'b-geo' becomes 'bgeo', and a token made only of
         punctuation disappears entirely);
      3. collapse whitespace runs to a single space;
      4. split on whitespace and append the '<EOS>' marker.

    Args:
        sentence: the raw string to clean.

    Returns:
        list[str]: the cleaned tokens followed by '<EOS>'. An empty or
        punctuation-only string yields ['<EOS>'].
    """
    lowered = sentence.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return [*single_spaced.split(), '<EOS>']

# Tokenize with vocab
def tokenization(sentence, vocab: dict):
    """Convert a list of tokens into a 1-D tensor of integer ids.

    Args:
        sentence: iterable of string tokens.
        vocab: token -> id mapping. It must contain the key '<UNK>' whenever
            `sentence` is non-empty; tokens missing from `vocab` are mapped
            to that id.

    Returns:
        torch.Tensor: int64 tensor of shape (len(sentence),).

    Raises:
        KeyError: if `sentence` is non-empty and `vocab` has no '<UNK>' entry.
    """
    # dtype is pinned explicitly: torch.tensor([]) would otherwise default to
    # float32, which nn.Embedding rejects and which does not match the int64
    # tensors produced for non-empty sentences.
    return torch.tensor(
        [vocab.get(token, vocab['<UNK>']) for token in sentence],
        dtype=torch.long,
    )

# Create tokenized and padded data
def get_tokenization(data, target=False, MAX_LEN=60):
    """Clean, index and pad a collection of raw strings.

    A vocabulary is built from every token in `data`, ordered by descending
    frequency. Ids 0 and 1 are reserved for '<PAD>' and '<UNK>'; real tokens
    start at id 2.

    Args:
        data: iterable of raw strings (each is passed to clean_dataset).
        target: accepted for call-site compatibility; it currently has no
            effect on the result.
        MAX_LEN: fixed output width. Longer sequences are truncated (which
            can cut off the trailing '<EOS>'); shorter ones are right-padded
            with the '<PAD>' id.

    Returns:
        tuple: (data_token, tokenized, vocab) where
            data_token is the list of cleaned token lists,
            tokenized is an int64 tensor of shape (len(data), MAX_LEN),
            vocab is the token -> id dict.
    """
    data_token = [clean_dataset(sentence) for sentence in data]
    frequency_word = Counter(token for sentence in data_token for token in sentence)
    vocab = {token: idx + 2 for idx, (token, _) in enumerate(frequency_word.most_common())}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1

    token_ids = [tokenization(sentence, vocab)[:MAX_LEN] for sentence in data_token]
    if not token_ids:
        # pad_sequence cannot handle an empty list; return an empty batch
        # with the advertised width instead of crashing.
        tokenized = torch.zeros((0, MAX_LEN), dtype=torch.long)
    else:
        tokenized = pad_sequence(token_ids, batch_first=True, padding_value=vocab['<PAD>'])
        # pad_sequence only pads up to the longest row. Top up to MAX_LEN so
        # that separately tokenized inputs and targets always share a width.
        shortfall = MAX_LEN - tokenized.size(1)
        if shortfall > 0:
            tokenized = torch.nn.functional.pad(tokenized, (0, shortfall), value=vocab['<PAD>'])
    return data_token, tokenized, vocab

# Tokenize input and output (each side gets its own vocabulary)
data_token_x, tokenized_x, vocab_x = get_tokenization(x)
data_token_y, tokenized_y, vocab_y = get_tokenization(y, target=True)

# The training loop flattens inputs and targets position by position, so the
# two tensors must have identical (num_sentences, seq_len) shapes. Fail here
# with a readable message rather than deep inside the loss computation.
# NOTE(review): clean_dataset() deletes punctuation, so a sentence token that
# is pure punctuation vanishes while its tag presumably stays. That would
# shift tags relative to tokens -- verify against the raw CSV.
assert tokenized_x.shape == tokenized_y.shape, (
    f"input/target shape mismatch: {tuple(tokenized_x.shape)} vs {tuple(tokenized_y.shape)}"
)

# Train/test split (fixed random_state keeps the partition repeatable)
x_train, x_test, y_train, y_test = train_test_split(
    tokenized_x,
    tokenized_y,
    test_size=0.2,
    random_state=32,
)

# Custom Dataset
class CustomDataset(Dataset):
    """Pairs an indexable collection of inputs with an indexable collection of labels."""

    def __init__(self, inputs, labels):
        """Store both collections as given; nothing is copied or validated."""
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        """Number of samples, taken from the inputs collection."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the (input, label) pair stored at `index`."""
        features = self.inputs[index]
        target = self.labels[index]
        return features, target

# Custom LSTM model
class LSTM(nn.Module):
    """Hand-written single-layer LSTM tagger.

    Token ids are embedded, fed one time step at a time through an LSTM cell
    built from four linear gates, and every hidden state is projected to
    `output_size` logits.
    """

    def __init__(self, hidden_state, vocab_size, output_size, embedding_dim=120):
        """Create the embedding table, the four gate layers and the output head.

        Args:
            hidden_state: size of the hidden and cell state vectors.
            vocab_size: number of rows in the embedding table.
            output_size: number of logits produced per time step.
            embedding_dim: size of each token embedding.
        """
        super().__init__()
        self.hidden_state = hidden_state
        # Every gate sees the current embedding concatenated with the previous hidden state.
        gate_in = hidden_state + embedding_dim
        # Row 0 is the padding row (padding_idx=0).
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.forget_gate = nn.Linear(gate_in, hidden_state)
        self.update_gate = nn.Linear(gate_in, hidden_state)
        self.layer = nn.Linear(gate_in, hidden_state)  # candidate cell values
        self.output_gate = nn.Linear(gate_in, hidden_state)
        self.output_layer = nn.Linear(hidden_state, output_size)

    def forward(self, input):
        """Run the cell over a batch of id sequences.

        Args:
            input: 2-D integer tensor of token ids, (batch_size, seq_len).

        Returns:
            Tensor of logits with shape (batch_size, seq_len, output_size).
        """
        batch_size, _ = input.size()
        state_shape = (batch_size, self.hidden_state)
        cell = torch.zeros(state_shape, device=input.device)
        hidden = torch.zeros(state_shape, device=input.device)

        embedded = self.embedding_layer(input)  # (batch_size, seq_len, embedding_dim)

        step_logits = []
        # unbind(dim=1) yields one (batch_size, embedding_dim) slice per time step.
        for token_vectors in embedded.unbind(dim=1):
            gate_input = torch.cat((token_vectors, hidden), dim=1)

            keep = torch.sigmoid(self.forget_gate(gate_input))
            write = torch.sigmoid(self.update_gate(gate_input))
            candidate = torch.tanh(self.layer(gate_input))

            cell = cell * keep + candidate * write
            expose = torch.sigmoid(self.output_gate(gate_input))
            hidden = expose * torch.tanh(cell)

            step_logits.append(self.output_layer(hidden))

        # Stack along a new time axis -> (batch_size, seq_len, output_size)
        return torch.stack(step_logits, dim=1)

# Training setup
# Seed torch's global RNG before the model and DataLoader are created, so that
# weight initialisation and the shuffling order repeat across "Restart & Run
# All" (GPU kernels may still add small run-to-run differences).
SEED = 32  # same value as the random_state used for the train/test split
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM(hidden_state=80, output_size=len(vocab_y), vocab_size=len(vocab_x)).to(device)

dataset = CustomDataset(inputs=x_train, labels=y_train)
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
# Target id 0 is '<PAD>', so padded positions contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for x_batch, y_batch in dataloader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()

        logits = model(x_batch)  # (batch_size, seq_len, output_size)
        # Flatten the batch and time axes so every position is one classification example.
        loss = criterion(
            logits.reshape(-1, logits.shape[-1]),  # (batch_size * seq_len, output_size)
            y_batch.reshape(-1),                   # (batch_size * seq_len)
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")

Epoch [1/5], Loss: 0.7511
Epoch [2/5], Loss: 0.4499
Epoch [3/5], Loss: 0.3772
Epoch [4/5], Loss: 0.3292
Epoch [5/5], Loss: 0.2977


In [42]:
# Pick one held-out example and compare the model's tags with the gold tags.
sample_input = x_test[6]
sample_output = y_test[6]
print(sample_input)
idx2tag = {value:key for key,value in vocab_y.items()}

# Run on whichever device the model's weights live on; the test tensors are
# on the CPU, so a GPU-resident model would otherwise raise a device mismatch.
model_device = next(model.parameters()).device
model.eval()
with torch.no_grad():  # inference only -- no autograd graph needed
    logit = model(sample_input.unsqueeze(0).to(model_device))  # (1, seq_len, num_tags)

# Drop the batch axis without hard-coding the sequence length or tag count.
logit = logit.reshape(-1, logit.shape[-1]).cpu()
print(logit.size())

# argmax over logits equals argmax over softmax probabilities.
y_pred = torch.argmax(logit, dim=1)

# Keep only positions whose gold tag is a real tag (neither padding nor the
# end-of-sequence marker), and read predictions at those same positions so
# the two lists line up one-to-one.
pad_id, eos_id = vocab_y['<PAD>'], vocab_y['<EOS>']
keep = (sample_output != pad_id) & (sample_output != eos_id)

predicted_tags = [idx2tag[i.item()] for i in y_pred[keep]]
actual_tags = [idx2tag[i.item()] for i in sample_output[keep]]

# Show predictions next to the gold tags
print(f"Predicted Tags: {predicted_tags}")
print(f"Actual Tags:    {actual_tags}")

tensor([135,  30, 118, 154,   5, 952,   3,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0])
torch.Size([60, 20])
Predicted Tags: ['o', 'o', 'o', 'o', 'o', 'o', 'o']
Actual Tags:    ['o', 'o', 'o', 'o', 'o', 'o', 'o']
