In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:

# Load NER dataset from CSV
ner_data = pd.read_csv("NER dataset.csv")


In [None]:
ner_data.head()

In [15]:

# Fill missing values in the "Sentence #" column with forward fill
ner_data["Sentence #"].fillna(method='ffill', inplace=True)

# Drop rows with NaN values in the "Word" column
ner_data.dropna(subset=["Word"], inplace=True)

# Define label dictionary
label_dict = {"O": 0, "B-geo": 1, "I-geo": 2, "B-per": 3, "I-per": 4, "B-org": 5, "I-org": 6, "B-tim": 7, "I-tim": 8}

# Map labels to numerical format
ner_data["Tag"] = ner_data["Tag"].apply(lambda x: label_dict.get(x, 0))  # Use get() to handle unknown labels

# Tokenize and preprocess NER dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the words
tokenized_texts = [tokenizer.tokenize(word) for word in ner_data["Word"]]

# Convert tokens to input IDs
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]

# Pad input IDs and attention masks
input_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor(ids) for ids in input_ids], batch_first=True)
attention_masks = torch.ones_like(input_ids)

# Convert labels to tensor
labels = torch.tensor(ner_data["Tag"].values)

# Split data into training and testing sets
train_input_ids, test_input_ids, train_attention_masks, test_attention_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Define BiLSTM-CRF model
class BiLSTM_CRF(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, output_dim)
        self.crf = CRF(output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x.float())  # Convert input to float
        emissions = self.hidden2tag(lstm_out)
        return emissions

# Define CRF layer
class CRF(nn.Module):
    def __init__(self, num_tags):
        super(CRF, self).__init__()
        self.num_tags = num_tags
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

    def forward(self, emissions):
        return emissions

# Initialize model, loss function, and optimizer
input_dim = 768  # BERT hidden size
hidden_dim = 256
output_dim = len(label_dict)
model = BiLSTM_CRF(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
batch_size = 32

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs.view(-1, output_dim), labels.view(-1))
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader)}')

# 

  ner_data["Sentence #"].fillna(method='ffill', inplace=True)


Epoch [1/10], Loss: 0.6245553405266578
Epoch [2/10], Loss: 0.6214530191273947
Epoch [3/10], Loss: 0.6225225274438465
Epoch [4/10], Loss: 0.622955730883147
Epoch [5/10], Loss: 0.6286658747890126
Epoch [6/10], Loss: 0.6284190106759675
Epoch [7/10], Loss: 0.6079971309445036
Epoch [8/10], Loss: 0.6294998991482021
Epoch [9/10], Loss: 0.6075681913525417
Epoch [10/10], Loss: 0.6192569209519584


# BiLSTM model:

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split


In [17]:

# Load NER dataset from CSV
ner_data = pd.read_csv("NER dataset.csv")

# Fill missing values in the "Sentence #" column with forward fill
ner_data["Sentence #"].fillna(method='ffill', inplace=True)

# Drop rows with NaN values in the "Word" column
ner_data.dropna(subset=["Word"], inplace=True)

# Define label dictionary
label_dict = {"O": 0, "B-geo": 1, "I-geo": 2, "B-per": 3, "I-per": 4, "B-org": 5, "I-org": 6, "B-tim": 7, "I-tim": 8}

# Map labels to numerical format
ner_data["Tag"] = ner_data["Tag"].apply(lambda x: label_dict.get(x, 0))  # Use get() to handle unknown labels

# Tokenize and preprocess NER dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the words
tokenized_texts = [tokenizer.tokenize(word) for word in ner_data["Word"]]

# Convert tokens to input IDs
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]

# Pad input IDs and attention masks
input_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor(ids) for ids in input_ids], batch_first=True)
attention_masks = torch.ones_like(input_ids)

# Convert labels to tensor
labels = torch.tensor(ner_data["Tag"].values)

# Split data into training and testing sets
train_input_ids, test_input_ids, train_attention_masks, test_attention_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Define BiLSTM model
class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x.float())  # Convert input to float
        tag_space = self.hidden2tag(lstm_out)
        return tag_space

# Initialize model, loss function, and optimizer
input_dim = 768  # BERT hidden size
hidden_dim = 256
output_dim = len(label_dict)
model = BiLSTM(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
batch_size = 32

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs.view(-1, output_dim), labels.view(-1))
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader)}')


  ner_data["Sentence #"].fillna(method='ffill', inplace=True)


Epoch [1/10], Loss: 0.6255384719277438
Epoch [2/10], Loss: 0.6260366969533874
Epoch [3/10], Loss: 0.6242956326251639
Epoch [4/10], Loss: 0.6267553601573834
Epoch [5/10], Loss: 0.6282630300467282
Epoch [6/10], Loss: 0.6098856361192441
Epoch [7/10], Loss: 0.6142208872904037
Epoch [8/10], Loss: 0.5979904679407799
Epoch [9/10], Loss: 0.5912402412456773
Epoch [10/10], Loss: 0.5871240579254035


In [22]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

def main():
    # Load the pre-trained tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Define the number of labels
    num_labels = 10  # Update this with the actual number of labels in your task

    # Load the pre-trained model
    model = BertForTokenClassification.from_pretrainedm_pretrained('bert-base-uncased', num_labels=num_labels)

    # Example input text
    input_text = "Your input text goes here."

    # Tokenize input text
    tokens = tokenizer.tokenize(input_text)

    # Convert tokens to input IDs
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Add special tokens ([CLS] and [SEP])
    input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]

    # Convert input IDs to tensor
    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension

    # Forward pass through the model
    outputs = model(input_ids_tensor)

    # Print the output
    print(outputs)

if __name__ == "__main__":
    main()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TokenClassifierOutput(loss=None, logits=tensor([[[-2.7589e-01, -1.6294e-01,  1.8303e-01,  5.0541e-01,  1.8769e-01,
          -1.9986e-01,  1.9181e-02,  1.5626e-01,  1.1186e-04,  5.2143e-01],
         [-2.0719e-01,  4.0094e-01,  3.4007e-01,  6.7458e-01, -7.3232e-02,
           5.2242e-02, -2.1318e-01, -4.6699e-01,  4.3942e-01,  8.6241e-01],
         [-1.0086e-01,  1.9542e-02,  6.4063e-01,  4.8706e-01, -1.0334e-01,
          -1.6914e-01, -5.6874e-01, -2.3810e-01,  3.5291e-01,  6.8764e-01],
         [-9.9619e-02,  1.7005e-01,  6.3895e-02,  2.8824e-01, -1.3279e-01,
           3.0054e-02, -4.2966e-01, -3.3145e-01,  3.2864e-01,  6.5428e-01],
         [-2.3264e-01,  3.6195e-04,  1.3298e-03,  3.6684e-01, -4.3409e-01,
           2.2665e-01, -3.1214e-01, -4.5459e-01,  2.4301e-01,  7.1796e-01],
         [-2.4147e-01,  2.5983e-01,  4.9858e-01,  2.8965e-01, -5.1520e-01,
          -1.0676e-01, -5.4999e-01, -3.7139e-01,  6.2743e-01,  8.1652e-01],
         [-9.7637e-02,  3.9464e-01,  3.4407e-01,  1.77

In [24]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

def load_model(num_labels):
    # Load the pre-trained model
    model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    return model

def tokenize_input(input_text, tokenizer):
    # Tokenize input text
    tokens = tokenizer.tokenize(input_text)
    # Convert tokens to input IDs
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Add special tokens ([CLS] and [SEP])
    input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
    # Convert input IDs to tensor
    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
    return input_ids_tensor

def predict(input_text, model, tokenizer):
    # Tokenize and prepare input
    input_ids_tensor = tokenize_input(input_text, tokenizer)
    # Forward pass through the model
    outputs = model(input_ids_tensor)
    return outputs

def main():
    # Load the pre-trained tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Define the number of labels
    num_labels = 10  # Update this with the actual number of labels in your task

    # Load the pre-trained model
    model = load_model(num_labels)

    # Example input text
    input_text = input("Enter your input text: ")

    # Perform prediction
    outputs = predict(input_text, model, tokenizer)

    # Print the output
    print(outputs)

if __name__ == "__main__":
    main()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter your input text:  hy


TokenClassifierOutput(loss=None, logits=tensor([[[-0.0773,  0.2228,  0.2066, -0.1010,  0.1446, -0.3053,  0.7207,
          -0.1326, -0.0672, -0.3408],
         [-0.2922,  0.5091, -0.2874,  0.0673, -0.3962, -0.3078,  0.4000,
           0.1471, -0.1870,  0.0928],
         [ 0.2592, -0.0222, -0.2055,  0.1457,  0.0414, -0.4768, -0.3498,
          -0.0239, -0.0486, -0.3256],
         [ 0.2420, -0.4576,  0.0630,  0.0605,  0.2620, -0.1884, -0.3167,
           0.2868,  0.0144, -0.0121]]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)
