1. necessary libraries

In [None]:
# Importing necessary libraries
import nltk
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.tag import hmm

# Download required NLTK data
nltk.download('treebank')
nltk.download('universal_tagset')

# Load tagged sentences from the Treebank corpus using the 'universal' tagset
sentences = treebank.tagged_sents(tagset='universal')


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


2. data splitting and training hmm

In [None]:
# Split data into training and testing sets
trainData, testData = train_test_split(sentences, test_size=0.2, random_state=42)

# Initialize and train the Hidden Markov Model POS tagger using training data
hmmTrainer = hmm.HiddenMarkovModelTrainer()
hmmTagger = hmmTrainer.train_supervised(trainData)


3. function for tagging and evaluation

In [None]:
# Function to tag sentences using the trained HMM tagger
def tagSentences(sentences, tagger):
    return [tagger.tag([word for word, _ in sent]) for sent in sentences]

# Tag test data and extract true tags for evaluation
predictedTags = tagSentences(testData, hmmTagger)
trueTags = [[tag for _, tag in sent] for sent in testData]

# Flatten predicted and true tags for calculating accuracy
predictedTagsFlat = [tag for sent in predictedTags for _, tag in sent]
trueTagsFlat = [tag for tags in trueTags for tag in tags]

# Calculate and display the accuracy and classification report
accuracy = accuracy_score(trueTagsFlat, predictedTagsFlat)
print("HMM Tagger Accuracy:", accuracy)
print("Classification Report:\n", classification_report(trueTagsFlat, predictedTagsFlat))


  O[i, k] = self._output_logprob(si, self._symbols[k])
  O[i, k] = self._output_logprob(si, self._symbols[k])


HMM Tagger Accuracy: 0.5984232809382452
Classification Report:
               precision    recall  f1-score   support

           .       1.00      0.37      0.54      2354
         ADJ       0.92      0.35      0.51      1316
         ADP       0.96      0.44      0.61      2028
         ADV       0.85      0.50      0.63       634
        CONJ       0.99      0.42      0.59       471
         DET       0.98      0.52      0.68      1795
        NOUN       0.42      0.99      0.59      5943
         NUM       1.00      0.37      0.54       727
        PRON       1.00      0.56      0.72       523
         PRT       0.96      0.46      0.62       658
        VERB       0.97      0.48      0.64      2740
           X       1.00      0.40      0.58      1360

    accuracy                           0.60     20549
   macro avg       0.92      0.49      0.60     20549
weighted avg       0.81      0.60      0.60     20549



4. bert based pos tagging

In [None]:
# Import libraries for BERT-based token classification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, AdamW

# Define tokenizer and map tags to indices
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tagToIndex = {tag: i for i, tag in enumerate(set(tag for sent in sentences for _, tag in sent))}
indexToTag = {i: tag for tag, i in tagToIndex.items()}
defaultTag = next(iter(indexToTag.values()))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



5. custom dataset for bert

In [None]:
# Custom dataset class for BERT token classification
class PosTaggingDataset(Dataset):
    def __init__(self, sentences, tagToIndex, tokenizer, maxLen=50):
        self.sentences = sentences
        self.tagToIndex = tagToIndex
        self.tokenizer = tokenizer
        self.maxLen = maxLen
        self.defaultTagId = tagToIndex[defaultTag]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        words, tags = zip(*self.sentences[idx])
        tokenIds, tagIds = [], []

        for word, tag in zip(words, tags):
            tokens = self.tokenizer.tokenize(word)
            tokenIds.extend(self.tokenizer.convert_tokens_to_ids(tokens))
            tagIds.extend([self.tagToIndex[tag]] * len(tokens))

        tokenIds = tokenIds[:self.maxLen - 2]
        tagIds = tagIds[:self.maxLen - 2]

        tokenIds = [self.tokenizer.cls_token_id] + tokenIds + [self.tokenizer.sep_token_id]
        tagIds = [self.defaultTagId] + tagIds + [self.defaultTagId]

        attentionMask = [1] * len(tokenIds)
        paddingLength = self.maxLen - len(tokenIds)

        tokenIds.extend([0] * paddingLength)
        tagIds.extend([0] * paddingLength)
        attentionMask.extend([0] * paddingLength)

        return {
            'input_ids': torch.tensor(tokenIds, dtype=torch.long),
            'attention_mask': torch.tensor(attentionMask, dtype=torch.long),
            'labels': torch.tensor(tagIds, dtype=torch.long)
        }


6. dataloader for training and testing

In [None]:
# Split sentences for training and testing the BERT model
trainSentences, testSentences = train_test_split(sentences, test_size=0.7, random_state=42)

# Create dataset and dataloaders for training and testing
trainDataset = PosTaggingDataset(trainSentences, tagToIndex, tokenizer)
testDataset = PosTaggingDataset(testSentences, tagToIndex, tokenizer)
trainLoader = DataLoader(trainDataset, batch_size=16, shuffle=True)
testLoader = DataLoader(testDataset, batch_size=16)


7. training

In [None]:
# Initialize BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tagToIndex))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer for training
optimizer = AdamW(model.parameters(), lr=3e-5)

# Train the model for 3 epochs
model.train()
for epoch in range(1):
  for batch in trainLoader:
      inputs = {key: value.to(device) for key, value in batch.items()}
      outputs = model(**inputs)
      loss = outputs.loss

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 2.562995195388794
Epoch 1, Loss: 2.4377481937408447
Epoch 1, Loss: 2.24756121635437
Epoch 1, Loss: 2.078918218612671
Epoch 1, Loss: 1.9286404848098755
Epoch 1, Loss: 1.8760747909545898
Epoch 1, Loss: 1.6812808513641357
Epoch 1, Loss: 1.6727659702301025
Epoch 1, Loss: 1.6093882322311401
Epoch 1, Loss: 1.5507190227508545
Epoch 1, Loss: 1.4627387523651123
Epoch 1, Loss: 1.3129745721817017
Epoch 1, Loss: 1.3690037727355957
Epoch 1, Loss: 1.1717495918273926
Epoch 1, Loss: 0.8993451595306396
Epoch 1, Loss: 1.1079081296920776
Epoch 1, Loss: 1.1059898138046265
Epoch 1, Loss: 0.9135486483573914
Epoch 1, Loss: 0.8627169728279114
Epoch 1, Loss: 0.9200090169906616
Epoch 1, Loss: 0.8729161620140076
Epoch 1, Loss: 0.9531255960464478
Epoch 1, Loss: 0.9494646191596985
Epoch 1, Loss: 0.8843047618865967
Epoch 1, Loss: 0.9583133459091187
Epoch 1, Loss: 0.7142061591148376
Epoch 1, Loss: 0.7461532354354858
Epoch 1, Loss: 0.8234832286834717
Epoch 1, Loss: 0.7481887936592102
Epoch 1, Loss: 0.7

8. evaluating

In [None]:
# Function to evaluate the model using a dataloader
def evaluate_model(dataLoader):
    model.eval()
    totalAccuracy = 0
    for batch in dataLoader:
        inputs = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = inputs['labels']

        accuracy = (predictions == labels).float().mean()
        totalAccuracy += accuracy.item()

    return totalAccuracy / len(dataLoader)

# Evaluate on the test set and display the accuracy
testAccuracy = evaluate_model(testLoader)
print(f"BERT Model Accuracy on Test Set: {testAccuracy}")


BERT Model Accuracy on Test Set: 0.9678924388663713


9.  error analysis





In [None]:
from collections import defaultdict

# Function to perform error analysis
def perform_error_analysis(dataLoader):
    model.eval()
    errorDetails = defaultdict(list)

    for batch in dataLoader:
        inputs = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = inputs['labels']

        for i in range(labels.shape[0]):
            for j in range(labels.shape[1]):
                if labels[i, j] != predictions[i, j] and labels[i, j] != tokenizer.pad_token_id:
                    word = tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][j].item())
                    trueTag = indexToTag[labels[i, j].item()]
                    predictedTag = indexToTag[predictions[i, j].item()]
                    errorDetails[(trueTag, predictedTag)].append(word)

    return errorDetails


10. error testing

In [None]:
# Perform error analysis on the test set
testErrors = perform_error_analysis(testLoader)

print("Test Set Errors:", dict(testErrors))


