In [None]:
import nltk
from nltk.corpus import treebank
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
sentences = treebank.tagged_sents(tagset='universal')


In [None]:
from nltk.tag import hmm
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(sentences, test_size=0.2, random_state=1)

trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

def pos_tag_sents(sentences, tagger):
    return [tagger.tag([token for token, tag in sent]) for sent in sentences]

predicted_tags = pos_tag_sents(test_data, tagger)
ground_truth_tags = [[tag for token, tag in sent] for sent in test_data]
predicted_tags_flat = [tag for sent in predicted_tags for word, tag in sent]
ground_truth_tags_flat = [tag for sent in ground_truth_tags for tag in sent]

accuracy = accuracy_score(ground_truth_tags_flat, predicted_tags_flat)
print("Accuracy:", accuracy)

print(classification_report(ground_truth_tags_flat, predicted_tags_flat))


  O[i, k] = self._output_logprob(si, self._symbols[k])
  O[i, k] = self._output_logprob(si, self._symbols[k])


Accuracy: 0.5784538653366583
              precision    recall  f1-score   support

           .       1.00      0.35      0.52      2239
         ADJ       0.91      0.34      0.50      1333
         ADP       0.95      0.42      0.59      2048
         ADV       0.87      0.47      0.61       673
        CONJ       0.99      0.37      0.54       425
         DET       0.97      0.47      0.64      1773
        NOUN       0.40      0.99      0.57      5617
         NUM       0.99      0.39      0.56       656
        PRON       1.00      0.56      0.72       556
         PRT       0.96      0.43      0.59       628
        VERB       0.97      0.45      0.62      2779
           X       1.00      0.40      0.57      1323

    accuracy                           0.58     20050
   macro avg       0.92      0.47      0.58     20050
weighted avg       0.81      0.58      0.58     20050



In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import treebank
import numpy as np

nltk.download('treebank')
nltk.download('universal_tagset')

sentences = treebank.tagged_sents(tagset='universal')

tag_vocab = {tag: idx for idx, tag in enumerate(set(tag for sent in sentences for _, tag in sent))}
tag_map = {idx: tag for tag, idx in tag_vocab.items()}
neutral_label = next(iter(tag_map.values()))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class POSDataset(Dataset):
    def __init__(self, sentences, tag_vocab, tokenizer, max_len=50):
        self.sentences = sentences
        self.tag_vocab = tag_vocab
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.neutral_label_id = self.tag_vocab[neutral_label]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        words, tags = zip(*self.sentences[idx])
        token_ids = []
        target_tags = []

        for word, tag in zip(words, tags):
            word_tokens = self.tokenizer.tokenize(word)
            token_ids.extend(self.tokenizer.convert_tokens_to_ids(word_tokens))
            target_tags.extend([self.tag_vocab[tag]] * len(word_tokens))

        token_ids = token_ids[:self.max_len - 2]
        target_tags = target_tags[:self.max_len - 2]

        token_ids = [self.tokenizer.cls_token_id] + token_ids + [self.tokenizer.sep_token_id]
        target_tags = [self.neutral_label_id] + target_tags + [self.neutral_label_id]

        attention_mask = [1] * len(token_ids)
        padding = [0] * (self.max_len - len(token_ids))

        token_ids += padding
        target_tags += padding
        attention_mask += padding

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(target_tags, dtype=torch.long)
        }

train_sentences, test_sentences = train_test_split(sentences, test_size=0.1, random_state=42)
train_dataset = POSDataset(train_sentences, tag_vocab, tokenizer)
test_dataset = POSDataset(test_sentences, tag_vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tag_vocab))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)

model.train()
for epoch in range(3):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

model.eval()
total_eval_accuracy = 0
for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    labels = batch['labels']

    eval_accuracy = (predictions == labels).float().mean()
    total_eval_accuracy += eval_accuracy.item()

print(f"Accuracy on test set: {total_eval_accuracy / len(test_loader)}")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Loss: 2.498079776763916
Epoch: 0, Loss: 2.3634908199310303
Epoch: 0, Loss: 2.181957721710205
Epoch: 0, Loss: 2.072434663772583
Epoch: 0, Loss: 1.8704975843429565
Epoch: 0, Loss: 1.8824383020401
Epoch: 0, Loss: 1.6135367155075073
Epoch: 0, Loss: 1.5514596700668335
Epoch: 0, Loss: 1.599197506904602
Epoch: 0, Loss: 1.5060580968856812
Epoch: 0, Loss: 1.421540379524231
Epoch: 0, Loss: 1.3928208351135254
Epoch: 0, Loss: 1.404736042022705
Epoch: 0, Loss: 1.0662168264389038
Epoch: 0, Loss: 1.3046826124191284
Epoch: 0, Loss: 1.1316065788269043
Epoch: 0, Loss: 1.0534034967422485
Epoch: 0, Loss: 1.0392988920211792
Epoch: 0, Loss: 1.2104188203811646
Epoch: 0, Loss: 0.9639556407928467
Epoch: 0, Loss: 0.9271921515464783
Epoch: 0, Loss: 0.819990336894989
Epoch: 0, Loss: 0.9022027850151062
Epoch: 0, Loss: 0.8313694000244141
Epoch: 0, Loss: 0.8568758368492126
Epoch: 0, Loss: 0.7792721390724182
Epoch: 0, Loss: 0.8327773213386536
Epoch: 0, Loss: 0.7411781549453735
Epoch: 0, Loss: 0.835367441177

In [3]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
import nltk
from nltk.corpus import reuters, brown, twitter_samples

nltk.download('punkt')
nltk.download('reuters')
nltk.download('brown')
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger')

def prepare_corpus(corpus):
    if corpus == 'reuters':
        sentences = [nltk.pos_tag(sent, tagset='universal') for sent in reuters.sents()]
    elif corpus == 'brown':
        sentences = [nltk.pos_tag(sent, tagset='universal') for sent in brown.sents()]
    elif corpus == 'twitter':
        sentences = [nltk.pos_tag(nltk.word_tokenize(tweet), tagset='universal') for tweet in twitter_samples.strings()]
    else:
        raise ValueError("Unsupported corpus")
    return sentences

news_sentences = prepare_corpus('reuters')
literary_sentences = prepare_corpus('brown')
social_media_sentences = prepare_corpus('twitter')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [6]:
news_dataset = POSDataset(news_sentences, tag_vocab, tokenizer)
literary_dataset = POSDataset(literary_sentences, tag_vocab, tokenizer)
social_media_dataset = POSDataset(social_media_sentences, tag_vocab, tokenizer)

news_loader = DataLoader(news_dataset, batch_size=16, shuffle=False)
literary_loader = DataLoader(literary_dataset, batch_size=16, shuffle=False)
social_media_loader = DataLoader(social_media_dataset, batch_size=16, shuffle=False)

def evaluate_model(dataloader):
    model.eval()
    total_eval_accuracy = 0
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = batch['labels']

        eval_accuracy = (predictions == labels).float().mean()
        total_eval_accuracy += eval_accuracy.item()

    return total_eval_accuracy / len(dataloader)

news_accuracy = evaluate_model(news_loader)
literary_accuracy = evaluate_model(literary_loader)
social_media_accuracy = evaluate_model(social_media_loader)

print(f"Accuracy on news corpus: {news_accuracy}")
print(f"Accuracy on literary corpus: {literary_accuracy}")
print(f"Accuracy on social media corpus: {social_media_accuracy}")


Accuracy on news corpus: 0.9226219332706161
Accuracy on literary corpus: 0.9697535122146032
Accuracy on social media corpus: 0.8447406478881836


In [7]:
from collections import defaultdict

def error_analysis(dataloader):
    model.eval()
    errors = defaultdict(list)

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = batch['labels']

        for i in range(labels.size(0)):
            for j in range(labels.size(1)):
                if labels[i, j] != predictions[i, j] and labels[i, j] != tokenizer.pad_token_id:
                    word = tokenizer.convert_ids_to_tokens(batch['input_ids'][i][j].item())
                    true_tag = tag_map[labels[i, j].item()]
                    predicted_tag = tag_map[predictions[i, j].item()]
                    errors[(true_tag, predicted_tag)].append(word)

    return errors

news_errors = error_analysis(news_loader)
literary_errors = error_analysis(literary_loader)
social_media_errors = error_analysis(social_media_loader)

print("News Corpus Errors:", dict(news_errors))
print("Literary Corpus Errors:", dict(literary_errors))
print("Social Media Corpus Errors:", dict(social_media_errors))


Social Media Corpus Errors: {('ADP', 'DET'): ['that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 'that', 't