In [10]:
!pip3 install pytorch-lightning -q -U

In [11]:
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.utilities.model_summary import ModelSummary
import json
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
EMBEDDING_DIM = [512, 512]
HIDDEN_DIM    = [512, 512]
NUM_EPOCHS    = [10, 10]
BATCH_SIZE    = [64, 64]
COARSE = 0
FINE = 1

In [13]:
fine_to_coarse = {"I-AnatomicalStructure": "Medical", "B-AnatomicalStructure": "Medical", "I-Symptom": "Medical", "B-AerospaceManufacturer": "Group", "I-SportsGRP": "Group", "B-SportsGRP": "Group", "I-CarManufacturer": "Group", "B-CarManufacturer": "Group", "I-TechCorp": "Group", "B-Symptom": "Medical", "I-Disease": "Medical", "I-VisualWork": "Creative Works", "B-VisualWork": "Creative Works", "I-MusicalWork": "Creative Works", "B-MusicalWork": "Creative Works", "I-WrittenWork": "Creative Works", "B-WrittenWork": "Creative Works", "I-ArtWork": "Creative Works", "B-ArtWork": "Creative Works", "I-Software": "Creative Works", "B-Software": "Creative Works", "I-OtherCW": "Creative Works", "B-OtherCW": "Creative Works", "I-MusicalGRP": "Group", "B-MusicalGRP": "Group", "I-PublicCorp": "Group", "B-PublicCorp": "Group", "I-PrivateCorp": "Group", "B-PrivateCorp": "Group", "I-OtherCorp": "Group", "B-OtherCorp": "Group", "I-AerospaceManufacturer": "Group", "B-TechCorp": "Group", "I-ORG": "Group", "B-ORG": "Group", "I-Scientist": "Person", "B-Scientist": "Person", "I-Artist": "Person", "B-Artist": "Person", "I-Athlete": "Person", "B-Athlete": "Person", "I-Politician": "Person", "B-Politician": "Person", "I-Cleric": "Person", "B-Cleric": "Person", "I-SportsManager": "Person", "B-SportsManager": "Person", "I-OtherPER": "Person", "B-OtherPER": "Person", "I-Clothing": "Product", "B-Clothing": "Product", "I-Vehicle": "Product", "B-Vehicle": "Product", "I-Food": "Product", "B-Food": "Product", "I-Drink": "Product", "B-Drink": "Product", "I-OtherPROD": "Product", "B-OtherPROD": "Product", "I-Medication/Vaccine": "Medical", "B-Medication/Vaccine": "Medical", "I-MedicalProcedure": "Medical", "B-MedicalProcedure": "Medical", "B-Disease": "Medical", "I-Facility": "Location", "B-Facility": "Location", "I-OtherLOC": "Location", "B-OtherLOC": "Location", "I-HumanSettlement": "Location", "B-HumanSettlement": "Location", "I-Station": "Location", "B-Station": "Location", "O": "O"}

In [14]:
def get_tokens(file_path, model_type):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence = []
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                sentences.append(sentence)
                sentence = []
                continue
            if not line:
                continue
            tokens = line.split(' ')
            if model_type == COARSE:
                sentence.append([tokens[0], fine_to_coarse[tokens[3]]])
            else:
                sentence.append([tokens[0], tokens[3]])
        sentences.append(sentence)
    return sentences

def preprocess(dataset, SEQ_LEN, tag_to_idx, word_to_idx, train=True):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    if train:
        for sentence_tags in tags:
            for tag in sentence_tags:
                if tag not in tag_to_idx:
                    tag_to_idx[tag] = len(tag_to_idx)
        
        for sentence in sent:
            for word in sentence:
                if word not in word_to_idx:
                    word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y

class BiLSTM_Model(pl.LightningModule):
    def __init__(self, vocab_size, target_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, target_size)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        embedding = self.embedding(x)
        output, _ = self.lstm(embedding)
        output = self.fc(output)
        return nn.functional.log_softmax(output, dim=2)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        return optim.Adam(self.parameters())

In [15]:
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}
idx_to_tag = {v: k for k, v in tag_to_idx.items()}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

def run_model(train_file_path, dev_file_path, test_file_path, lang_name, model_type=FINE):
    SEQ_LEN = 25

    trainX, trainY = preprocess(get_tokens(train_file_path, model_type)[1:], SEQ_LEN, tag_to_idx, word_to_idx)
    devX, devY = preprocess(get_tokens(dev_file_path, model_type)[1:], SEQ_LEN, tag_to_idx, word_to_idx, train=False)
    testX, testY = preprocess(get_tokens(test_file_path, model_type)[1:],SEQ_LEN, tag_to_idx, word_to_idx, train=False)

    # print shapes
    print("Shape of trainX: ", trainX.shape, ", Shape of trainY: ", trainY.shape)
    print("Shape of devX: ", devX.shape, ", Shape of devY: ", devY.shape)
    print("Shape of testX: ", testX.shape, ", Shape of testY: ", testY.shape)

    train_dataset = TensorDataset(trainX, trainY)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE[model_type], shuffle=True, num_workers=2)
    val_dataset = TensorDataset(devX, devY)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE[model_type], shuffle=False, num_workers=2)
    test_dataset = TensorDataset(testX, testY)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE[model_type], shuffle=False, num_workers=2)

    # define model in a global scope

    global model
    model = BiLSTM_Model(len(word_to_idx), len(tag_to_idx), EMBEDDING_DIM[model_type], HIDDEN_DIM[model_type], 1)
    print(ModelSummary(model))
    early_stop_callback = EarlyStopping(monitor='val_loss', patience=5, mode='min')
    trainer = pl.Trainer(max_epochs=NUM_EPOCHS[model_type], callbacks=[early_stop_callback], accelerator='gpu', devices=1)
    trainer.fit(model, train_loader, val_loader)
    trainer.test(dataloaders=test_loader)
    
    # save parameters
    if model_type == COARSE:
        torch.save(model.state_dict(), f'/content/drive/MyDrive/DL_A2/Models/BiLSTM_coarse_{lang_name}.pt')
    else:
        torch.save(model.state_dict(), f'/content/drive/MyDrive/DL_A2/Models/BiLSTM_fine_{lang_name}.pt')

    # Print 1 sentence for debugging
    model.eval()

    y_true = []
    y_pred = []
                    
    with torch.no_grad():
        for x, y in test_loader:
            # Move the data to the device
            x = x.to(device)
            y = y.to(device)
            model = model.to(device)

            # Forward pass
            y_hat = model(x)

            # Get back the sentence
            x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

            # Compute the predicted tags
            y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

            # Compute the true tags
            y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
            print("---------------Original Sentence---------------")
            for i in range(SEQ_LEN):
                print(x_sent[i], end=" ")
            print()
            print("---------------Predicted tags---------------")
            for i in range(SEQ_LEN):
                print(y_pred[i], end=" ")
            print()
            break
    
    
    # Classification Report
    global idx_to_tag
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}
    model.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for x, y in test_loader:
            # Move the data to the device
            x = x.to(device)
            y = y.to(device)

            # Forward pass
            y_hat = model(x)

            # Compute the predicted tags
            y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

            # Compute the true tags
            y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

    print(classification_report(y_true, y_pred))

# Mount GDrive

In [16]:
from google.colab import drive
drive.mount('/content/drive')
folder_path= "/content/drive/MyDrive/DL_A2/Data"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Hindi Language Models

In [17]:
train_file_path = folder_path + "/HI-Hindi/hi_train.conll"
dev_file_path = folder_path + "/HI-Hindi/hi_dev.conll"
test_file_path = folder_path + "/HI-Hindi/hi_test.conll"

## Coarse Model

In [19]:
run_model(train_file_path, dev_file_path, test_file_path, "hi", model_type=COARSE)

Shape of trainX:  torch.Size([9632, 25]) , Shape of trainY:  torch.Size([9632, 25])
Shape of devX:  torch.Size([514, 25]) , Shape of devY:  torch.Size([514, 25])
Shape of testX:  torch.Size([18399, 25]) , Shape of testY:  torch.Size([18399, 25])
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 8.8 M 
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
13.1 M    Trainable params
0         Non-trainable params
13.1 M    Total params
52.304    Total estimated model params size (MB)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 8.8 M 
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
13.1 M    Trainable params
0         Non-trainable params
13.1 M    Total params
52.304    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_2/checkpoints/epoch=7-step=1208.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_2/checkpoints/epoch=7-step=1208.ckpt


Testing: 0it [00:00, ?it/s]

---------------Original Sentence---------------
उनकी विशेषताओं आंदोलनों और खेल शैली के कारण उनकी तुलना <UNK> <UNK> से की गई है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
---------------Predicted tags---------------
O O O O O O O O O O O O O O O O <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
                precision    recall  f1-score   support

         <PAD>       1.00      1.00      1.00    172076
Creative Works       0.87      0.48      0.62      6689
         Group       0.91      0.77      0.84      9441
      Location       0.89      0.63      0.73     10394
       Medical       0.81      0.72      0.76      2747
             O       0.94      0.99      0.97    244083
        Person       0.88      0.54      0.67     12214
       Product       0.75      0.59      0.66      2331

      accuracy                           0.96    459975
     macro avg       0.88      0.72      0.78    459975
  weighted avg       0.96      0.96      0.96    459975



## Fine Model

In [18]:
run_model(train_file_path, dev_file_path, test_file_path, "hi", model_type=FINE)

Shape of trainX:  torch.Size([9632, 25]) , Shape of trainY:  torch.Size([9632, 25])
Shape of devX:  torch.Size([514, 25]) , Shape of devY:  torch.Size([514, 25])
Shape of testX:  torch.Size([18399, 25]) , Shape of testY:  torch.Size([18399, 25])
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 8.8 M 
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 69.7 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
13.1 M    Trainable params
0         Non-trainable params
13.1 M    Total params
52.280    Total estimated model params size (MB)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 8.8 M 
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 69.7 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
13.1 M    Trainable params
0         Non-trainable params
13.1 M    Total params
52.280    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_1/checkpoints/epoch=8-step=1359.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_1/checkpoints/epoch=8-step=1359.ckpt


Testing: 0it [00:00, ?it/s]

---------------Original Sentence---------------
उनकी विशेषताओं आंदोलनों और खेल शैली के कारण उनकी तुलना <UNK> <UNK> से की गई है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
---------------Predicted tags---------------
O O O O O O O O O O B-HumanSettlement O O O O O <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00    172076
B-AerospaceManufacturer       0.50      0.06      0.11        85
  B-AnatomicalStructure       0.84      0.67      0.74       485
              B-ArtWork       0.10      0.00      0.00       425
               B-Artist       0.68      0.41      0.51      1847
              B-Athlete       0.83      0.58      0.69      1166
      B-CarManufacturer       0.85      0.86      0.86       146
               B-Cleric       0.78      0.86      0.82       188
             B-Clothing       0.73      0.76      0.75        75
              B-Di

# Bengali Language Models

In [20]:
train_file_path = folder_path + "/BN-Bangla/bn_train.conll"
dev_file_path = folder_path + "/BN-Bangla/bn_dev.conll"
test_file_path = folder_path + "/BN-Bangla/bn_test.conll"

## Coarse Model

In [22]:
run_model(train_file_path, dev_file_path, test_file_path, "bn", model_type=COARSE)

Shape of trainX:  torch.Size([9708, 25]) , Shape of trainY:  torch.Size([9708, 25])
Shape of devX:  torch.Size([507, 25]) , Shape of devY:  torch.Size([507, 25])
Shape of testX:  torch.Size([19859, 25]) , Shape of testY:  torch.Size([19859, 25])
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 20.2 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
24.5 M    Trainable params
0         Non-trainable params
24.5 M    Total params
97.821    Total estimated model params size (MB)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 20.2 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
24.5 M    Trainable params
0         Non-trainable params
24.5 M    Total params
97.821    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_4/checkpoints/epoch=7-step=1216.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_4/checkpoints/epoch=7-step=1216.ckpt


Testing: 0it [00:00, ?it/s]

---------------Original Sentence---------------
প্রোপেলারটি একটি ডি <UNK> এয়ারক্রাফ্ট কোম্পানি স্থির পিচ টাইপ ছিল। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
---------------Predicted tags---------------
O O O O O Group O O O O <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
                precision    recall  f1-score   support

         <PAD>       1.00      1.00      1.00    240692
Creative Works       0.76      0.59      0.67      8334
         Group       0.87      0.78      0.82      8621
      Location       0.84      0.73      0.78     10677
       Medical       0.83      0.69      0.75      2585
             O       0.95      0.99      0.97    208686
        Person       0.86      0.70      0.77     14628
       Product       0.78      0.55      0.65      2252

      accuracy                           0.96    496475
     macro avg       0.86      0.75      0.80    496475
  weighted avg 

## Fine Model

In [21]:
run_model(train_file_path, dev_file_path, test_file_path, "bn", model_type=FINE)

Shape of trainX:  torch.Size([9708, 25]) , Shape of trainY:  torch.Size([9708, 25])
Shape of devX:  torch.Size([507, 25]) , Shape of devY:  torch.Size([507, 25])
Shape of testX:  torch.Size([19859, 25]) , Shape of testY:  torch.Size([19859, 25])


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 20.2 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
24.5 M    Trainable params
0         Non-trainable params
24.5 M    Total params
97.821    Total estimated model params size (MB)


  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 20.2 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
24.5 M    Trainable params
0         Non-trainable params
24.5 M    Total params
97.821    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_3/checkpoints/epoch=8-step=1368.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_3/checkpoints/epoch=8-step=1368.ckpt


Testing: 0it [00:00, ?it/s]

---------------Original Sentence---------------
প্রোপেলারটি একটি ডি <UNK> এয়ারক্রাফ্ট কোম্পানি স্থির পিচ টাইপ ছিল। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
---------------Predicted tags---------------
O O O B-Food I-Artist B-ORG O O O O <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00    240692
B-AerospaceManufacturer       0.30      0.06      0.10        97
  B-AnatomicalStructure       0.89      0.55      0.68       532
              B-ArtWork       0.31      0.01      0.02       455
               B-Artist       0.52      0.48      0.50      2744
              B-Athlete       0.62      0.28      0.39      1086
      B-CarManufacturer       0.77      0.90      0.83        84
               B-Cleric       0.47      0.60      0.52       240
             B-Clothing       0.21 

# English Language Models

In [23]:
train_file_path = folder_path + "/EN-English/en_train.conll"
dev_file_path = folder_path + "/EN-English/en_dev.conll"
test_file_path = folder_path + "/EN-English/en_test.conll"

## Coarse Model

In [25]:
run_model(train_file_path, dev_file_path, test_file_path, "en", model_type=COARSE)

Shape of trainX:  torch.Size([16778, 25]) , Shape of trainY:  torch.Size([16778, 25])
Shape of devX:  torch.Size([871, 25]) , Shape of devY:  torch.Size([871, 25])
Shape of testX:  torch.Size([249980, 25]) , Shape of testY:  torch.Size([249980, 25])


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 37.5 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
41.8 M    Trainable params
0         Non-trainable params
41.8 M    Total params
167.115   Total estimated model params size (MB)


  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 37.5 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
41.8 M    Trainable params
0         Non-trainable params
41.8 M    Total params
167.115   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_6/checkpoints/epoch=6-step=1841.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_6/checkpoints/epoch=6-step=1841.ckpt


Testing: 0it [00:00, ?it/s]

---------------Original Sentence---------------
the species was described by <UNK> <UNK> after the <UNK> t. f. <UNK> . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
---------------Predicted tags---------------
O O O O O Person Person O O O O Person Person O <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

          <PAD>       1.00      1.00      1.00   2499089
      B-Disease       0.00      0.00      0.00         0
     B-Facility       0.00      0.00      0.00         0
     B-OtherPER       0.00      0.00      0.00         0
B-SportsManager       0.00      0.00      0.00         0
 Creative Works       0.64      0.55      0.59    169176
          Group       0.57      0.58      0.58    133847
        I-Drink       0.00      0.00      0.00         0
     I-Facility       0.00      0.00      0.00         0
     I-OtherLOC       0.00      0.00      0.00         0
     I-OtherPER       0.00      0.00      0.00         0
    I-OtherPROD       0.00      0.00      0.00         0
  I-PrivateCorp       0.00      0.00      0.00         0
  I-WrittenWork       0.00      0.00      0.00         0
       Location       0.79      0.61      0.69    130520
        Medical       0.65      0.26      0.38     33019
              O       0.93    

  _warn_prf(average, modifier, msg_start, len(result))


## Fine Model

In [24]:
run_model(train_file_path, dev_file_path, test_file_path, "en", model_type=FINE)

Shape of trainX:  torch.Size([16778, 25]) , Shape of trainY:  torch.Size([16778, 25])
Shape of devX:  torch.Size([871, 25]) , Shape of devY:  torch.Size([871, 25])
Shape of testX:  torch.Size([249980, 25]) , Shape of testY:  torch.Size([249980, 25])


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 37.5 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
41.8 M    Trainable params
0         Non-trainable params
41.8 M    Total params
167.115   Total estimated model params size (MB)


  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 37.5 M
1 | lstm      | LSTM             | 4.2 M 
2 | fc        | Linear           | 75.9 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
41.8 M    Trainable params
0         Non-trainable params
41.8 M    Total params
167.115   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_5/checkpoints/epoch=7-step=2104.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_5/checkpoints/epoch=7-step=2104.ckpt


Testing: 0it [00:00, ?it/s]

---------------Original Sentence---------------
the species was described by <UNK> <UNK> after the <UNK> t. f. <UNK> . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
---------------Predicted tags---------------
O O O B-WrittenWork O B-OtherPER I-OtherPER O O O B-OtherPER I-OtherPER I-OtherPER O <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 
                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00   2499089
B-AerospaceManufacturer       0.41      0.45      0.42      1013
  B-AnatomicalStructure       0.55      0.28      0.37      5824
              B-ArtWork       0.29      0.19      0.23      1264
               B-Artist       0.56      0.57      0.56     56981
              B-Athlete       0.63      0.46      0.53     27554
      B-CarManufacturer       0.50      0.33      0.40      2977
               B-Cleric       0.32      0.24      0.27      4725
             B-Clothing       0

In [46]:
# Function to get predictions for a sentence
def get_prediction(sentence):
    # Split the sentence into words
    words = sentence.split()
    for word in words:
      word = word.lower()

    # Get the index of the words
    X = []
    for word in words:
        if word not in word_to_idx:
            X.append(word_to_idx["<UNK>"])
        else:
            X.append(word_to_idx[word])
    # print(X)
    # X = [word_to_idx[word] for word in words]
    
    # Create a tensor
    X = torch.tensor(X).unsqueeze(0)

    global model
    X = X.to(device)
    model = model.to(device)
    
    # Get the predictions
    y_hat = model(X)
    
    # Get the predictions
    predictions = [idx_to_tag[pred] for pred in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

    # Give new sentence
    new_sentence = []
    flag = False
    tag = ""
    for word, pred in zip(words, predictions):
        print(word,pred)
        if pred.startswith("I"):
            # this is a continuation of the previous word
            new_sentence.append(word)
            continue
        
        if flag:
            new_sentence.append("]")
            flag = False
            new_sentence.append("<" + tag + ">")
        
        if pred.startswith("B"):
            new_sentence.append("[")
            new_sentence.append(word)
            flag = True
            tag = pred[2:]
            
        if pred.startswith("O"):
            new_sentence.append(word)
        
        if pred.__len__() == 0:
            new_sentence.append(word)
        
    if flag:
        new_sentence.append("]")
        new_sentence.append("<" + tag + ">")
    
    return " ".join(new_sentence)