Just using this notebook to write and easily test the code for the baseline model. The final implementation will be in a .py script, so it can be run from the command line on a GPU.


# To do!
- create function to extract data to train model
- create function to output tags into appropriate format
- make sure model works
- train model
- submit results



In [1]:
import numpy as np
import torch
import TorchCRF
from torch import nn

#Creating the class for the baseline Model

class baselineModel(torch.nn.Module):
    """Baseline bidirectional-LSTM sequence tagger.

    Token indices are embedded, passed through a bidirectional LSTM and
    projected to one score per tag. ``forward`` returns, for every token,
    log-probabilities over the tag set.

    Args:
        nWords: Vocabulary size (number of rows in the embedding table).
        tags: Collection of tag labels; its length sets the output size.
        dimEmbed: Dimension of the word embeddings.
        dimHidden: Hidden size of the LSTM, per direction.
    """

    def __init__(self, nWords, tags, dimEmbed, dimHidden):
        super().__init__()
        self.dimEmbed = dimEmbed
        self.dimHidden = dimHidden
        self.vocabSize = nWords
        self.tagSetSize = len(tags)
        self.tagSet = tags

        self.embed = nn.Embedding(nWords, dimEmbed)
        self.LSTM = nn.LSTM(dimEmbed, dimHidden, batch_first=True, bidirectional=True)
        # The LSTM is bidirectional, so its output has 2 * dimHidden features.
        self.linear = nn.Linear(dimHidden * 2, self.tagSetSize)

        # NOTE: the CRF layer is constructed here but is not used by forward().
        # NOTE(review): confirm that the installed TorchCRF package's CRF
        # constructor accepts `batch_first`.
        self.CRF = TorchCRF.CRF(self.tagSetSize, batch_first=True)

    def forward(self, inputData):
        """Compute per-token log-probabilities over the tag set.

        Args:
            inputData: Integer tensor of token indices with shape
                (batch, seqLen); the LSTM is configured with batch_first=True.

        Returns:
            Tensor of shape (batch, seqLen, tagSetSize) holding
            log-probabilities normalized over the last (tag) dimension.
        """
        # Embedding a (batch, seqLen) tensor already yields
        # (batch, seqLen, dimEmbed), so no reshape is needed.
        wordVectors = self.embed(inputData)
        out, _ = self.LSTM(wordVectors)
        out = self.linear(out)
        # Normalize over the tag axis, not the sequence axis.
        return nn.functional.log_softmax(out, dim=-1)
        

In [10]:
def _header_value(line):
    """Return the text after the first '=' of a '# key = value' header line."""
    return line.partition("=")[2].strip()


def extract_data(file_path):
    """Read an IOB2-style tagged file into parallel per-sentence lists.

    Token lines are tab-separated: column 1 is the word and column 2 is the
    tag (column 0 is ignored). The header comments ``# newdoc id``,
    ``# sent_id`` and ``# text`` supply the metadata of the sentence that
    follows them. A new sentence starts whenever the ``sent_id`` changes or
    after a blank line.

    Args:
        file_path: Path to the UTF-8 encoded input file.

    Returns:
        tuple: ``(words_data, tags_data, metadata)`` where ``words_data`` is a
        list of lists of words, ``tags_data`` is a list of lists of tags, and
        ``metadata`` is a list of ``(newdoc_id, sent_id, text)`` tuples, one
        per sentence. A metadata field is ``None`` when its header has not
        appeared in the file yet.

    Raises:
        IndexError: If a token line has fewer than three tab-separated columns.
    """
    words_data = []
    tags_data = []
    metadata = []
    # Initialized up front so files lacking a header do not hit unbound names.
    newdoc_id = None
    sent_id = None
    text = None
    current_sent = None
    # True at the start of the file and after every blank line.
    at_boundary = True
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                at_boundary = True
                continue
            if line.startswith("# newdoc id"):
                newdoc_id = _header_value(line)
            elif line.startswith("# sent_id"):
                sent_id = _header_value(line)
            elif line.startswith("# text"):
                # Only the first '=' separates key from value, so sentences
                # that themselves contain '=' are kept whole.
                text = _header_value(line)
            elif line.startswith("#"):
                # Any other comment line carries no token data.
                continue
            else:
                parts = line.split('\t')
                word = parts[1]
                tag = parts[2]
                if at_boundary or sent_id != current_sent:
                    current_sent = sent_id
                    at_boundary = False
                    words_data.append([word])
                    tags_data.append([tag])
                    metadata.append((newdoc_id, sent_id, text))
                else:
                    words_data[-1].append(word)
                    tags_data[-1].append(tag)
    return words_data, tags_data, metadata

# Demo: load the English EWT training split and dump every parsed sentence
# (words, tags, metadata), with a blank line between sentences.
file_path = "C:/Users/danii/2YearProjectNLP_Group_abc/Data/UniversalNER/train/en_ewt-ud-train.iob2"
words_data, tags_data, metadata = extract_data(file_path)
for words, tags, meta in zip(words_data, tags_data, metadata):
    for label, value in (("Words:", words), ("Tags:", tags), ("Metadata:", meta)):
        print(label, value)
    print()


Words: ['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?']
Tags: ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']
Metadata: ('answers-20090605110235AAALlCt_ans', 'answers-20090605110235AAALlCt_ans-0001', 'Where in the world is Iguazu?')

Words: ['Iguazu', 'Falls']
Tags: ['B-LOC', 'I-LOC']
Metadata: ('answers-20090605110235AAALlCt_ans', 'answers-20090605110235AAALlCt_ans-0002', 'Iguazu Falls')

Words: ['Widely', 'considered', 'to', 'be', 'one', 'of', 'the', 'most', 'spectacular', 'waterfalls', 'in', 'the', 'world', ',', 'the', 'Iguazu', 'Falls', 'on', 'the', 'border', 'of', 'Argentina', 'and', 'Brazil', ',', 'are', 'a', 'certainly', 'must', 'see', 'attraction', 'in', 'the', 'area', '.']
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Metadata: ('answers-20090605110235AAALlCt_ans', 'answers-20090605110235AAALlCt_ans-0003', 'Widely considered to be 

In [21]:
class Vocab():
    """Two-way mapping between words and integer indices.

    Index 0 is reserved for the ``pad_unk`` token, whose index is also what
    unknown words resolve to when they are looked up without being added.
    """

    def __init__(self, pad_unk='<PAD>'):
        """Create a vocabulary that contains only the ``pad_unk`` token."""
        self.pad_unk = pad_unk
        self.idx2word = [self.pad_unk]
        self.word2idx = {self.pad_unk: 0}

    def getIdx(self, word, add=False):
        """Return the index of ``word``.

        A word that is not in the vocabulary is appended and given the next
        free index when ``add`` is true; otherwise the index of the
        ``pad_unk`` token is returned and the vocabulary is left unchanged.
        """
        if word in self.word2idx:
            return self.word2idx[word]
        if not add:
            return self.word2idx[self.pad_unk]
        # The next free index equals the current number of stored words.
        self.word2idx[word] = len(self.idx2word)
        self.idx2word.append(word)
        return self.word2idx[word]

    def getWord(self, idx):
        """Return the word stored at position ``idx``."""
        return self.idx2word[idx]
# Change words to Vocab indexes:
def toIdx (data, vocab=None, add=True):
    """Convert sequences of tokens into one padded tensor of vocabulary indices.

    Args:
        data: List of sequences (e.g. sentences as lists of words or tags).
        vocab: Vocabulary used for the lookups. It must offer ``pad_unk`` and
            ``getIdx(word, add=...)`` like ``Vocab``. When ``None`` a fresh
            ``Vocab`` is created, which reproduces the original behavior.
            Pass your own instance to keep the word/index mapping afterwards
            or to reuse a training vocabulary on other data.
        add: Whether unseen tokens are added to the vocabulary. With ``False``
            they map to the index of the vocabulary's ``pad_unk`` token.

    Returns:
        ``torch.long`` tensor of shape (len(data), longest sequence length).
        Shorter sequences are right-padded with the ``pad_unk`` index. Empty
        ``data`` yields a tensor of shape (0, 0).
    """
    if vocab is None:
        vocab = Vocab()
    pad_idx = vocab.getIdx(vocab.pad_unk)
    # default=0 keeps empty input from raising inside max().
    max_len_sentence = max((len(sentence) for sentence in data), default=0)

    wordsIndx = []
    for sentence in data:
        line = [vocab.getIdx(token, add=add) for token in sentence]
        line.extend([pad_idx] * (max_len_sentence - len(line)))
        wordsIndx.append(line)

    # The explicit dtype and reshape keep the result a 2-D integer tensor even
    # when there are no sentences or every sentence is empty.
    return torch.tensor(wordsIndx, dtype=torch.long).reshape(len(wordsIndx), max_len_sentence)

# Convert the sentences and their tag sequences to padded index tensors.
# Neither result is assigned to a variable here, and each call builds its own
# fresh Vocab inside toIdx, so word indices and tag indices are independent.
toIdx(words_data)
toIdx(tags_data)

tensor([[   1,    2,    3,  ...,    0,    0,    0],
        [   6,    8,    0,  ...,    0,    0,    0],
        [   9,   10,   11,  ...,    0,    0,    0],
        ...,
        [  32, 7745,   11,  ...,    0,    0,    0],
        [ 879, 3479,   18,  ...,    0,    0,    0],
        [ 194,  201,  238,  ...,    0,    0,    0]])
