### An encoding class for easy look up of the vocabularies.

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import torch.nn as nn
from tqdm import tqdm
import numpy as np
from gensim.models import KeyedVectors 
np.random.seed(42)

In [6]:


class Vocabulary:
    
    def __init__(self, symbols = None):
        
        #dictionary to map the vocabulary with a id to build matrix
        #add "UNK" for unknown word to the initial mapping  
        self.word2idx = dict()
        self.idx2word = []

        if symbols:
            for sym in symbols:
                self.update(sym)

    def update(self, tok):
        
        #takes as input a symbol and build the mapping if it doesnt exist
        if tok not in self.word2idx:
            self.word2idx[tok] = len(self.idx2word)
            self.idx2word.append(tok)

    def lookup(self, tok, update = False):
        
        #find tok id given the string, if the tok does not exist return the idx of "UNK"
        if tok not in self.word2idx:
            if update:
                self.update(tok)
                return self[tok]
            return self.word2idx["<unk>"]
            
        return self.word2idx[tok]
    def rev_lookup(self, idx):
        
        #find the tok string given the id
        return self.idx2word[idx]
    
    def __getitem__(self, symbol):
        
        #if the symbol does not exist we see it as unk
        return self.lookup(symbol)
    
    def __len__(self):
        
        return len(self.idx2word)
    

### Load the data
Conll reader reads the conllu file and Data loader gonna load the data and feed the encoded data to the model(either for training or inference).

In [83]:
"""
Functions for reading and writing UD CONLL data
"""
CONLL_FIELDS = ["token", "pos", "features", "deprel"]
MWE_TAGS     = ["B", "I"]  #B for begin , I for inside


def readfile(filename, update = False, toks_vocab=Vocabulary(["<unk>", "<bos>", "<eos>"]), tags_vocab=Vocabulary(["B_X"])):
    """
    function to read and encode the corpus at one pass 
    signature for train corpus : X_toks, Y_tags = readfile("corpus/train.conllu", update=True)
    signature for test corpus/ dev corpus:  X_test, Y_test = readfile("corpus/train.conllu", update=True, vocabtoks_train, vocabtags_train)
    """

    istream              = open(filename, encoding = "utf-8")
    X_toks, Y_tags       = [], []
    sent_toks, sent_tags = [], []

    for line in istream:
        line = line.strip()
        if line and line[0] != "#":
            try:
                tokidx, token, lemma, upos, pos, features, headidx, deprel, extended, _ = line.split()

            except ValueError:
                pass
            if tokidx == "1":
                #beginning of sentence, add false toks  
                sent_toks.append(toks_vocab["<bos>"])
                sent_tags.append(tags_vocab["B_X"])
            
            #extract simple mwe tags
            mwe_tag = lambda x: "I" if features.startswith("component") else "B"
            #extract tagging information
            sent_toks.append(toks_vocab.lookup(tok = token, update = update))
            sent_tags.append(tags_vocab.lookup(tok = mwe_tag(features) + "_" + upos, update = update))
                
        elif sent_toks:
            #end of sentence, add  false tokens 
            sent_toks.append(toks_vocab["<eos>"])
            sent_tags.append(tags_vocab["B_X"])
            X_toks.append(sent_toks)
            Y_tags.append(sent_tags)
            sent_toks, sent_tags = [], []

    istream.close()
    #return the encoded data in list of list, the nested list represents the sentences
    return X_toks, Y_tags,toks_vocab, tags_vocab
# [{"token1": "token", "multiword": "mwe", "mwe lemma": "mwe lemma"}, {"token2": "token", "multiword": "mwe"}, {"token3": "token", "multiword": "mwe"}]




In [201]:
#build train vocab
X_toks, Y_tags,toks_vocab, tags_vocab = readfile("corpus/train.conllu", update=True)


In [202]:
class MWEDataset (Dataset):

    def __init__(self,datafilename = None, toks_vocab=Vocabulary(["<unk>", "<bos>", "<eos>"]), tags_vocab=Vocabulary(["B_X"]), isTrain = False, window_size = 0):
        """
        take as input either the path to a conllu file or a list of tokens
        we consider context size as the n preceding and n subsequent words in the text as the context for predicting the next word.
        """
        super(MWEDataset, self).__init__()

        self.toks_vocab, self.tags_vocab = toks_vocab, tags_vocab

        self.Xtoks_IDs,self.Ytags_IDs, self.toks_vocab, self.tags_vocab = readfile("corpus/train.conllu",
                                                                                       update=isTrain, 
                                                                                       toks_vocab = toks_vocab, 
                                                                                       tags_vocab=tags_vocab)
                                                                          

     
        print('token Vocab size',len(self.toks_vocab))
        self.window_size  = window_size
        self.data         = self.build_dataset(self.Xtoks_IDs,self.Ytags_IDs)
        
    def __len__(self):
        return len(self.data)
        
    def build_dataset(self,X_toks,Y_tags):
        """
        build examples with contextual tokens as features
        takes as input a nested list of encoded corpus, [sentences[tokens]]
        return a list of examples with context window features
        """
        examples = []
        for toks, tags in zip(X_toks, Y_tags):

            toks = [self.toks_vocab["<bos>"]]*self.window_size + toks + [self.toks_vocab["<eos>"]]*self.window_size #3+3+3
            
            for i in range(self.window_size, len(toks)-self.window_size, 1): #3, 6, 1

                examples.append((torch.tensor(toks[i - self.window_size : i + self.window_size+1]), torch.tensor(tags[i - self.window_size])))
                #print(examples[-1])
        return examples
        
    def __getitem__(self, idx):
        
        return self.data[idx]
        
    def as_strings(self,batch_tensor):
        """
        Returns a string representation of a tensor of word indexes
        """
        out = []
        for line in batch_tensor.tolist():
            out.append([self.tok_vocab.rev_lookup(idx) for idx in line])
        return out

    def get_loader(self, batch_size=1, num_workers=0, word_dropout=0., shuffle =False):
        return DataLoader(self, batch_size=batch_size, num_workers=num_workers,shuffle = shuffle)


### Stats

In [203]:
print(len(toks_vocab))
print(len(tags_vocab))

35694
41


In [204]:
corpuspath = "corpus/train.conllu"

train_dataset = MWEDataset("corpus/train.conllu",  isTrain = True,  window_size = 3)
testset    = MWEDataset("corpus/test.conllu", window_size = 3)

token Vocab size 35694
token Vocab size 35694


# Model 模型

In [211]:
class MweClassifer(nn.Module):
    
    def __init__(self, toks_vocab, tags_vocab, window_size = 0, emb_size=64, hidden_size=64, pretrainedw2v = None, drop_out = 0.):
        
        super(MweClassifer, self).__init__()
        
        self.word_embedding    = nn.Embedding(len(toks_vocab), emb_size)
        self.window_size      = window_size
        self.input_length      = 1 + window_size *2
        self.toks_vocab        = toks_vocab
        self.tags_vocab        = tags_vocab
        
        self.net = nn.Sequential(
            nn.Linear(emb_size*self.input_length, hidden_size), 
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size*2),
            nn.Dropout(drop_out),
            nn.ReLU(),
            nn.Linear(hidden_size*2, len(tags_vocab)), #output # of classes
            nn.LogSoftmax(dim=1)
        )
        
        if pretrainedw2v:
            self.word_embedding.weight.data.copy_(torch.from_numpy(self.pretrainedw2v_loader(pretrainedw2v).wv.vectors))
        
    @staticmethod
    def pretrainedw2v_loader(self, path_to_pretrained = None):
        if not path_to_pretrained:
            # Download the pretrained French Word2Vec model https://fauconnier.github.io/#data
            model_name = 'frWac_non_lem_no_postag_no_phrase_500_skip_cut100.bin.gz'
            model = api.load(model_name)
        else:
            model = KeyedVectors.load_word2vec_format(path_to_pretrained, binary=True)
        return model
        
    def forward(self, Xtoks_IDs):
        b, seq = Xtoks_IDs.shape

        input = self.word_embedding(Xtoks_IDs) # Batch, inputsize*emb_size
        #print(input.shape) #B, window_size, emb_size
        #input.view(b, -1) B, window_size * emb_size
        return self.net(input.view(b, -1))           # tag_size = 3

    def _init_weights(self):
        pass
    
        
    def train_model(self, trainset, epochs = 10, lr = 1e-3, batch_size = 10, device = "cpu", reg = None):
        """
        the train data is in form of nested lists: [sentences[tokens]]
        """
        self.to(device)
        #adaptive gradient descent, for every update lr is a function of the amount of change in the parameters
        #optimizer   = torch.optim.Adam()
        optimizer   = torch.optim.SGD(self.parameters(), lr = lr, momentum = 0.9)
        loss_fnc    = nn.NLLLoss()
        trainloader = trainset.get_loader(batch_size = batch_size, shuffle = True)

        train_loss = []
        
        for e in range(epochs):
            self.train()

            ep_loss = []
            
            for X_toks, Y_gold in tqdm(trainloader):
                #print(x.shape)
                optimizer.zero_grad()
                y_hat = self.forward(X_toks)
               
                
                #print(y_hat.shape)
                loss_value = loss_fnc(y_hat, Y_gold)
                ep_loss.append(loss_value.item())
                loss_value.backward()
                optimizer.step()
            loss = sum(ep_loss)/len(ep_loss)
            print(loss)
            train_loss.append(loss)
           
            #print("Epoch %d | Mean train loss  %.4f | Mean dev loss %.4f"%(e,loss, devloss) )
            print("Epoch %d | Mean train loss  %.4f"%(e,loss) )
            print()
        print(sum(train_loss)/len(train_loss))
        


    def validate(self, data_loader, device = "cpu"):
        loss_fnc   = nn.NLLLoss()
        loss_lst   = []
        self.eval()
        pass

    def predict(self, string):
        """
        
        """
        self.eval()

        pass
    def evaluation(self, y_hat, y_gold):
        TP = 0
        FN = 0
        FP = 0
        for logits, gold in zip(y_hat, y_gold):
            pred = int(torch.argmax(logits))
            gold = int(gold)

            if pred*gold!=0:
                TP+=1
            elif gold> 0:
                FN+=1
            elif pred>0:
                FP+=1
        return TP, FN, FP
            
            
                
            
            #if torch.argmax(y_hat) != gold and 
            
    

In [212]:
#define the hyperparameters
batch_size    = 32
window_size   =  3#left context and right context
lr            = 1e-3
device        = "cpu"
epochs        = 50
emb_size      = 64
hidden_size   = 64
nb_layers     = 2
drop_out      = 0.

In [213]:
train_dataset = MWEDataset("corpus/train.conllu",  isTrain = True,  window_size = 3)
testset    = MWEDataset("corpus/test.conllu", window_size = 3)

model = MweClassifer(toks_vocab   = train_dataset.toks_vocab,
                     tags_vocab   = testset.tags_vocab, 
                     window_size  = 3, 
                     emb_size     = emb_size, 
                     hidden_size  = hidden_size, 
                     drop_out     = 0.2)

token Vocab size 35694
token Vocab size 35694


In [None]:

train_loss = model.train_model(train_dataset,epochs= 2, lr=1e-3, batch_size = batch_size)


100%|██████████████████████████████████████| 9861/9861 [01:14<00:00, 132.19it/s]


1.5036433325277092
Epoch 0 | Mean train loss  1.5036



 91%|██████████████████████████████████▍   | 8950/9861 [01:09<00:06, 130.29it/s]

In [20]:
train_loss

0.02773142185015094

In [265]:
with torch.no_grad():
    
    tp, fn, fp = 0, 0, 0
    for X, X_deprel, X_pos, y_true in testset.get_loader(batch_size = 500):
        #logits = model(X, X_deprel.zero_())
        #logits = model(X, X_deprel)
        a, b, c = model.evaluation(logits, y_true)
        tp+=a
        fn+=b
        fp+=c
    
    

In [266]:
print(tp)
print(fn)
print(fp)

0
1419
0


In [None]:
print(tp/(fn+tp))