In [3]:
filepath='/content/sample_data/hi_hdtb-ud-test.conllu'

**Data Cleaning**

In [4]:
def clean_conllu_file():
    with open(filepath, 'r') as f:
        data = f.readlines()
    output = ''
    for line in data:
        if line == '\n':
            continue
        if line.startswith('# sent_id'):
            output += '\n'
        output += line
    print(output[:100])
    with open('/content/sample_data/DATA/newfile.conllu', 'w+') as f:
        f.write(output)

In [5]:
clean_conllu_file()


# sent_id = test-s1
# text = इसके अतिरिक्त गुग्गुल कुंड, भीम गुफा तथा भीमशिला भी दर्शनीय स्थल हैं ।


**Converting a conllu file to json**

In [6]:
import pprint
import os
import json
import copy
def conllu_to_json(filepath=None):
    # initialize
    text = []
    # Read conllu file
    with open(filepath, 'r', encoding='utf-8') as f:
        source = f.read()
    sentences = source.strip().split('\n\n')
    for sentence in sentences:
        temp_lines = sentence.strip().split('\n')
        sent = {}
        lines = []
        for line in temp_lines:
            words = line.split(' ')
            # Drop all lines beginning with #
            if words[0] == '#':
                if words[1] == 'sent_id':
                    sent['sent_id'] = words[3]
            else:
                lines.append(line)
        reject_sentence = False
        words = []
        for line in lines:
            words_list = line.split('\t')
            try:
                int(words_list[0])
            except ValueError:
                reject_sentence = True
                break

            word = {
                "id": words_list[0],
                "form": words_list[1],
                "lemma": words_list[2],
                "upostag": words_list[3],
                "xpostag": words_list[4],
                "feats": words_list[5],
                "head": words_list[6],
                "deprel": words_list[7],
                "deps": words_list[8],
                "misc": words_list[9]
            }
            words.append(word)

            if word['deprel'] == 'root':
                words.append({
                    "id": "0",
                    "form": "<ROOT> ",
                    "lemma": "<ROOT>",
                    "upostag": "ROOT",
                    "xpostag": "ROOT",
                    "feats": "_",
                    "head": "-1",
                    "deprel": "_",
                    "deps": "_",
                    "misc": "_"
                })

        if reject_sentence:
            continue

        sent['words'] = words
        text.append(sent)

    with open(filepath.replace('conllu', 'json'), 'w+') as f:
        f.write(json.dumps(text, indent=4))

In [7]:
conllu_to_json('/content/sample_data/DATA/newfile.conllu')

In [8]:
import json
import os
import copy
import torch
import pickle
import time
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
from time import gmtime, strftime
from torch.autograd import Variable
from gensim.models import Word2Vec

**Word Embedding**

In [9]:
def pretrain_word_embeddings(data, len_word_embed, len_pos_embed):
    '''
    corpus_words in the following shape:
    [ [some sentence]
      [some other sentence] ]
    all_words in the shape:
    [ some sentence some other sentence]
    corpus_words in Format: [['I', 'like', 'custard'],...]
    corpus_pos in Format: [['NN', 'VB', 'PRN'],...]
    '''
    corpus_words = []
    corpus_pos = []
    all_words = []
    all_pos = []
    l2i = {}
    for sentence in data:
        words = []
        pos_s = []
        for word in sentence['words']:
            words.append(word['form'])
            pos_s.append(word['xpostag'])
            label = word['deprel']
            if label not in l2i:
                l2i[label] = len(l2i)
        corpus_words.append(words)
        corpus_pos.append(pos_s)
        all_words.extend(words)
        all_pos.extend(pos_s)
    with open('/content/sample_data/DATA/labels.json', 'w+') as f:
        f.write(json.dumps(l2i, indent=4))
    w2i = {word: idx for idx, word in enumerate(all_words)}
    p2i = {pos: idx for idx, pos in enumerate(all_pos)}

    # pre-train word and pos embeddings. These will be starting points for our learnable embeddings
    word_embeddings_gensim = Word2Vec(corpus_words, size=len_word_embed, window=5, min_count=1, workers=8)
    pos_embeddings_gensim = Word2Vec(corpus_pos, size=len_pos_embed, window=5, min_count=1, workers=8)

    # initialise the embeddings. The tensors are still empty
    pretrained_word_embeddings = torch.FloatTensor(max(w2i.values())+1, len_word_embed)
    pretrained_pos_embeddings = torch.FloatTensor(max(p2i.values())+1, len_pos_embed)

    # fill the tensors with the pre-trained embeddings
    for word in w2i.keys():
        idx = w2i[word]
        pretrained_word_embeddings[idx, :] = torch.from_numpy(word_embeddings_gensim[word])
    for pos in p2i.keys():
        idx = p2i[pos]
        pretrained_pos_embeddings[idx, :] = torch.from_numpy(pos_embeddings_gensim[pos])

    return w2i, p2i, l2i, pretrained_word_embeddings, pretrained_pos_embeddings


**Model Building**

In [10]:
class Network(nn.Module):
    def __init__(self, w2i, p2i, pretrained_word_embeddings, pretrained_pos_embeddings,
                 len_word_embed, len_pos_embed, len_feature_vec=20, lstm_hidden_size=400,
                 mlp_arc_hidden_size=500, mlp_label_hidden_size=200, n_label=47):

        super(Network, self).__init__()
        self.len_word_embed = len_word_embed
        self.len_pos_embed = len_pos_embed
        self.len_data_vec = len_word_embed + len_pos_embed
        self.len_feature_vec = len_feature_vec
        self.lstm_hidden_size = lstm_hidden_size
        self.mlp_arc_hidden_size = mlp_arc_hidden_size
        self.mlp_label_hidden_size = mlp_label_hidden_size
        self.n_label = n_label
        self.w2i = w2i
        self.p2i = p2i

        # trainable parameters
        self.word_embeddings = torch.nn.Embedding(len(pretrained_word_embeddings), len_word_embed)
        self.word_embeddings.weight = torch.nn.Parameter(pretrained_word_embeddings)
        self.pos_embeddings = torch.nn.Embedding(len(pretrained_pos_embeddings), len_pos_embed)
        self.pos_embeddings.weight = torch.nn.Parameter(pretrained_pos_embeddings)

        self.BiLSTM = torch.nn.LSTM(input_size=self.len_data_vec, hidden_size=self.lstm_hidden_size,
                                    num_layers = 3, dropout=.33, bidirectional=True)

        self.MLP_arc_head_layer1 = torch.nn.Linear(self.lstm_hidden_size * 2, mlp_arc_hidden_size)
        self.MLP_arc_head_layer2 = torch.nn.Linear(mlp_arc_hidden_size, len_feature_vec)
        self.MLP_arc_dep_layer1 = torch.nn.Linear(self.lstm_hidden_size * 2, mlp_arc_hidden_size)
        self.MLP_arc_dep_layer2 = torch.nn.Linear(mlp_arc_hidden_size, len_feature_vec)

        self.MLP_label_head_layer1 = torch.nn.Linear(self.lstm_hidden_size * 2, mlp_label_hidden_size)
        self.MLP_label_head_layer2 = torch.nn.Linear(mlp_label_hidden_size, len_feature_vec)
        self.MLP_label_dep_layer1 = torch.nn.Linear(self.lstm_hidden_size * 2, mlp_label_hidden_size)
        self.MLP_label_dep_layer2 = torch.nn.Linear(mlp_label_hidden_size, len_feature_vec)

        self.MLP_label_classifier_layer1 = torch.nn.Linear(self.len_feature_vec*2, self.len_feature_vec)
        self.MLP_label_classifier_layer2 = torch.nn.Linear(self.len_feature_vec, self.n_label)

        self.U_1 = nn.Parameter(torch.randn(len_feature_vec, len_feature_vec))
        self.u_2 = nn.Parameter(torch.randn(1, len_feature_vec))

        self.arc_loss = []
        self.label_loss = []
        self.total_loss = []
        self.arc_loss_particular = []
        self.label_loss_particular = []
        self.total_loss_particular = []

    def MLP_arc_head(self, r):
        hidden = F.relu(self.MLP_arc_head_layer1(r))
        h = self.MLP_arc_head_layer2(hidden)
        return h

    def MLP_arc_dep(self, r):
        hidden = F.relu(self.MLP_arc_dep_layer1(r))
        h = self.MLP_arc_dep_layer2(hidden)
        return h

    def MLP_label_head(self, r):
        hidden = F.relu(self.MLP_label_head_layer1(r))
        h = self.MLP_label_head_layer2(hidden)
        return h

    def MLP_label_dep(self, r):
        hidden = F.relu(self.MLP_label_dep_layer1(r))
        h = self.MLP_label_dep_layer2(hidden)
        return h

    def MLP_label_classifier(self, r):
        hidden = F.relu(self.MLP_label_classifier_layer1(r))
        h = self.MLP_label_classifier_layer2(hidden)
        return h

    def forward(self, sequence):
        # read input
        seq_len = len(sequence[0])
        word_sequence = sequence[:,0]
        pos_sequence = sequence[:,1]
        gold_tree = sequence[:,2] if seq_len == 3 else None # if there is no gold tree given, only predict arcs, not labels

        # find word embeddings given sequence of indices and construct data vector
        word_embeddings = self.word_embeddings(word_sequence)
        pos_embeddings = self.pos_embeddings(pos_sequence)
        x = torch.cat((word_embeddings, pos_embeddings), 1)
        x = x[:, None, :]  # add an empty y-dimension, because that's how LSTM takes its input

        # initialise hidden state of the LSTM
        hidden_init_1 = torch.zeros(6, 1, self.lstm_hidden_size)
        hidden_init_2 = torch.zeros(6, 1, self.lstm_hidden_size)
        if torch.cuda.is_available:
            hidden_init_1 = hidden_init_1.cuda()
            hidden_init_2 = hidden_init_2.cuda()
        hidden = (autograd.Variable(hidden_init_1), autograd.Variable(hidden_init_2))

        # embed words in their context
        r, _ = self.BiLSTM(x, hidden)

        # "fork in the road"; arcs
        h_arc_head = torch.squeeze(self.MLP_arc_head(r))
        h_arc_dep = torch.squeeze(self.MLP_arc_dep(r))
        adj_matrix = h_arc_head @ self.U_1 @ torch.t(h_arc_dep) + h_arc_head @ torch.t(self.u_2)

        pred_labels = None
        if gold_tree is not None:
            h_label_head = torch.squeeze(self.MLP_label_head(r))
            h_label_dep = torch.squeeze(self.MLP_label_dep(r))
            h_label_dep = h_label_dep[gold_tree.data]
            arcs_to_label = torch.cat((h_label_head, h_label_dep),1)
            pred_labels = self.MLP_label_classifier(arcs_to_label)
        return adj_matrix , pred_labels

**Training the Model**

In [17]:
def train(show=True, save=False):
    start = time.time()
    filepath_dataset = '/content/sample_data/DATA/newfile.json'
    data = json.load(open(filepath_dataset, 'r'))
    # initialise word-embeddings (the starting point from which we'll train)
    len_word_embed = 100
    len_pos_embed = 20
    w2i, p2i, l2i, pwe, ppe = pretrain_word_embeddings(data, len_word_embed, len_pos_embed)
    network = Network(w2i, p2i, pwe, ppe, len_word_embed, len_pos_embed, n_label=len(l2i))
    if torch.cuda.is_available():
        network.cuda()
    # initialise trainer
    criterion = nn.CrossEntropyLoss()
    lr = 0.002
    weight_decay = 1e-6
    betas = (0.9, 0.9)
    optimizer = optim.Adam(network.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
    # messages
    n_data = len(data)
    if show:
        if save:
            print('training the model. Weights will be backed up...')
        else:
            print('performing a dry run...')
        print('size of the dataset: ', n_data)

    # start training
    # an epoch is a loop over the entire dataset
    for epoch in range(50):
        arc_loss_per_data = []
        label_loss_per_data = []
        total_loss_per_data = []
        for i in range(len(data)):
            network.zero_grad()  # PyTorch remembers gradients. We can forget them now, because we are starting a new sentence
            # prepare targets
            seq_len = len(data[i]['words'])
            gold_mat = convert_sentence_to_adjacency_matrix(data[i])
            gold_tree = adjacency_matrix_to_tensor(gold_mat)
            arc_target = Variable(gold_tree, requires_grad=False)
            labels_target = torch.LongTensor(seq_len)
            for j, word in enumerate(data[i]['words']):
                labels_target[j] = l2i[word['deprel']]
            labels_target = Variable(labels_target, requires_grad=False)
            # prepare input
            sequence = torch.LongTensor(seq_len, 3)
            for j, word in enumerate(data[i]['words']):
                sequence[j,0] = w2i[word['form']]
                sequence[j,1] = p2i[word['xpostag']]
                sequence[j,2] = gold_tree[j]
            sequence_var = Variable(sequence)
            # prepare GPU
            if torch.cuda.is_available():
                arc_target = arc_target.cuda()
                labels_target = labels_target.cuda()
                sequence_var = sequence_var.cuda()

            # run the network
            adj_mat, labels_pred = network(sequence_var)

            # determine losses
            arc_pred = torch.t(adj_mat)  # nn.CrossEntropyLoss() wants the classes in the second dimension
            arc_loss = criterion(arc_pred, arc_target)
            label_loss = criterion(labels_pred, labels_target)
            total_loss = arc_loss + label_loss

            # backprop
            total_loss.backward()
            optimizer.step()

            new_tensor = torch.t(F.softmax(torch.t(adj_mat))).data
            if torch.cuda.is_available():
                new_tensor = new_tensor.cpu()

            plt.imshow(new_tensor.numpy())

        # backup current parameters and write to log
        current_date_and_time = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
        if save:
            torch.save(network, '/content/sample_data/DATA' + '/latest_weights')
        # messages
        if show:
            print('-'*10)
            print('latest backup at ', current_date_and_time)
            # print('epoch {} loss {:.4f}'.format(epoch, network.total_loss[-1]))
    end = time.time()
    if show:
        print('execution took ', end - start, ' seconds')
        plt.show()
    return

In [18]:
def convert_sentence_to_adjacency_matrix(sentence):
    '''
    Input: sentence in json
    Output: adjancency matrix (gold standard)
    '''
    sentence_len = len(sentence['words'])
    # Initialize a matrix of size N x N
    adjancency_matrix = np.zeros((sentence_len, sentence_len))
    for word in sentence['words']:
        word_id = int(word['id'])
        head = int(word['head'])
        # Ignore the root(0)-(-1) connection
        if head == -1:
            continue
        adjancency_matrix[head][word_id] = 1
    return adjancency_matrix


def adjacency_matrix_to_tensor(matrix):
    output = [0] * matrix.shape[0]
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[0]):
            if matrix[i][j] == 1:
                output[j] = i
    output1 = torch.LongTensor(output)
    return(output1)


def get_labels():
    path = '/content/sample_data/DATA/labels.json'
    with open(path) as f:
        content = f.readlines()
    labels = {}
    count = 1
    for line in content:
        temp = line.split(':')
        if len(temp) == 2:
            # labels.append(temp[0])
            labels[count] = temp[0]
            labels[temp[0]] = count
            count += 1
    with open(path + '.json', 'w+') as f:
        f.write(json.dumps(labels, indent=4))

In [None]:
 train(save=True)



training the model. Weights will be backed up...
size of the dataset:  1684




----------
latest backup at  2021-04-30_15:18:47
----------
latest backup at  2021-04-30_15:19:58
----------
latest backup at  2021-04-30_15:21:08
----------
latest backup at  2021-04-30_15:22:19
----------
latest backup at  2021-04-30_15:23:30
----------
latest backup at  2021-04-30_15:24:42
----------
latest backup at  2021-04-30_15:25:54
----------
latest backup at  2021-04-30_15:27:06
----------
latest backup at  2021-04-30_15:28:17
----------
latest backup at  2021-04-30_15:29:29
----------
latest backup at  2021-04-30_15:30:42
----------
latest backup at  2021-04-30_15:31:55
----------
latest backup at  2021-04-30_15:33:08
----------
latest backup at  2021-04-30_15:34:21
----------
latest backup at  2021-04-30_15:35:34
----------
latest backup at  2021-04-30_15:36:47
----------
latest backup at  2021-04-30_15:38:01
----------
latest backup at  2021-04-30_15:39:16
----------
latest backup at  2021-04-30_15:40:29
----------
latest backup at  2021-04-30_15:41:42
----------
latest ba