In [1]:
from Data_Preprocessing import *
from CRF_model import *
import numpy as np

In [2]:
def create_data(filepath, lines = None):
    """Parse a CoNLL-U file into parallel token, lemma and POS-tag lists.

    Only rows whose first column is a plain integer are read as tokens, so
    comment rows (``# ...``) and range rows such as ``1-2`` are ignored.
    A blank line ends the current sentence. Sentences with fewer than two
    tokens (e.g. a lone website link) are dropped, although their words and
    tags are still registered in the ID dictionaries.

    Args:
        filepath: Path to a CoNLL-U file, e.g. one of en_ewt-ud-train.conllu,
            en_ewt-ud-test.conllu or en_ewt-ud-dev.conllu.
        lines: Optional number of kept sentences after which reading stops.
            ``None`` (the default) reads the whole file. A value below 1 can
            never be reached, so it also reads the whole file.

    Returns:
        A 5-tuple ``(x_final, x_lemmas_final, y_final, label_IDs, word_IDs)``:
            x_final: list of sentences, each a list of word forms.
            x_lemmas_final: list of sentences, each a list of lemmas.
            y_final: list of sentences, each a list of POS tags (column 4).
            label_IDs: dict mapping every POS tag seen to an integer ID.
            word_IDs: dict mapping every word form seen to an integer ID.
    """
    x_final = []
    x_lemmas_final = []
    y_final = []
    label_IDs = {}
    word_IDs = {}

    # Buffers for the sentence currently being read.
    x_toadd = []
    x_lemmas_toadd = []
    y_toadd = []

    with open(filepath, 'r', encoding='UTF-8') as f:
        # Iterate over the file exactly once. Counting its lines first would
        # exhaust the iterator and leave nothing to parse.
        for line in f:
            conllu = line.split()

            # case: end of sentence reached
            if len(conllu) == 0:
                # Keep only real sentences: this filters out one-word
                # sentences such as links to websites, and the empty buffer
                # produced by consecutive blank lines.
                keep = len(x_toadd) > 1
                if keep:
                    x_final.append(x_toadd)
                    x_lemmas_final.append(x_lemmas_toadd)
                    y_final.append(y_toadd)

                x_toadd = []
                x_lemmas_toadd = []
                y_toadd = []

                # Only kept sentences count toward the requested limit.
                if keep and lines is not None and len(x_final) == lines:
                    break

            elif conllu[0].isnumeric():
                word = conllu[1]
                lemma = conllu[2]
                label = conllu[3]

                # IDs are handed out in order of first appearance.
                if label not in label_IDs:
                    label_IDs[label] = len(label_IDs)

                if word not in word_IDs:
                    word_IDs[word] = len(word_IDs)

                x_toadd.append(word)
                x_lemmas_toadd.append(lemma)
                y_toadd.append(label)

    # A file that does not end with a blank line still has a pending sentence.
    # After a limit-triggered break the buffers are empty, so this is a no-op.
    if len(x_toadd) > 1:
        x_final.append(x_toadd)
        x_lemmas_final.append(x_lemmas_toadd)
        y_final.append(y_toadd)

    return x_final, x_lemmas_final, y_final, label_IDs, word_IDs

In [3]:
# Load a small sample of the EWT training split: reading stops once 10
# sentences have been kept.
x_sentences, x_lemmas, y_sentences, label_IDs, word_IDs = create_data(
    "UD_English-EWT/en_ewt-ud-train.conllu",
    lines=10,
)

In [4]:
# Reserve IDs for the sentence-boundary labels and the out-of-vocabulary word.
# setdefault only inserts a key that is missing, so re-running this cell keeps
# the IDs assigned the first time. Plain assignment would give 'BOS' and 'EOS'
# the same, out-of-range ID on a second run.
label_IDs.setdefault('BOS', len(label_IDs))
label_IDs.setdefault('EOS', len(label_IDs))
word_IDs.setdefault('OOV', len(word_IDs))

In [5]:
# Hand the first 10 sentences (tokens, tags and lemmas) to the preprocessor,
# together with the vocabulary and the label inventory built above.
datum = Data_Preprocessing(
    x_data=x_sentences[:10],
    y_data=y_sentences[:10],
    x_lemmas=x_lemmas[:10],
    x_vocab=list(word_IDs.keys()),
    labels=list(label_IDs.keys()),
    label_dict=label_IDs,
)

In [6]:
# Build the input features and the label features from the preprocessed data.
# In the recorded run this prints a progress message for x and then for y.
x_feats, y_feats = datum.generate_features()

Now generating x...
Now generating y...


In [7]:
# dtype=object stores the features as Python objects instead of letting NumPy
# infer a numeric dtype -- presumably because the per-sentence features differ
# in length; TODO confirm against Data_Preprocessing.generate_features.
x_feats_np = np.asarray(x_feats, dtype=object)

In [8]:
# Set up the CRF with the object-array input features, the label features and
# the tag -> ID mapping (which includes the BOS/EOS entries).
crf = CRF_model(
    x_feats=x_feats_np,
    y_feats=y_feats,
    label_dict=label_IDs,
)

In [9]:
# NOTE: In the recorded run, training fails with a math domain error while
# computing Z(x): Z[BOS_ID][EOS_ID] is negative (about -3.6e18), so its log is
# undefined. The cell then ends with
# "TypeError: 'int' object is not subscriptable".
# TODO(review): Z holds entries of magnitude ~1e18, most of them negative,
# which looks like a numerical problem in the Z(x) computation inside
# CRF_model -- confirm there.
crf.train()

ERROR: tried to log 0 or a negative no. when calculating Z(x)! Breaking function
Debug data:
Z[BOS_ID][EOS_ID]:
-3.63710532005095e+18
Z: 
[[ 9.24161822e+17  9.24161822e+17  9.24161822e+17  9.24161822e+17
   9.24161822e+17  9.24161822e+17  9.24161822e+17  9.24161822e+17
   9.24161822e+17  9.24161822e+17  9.24161822e+17  9.24161822e+17
   9.24161822e+17  9.24161822e+17  9.24161822e+17  9.24161822e+17]
 [-1.50658099e+18 -1.50658099e+18 -1.50658099e+18 -1.50658099e+18
  -1.50658099e+18 -1.50658099e+18 -1.50658099e+18 -1.50658099e+18
  -1.50658099e+18 -1.50658099e+18 -1.50658099e+18 -1.50658099e+18
  -1.50658099e+18 -1.50658099e+18 -1.50658099e+18 -1.50658099e+18]
 [-2.11210722e+18 -2.11210722e+18 -2.11210722e+18 -2.11210722e+18
  -2.11210722e+18 -2.11210722e+18 -2.11210722e+18 -2.11210722e+18
  -2.11210722e+18 -2.11210722e+18 -2.11210722e+18 -2.11210722e+18
  -2.11210722e+18 -2.11210722e+18 -2.11210722e+18 -2.11210722e+18]
 [-3.19532178e+18 -3.19532178e+18 -3.19532178e+18 -3.19532178e+18
 

TypeError: 'int' object is not subscriptable