In [4]:
import numpy as np
import regex as re
from tensorflow import math
import tensorflow as tf

In [19]:
class Data_Preprocessing:
    """Converts tokenised sentences and their labels into linear-chain CRF features.

    Every feature function produced by this class has the signature
    ``f(y_prev, y_cur, i, x, y, x_l)``:

    * ``y_prev`` / ``y_cur``: candidate labels of the bigram that ends at position ``i``
    * ``i``: bigram position, ``0 .. len(x)`` (the last one has no word of its own)
    * ``x``: the sentence (sequence of words)
    * ``y``: the sentence's own label sequence (passed through, unused by the
      built-in features)
    * ``x_l``: the lemma of every word in ``x``

    and returns 1 when the feature fires, otherwise 0.
    """

    def __init__(self, x_data, y_data, x_lemmas, x_vocab, labels, label_dict):

        #x_data: the sentences we're converting into features
        #y_data: the label sequences that correspond to those sentences
        #x_lemmas: the lemma forms of each word within each sentence
        #x_vocab: a set containing our x vocabulary
        #labels: a set (or ordered sequence) containing our y vocabulary
        #label_dict: mapping used by featurize_labels to encode a label; its 'OOV'
        #            entry is used for labels that are not keys of the mapping

        self.x = x_data
        self.y = y_data
        self.x_l = x_lemmas
        self.x_v = x_vocab
        self.l = labels
        self.l_d = label_dict

    @staticmethod
    def _ordered(items):
        """Return ``items`` as a list with a reproducible order.

        Sets are sorted because their iteration order is not stable between
        interpreter runs, which would silently permute feature indices.
        Anything else keeps the order it was given in.
        """
        if isinstance(items, (set, frozenset)):
            return sorted(items)
        return list(items)

    #the state function b(x, i) in Wallach, "Conditional Random Fields: An Introduction"
    def equivalence_func(self, x, i, x_seq):
        """Return 1 if the observation at position ``i`` of ``x_seq`` equals ``x``, else 0.

        Positions outside ``x_seq`` carry no observation and therefore yield 0
        instead of raising ``IndexError``.
        """
        return 1 if 0 <= i < len(x_seq) and x == x_seq[i] else 0

    def transition_funcs(self, labels):
        """Build one transition feature per ordered label pair.

        Args:
            labels: the label vocabulary (sets are sorted for a stable order).

        Returns:
            A list of ``len(labels) ** 2`` feature functions. The function for
            the pair ``(a, b)`` returns 1 when ``y_prev == a`` and
            ``y_cur == b``, else 0. Pairs are ordered with the previous label
            varying slowest.
        """
        ordered = self._ordered(labels)

        # The pair is bound through default arguments: a plain closure would see
        # only the last pair of the comprehension, and naming the lambda
        # parameters after the loop variables (as before) shadowed them entirely.
        return [
            lambda y_prev, y_cur, i, x, y, x_l, prev_label=prev_label, cur_label=cur_label:
                1 if y_prev == prev_label and y_cur == cur_label else 0
            for prev_label in ordered
            for cur_label in ordered
        ]

    def state_funcs(self):
        """Build the observation-dependent (state) features.

        Returns:
            A list holding, in this order: the lemma feature, the numeral
            feature, then one word/label feature per (label, vocabulary word)
            pair with the label varying slowest.
        """
        features = []

        #feature 1: 1 if lemma form is same as base form, else 0
        # (value comparison; `is` only tests object identity and is unreliable for strings)
        features.append(
            lambda y_prev, y_cur, i, x, y, x_l:
                1 if i < len(x) and i < len(x_l) and x_l[i] == x[i] else 0
        )
        #feature 2: 1 if a numeral is present in x[i], else 0
        features.append(
            lambda y_prev, y_cur, i, x, y, x_l:
                1 if i < len(x) and re.search('[0-9]', x[i]) else 0
        )
        #feature 3: a "word/label state" feature: fires when the word at position i
        #is this vocabulary word and the candidate label for position i is this label.
        label_list = self._ordered(self.l)
        word_list = self._ordered(self.x_v)
        features += [
            lambda y_prev, y_cur, i, x, y, x_l, x_word=x_word, y_label=y_label:
                1 if y_label == y_cur and self.equivalence_func(x_word, i, x) else 0
            for y_label in label_list
            for x_word in word_list
        ]

        return features

    #Generates features for a single sentence in our x dataset
    def featurize_sentence(self, sentence, labels, lemmas, funcs):
        """Evaluate every feature function on every position and label bigram.

        Args:
            sentence: the words of one sentence.
            labels: that sentence's label sequence (forwarded to the feature
                functions as ``y``).
            lemmas: the lemma of each word in ``sentence``.
            funcs: flat list of feature functions.

        Returns:
            A float array of shape
            ``(len(sentence) + 1, n_labels, n_labels, len(funcs))`` where
            ``n_labels`` is the size of the label vocabulary ``self.l`` and
            entry ``[i, j, k, f]`` is ``funcs[f]`` evaluated at position ``i``
            with previous label ``j`` and current label ``k``.
        """
        label_list = self._ordered(self.l)
        n_labels = len(label_list)
        #One more position than words since we're counting bigrams
        n_positions = len(sentence) + 1

        #Dimensions of our features: the amount of bigrams in our input sentence, all possible labels for n-1,
        #all possible labels for n, the amount of feature functions for each word
        features = np.zeros((n_positions, n_labels, n_labels, len(funcs)))

        #Enumerates over all possible label bigrams for each position
        for i in range(n_positions):
            for j, y_prev in enumerate(label_list):
                for k, y_cur in enumerate(label_list):
                    for f, func in enumerate(funcs):
                        # The whole lemma sequence is passed; the features index it themselves.
                        features[i, j, k, f] = func(y_prev, y_cur, i, sentence, labels, lemmas)

        return features

    def featurize_labels(self, labels):
        """Encode one label sequence, padded with 'BOS' and 'EOS'.

        The input sequence is left untouched, so the method can be called
        repeatedly on the same data.

        Raises:
            KeyError: if a label is missing from the label mapping and the
                mapping has no 'OOV' entry.
        """
        BOS = 'BOS'
        EOS = 'EOS'

        padded = [BOS] + list(labels) + [EOS]

        return [self.l_d[y] if y in self.l_d else self.l_d['OOV'] for y in padded]

    def generate_features(self):
        """Featurize the whole dataset.

        Returns:
            ``(x_vals, y_vals)``: one feature array per sentence (see
            ``featurize_sentence``) and one encoded, padded label list per
            sentence (see ``featurize_labels``).
        """
        # Built once and shared by all sentences; must be a flat list of callables.
        funcs = self.transition_funcs(self.l) + self.state_funcs()

        x_vals = [self.featurize_sentence(x, y, xl, funcs) for x, y, xl in zip(self.x, self.y, self.x_l)]
        y_vals = [self.featurize_labels(y) for y in self.y]

        return x_vals, y_vals
        

In [3]:
# Scratch check: evaluates e**1; the result is displayed only and not used by any other code in view.
np.exp(1)

2.718281828459045