In [2]:
import numpy as np

In [90]:
def MLE_emission_parameters(train_dir):
    ''' Estimates the emission parameters as Count(y->x) / Count(y).

    Each usable line of the training file holds exactly two space-separated
    fields, ``word label``. Any other line (for example the blank lines
    between sentences) is skipped.

    :param train_dir: path of the training file to read (e.g. "data/ES/train")
    :type train_dir: str

    :return: count_y_dict, Count(y); keys are labels such as 'O', values are counts
    :rtype: dict

    :return: count_y_to_x_dict, Count(y->x); keys are (word, label) tuples
             such as ('!', 'O'), values are counts
    :rtype: dict

    :return: emission_dict, Count(y->x)/Count(y); keys are (word, label)
             tuples such as ('!', 'O'), values are the MLE estimates
    :rtype: dict

    '''
    count_y_dict = {}
    count_y_to_x_dict = {}

    with open(train_dir, "r", encoding="utf8") as f:
        for line in f:
            fields = line.replace("\n", "").split(" ")
            if len(fields) != 2:
                # Sentence separators and malformed lines carry no (word, label) pair.
                continue
            word, label = fields
            count_y_dict[label] = count_y_dict.get(label, 0) + 1
            count_y_to_x_dict[(word, label)] = count_y_to_x_dict.get((word, label), 0) + 1

    print("count(y): \n", count_y_dict, "\n")
    print("count(y->x): \n",list(count_y_to_x_dict.items())[0:5], len(count_y_to_x_dict), "\n")

    # Every label seen in a pair was also counted above, so the divisor is always >= 1.
    emission_dict = {
        pair: pair_count / count_y_dict[pair[1]]
        for pair, pair_count in count_y_to_x_dict.items()
    }
    print("MLE: \n",list(emission_dict.items())[0:5], len(emission_dict) ,"\n")

    return count_y_dict, count_y_to_x_dict, emission_dict

def add_unknown_word_token(count_y_dict,emission_dict, k=1):
    ''' Adds an emission entry for the "#UNK#" token under every label.

    For each label y in ``count_y_dict`` the entry ("#UNK#", y) is set to
    k / (Count(y) + k). ``emission_dict`` is modified in place and the same
    object is returned. Entries already present for other words are left
    untouched (they are not rescaled).

    :param count_y_dict: Count(y), keys are labels, values are counts
    :type count_y_dict: dict

    :param emission_dict: Emission dictionary, keys are (word, label) tuples
    :type emission_dict: dict

    :param k: assumed number of occurrences of the unknown-word event per label
    :type k: int

    :return: emission_dict with the ("#UNK#", label) entries added
    :rtype: dict

    '''
    print("#UNK# values:")
    for label, label_count in count_y_dict.items():
        unk_key = ("#UNK#", label)
        emission_dict[unk_key] = k / (label_count + k)
        print(unk_key, emission_dict[unk_key])
    return emission_dict


# Finding Emission Parameters

In [143]:
# Path of the ES training file handed to MLE_emission_parameters.
train_dir = "data/ES/train"

# Build the label counts, the (word, label) counts and the emission estimates.
count_y_dict, count_y_to_x_dict, emission_dict = MLE_emission_parameters(train_dir)
# Add one ("#UNK#", label) entry per label; k=1 is the assumed unknown-word count.
emission_dict = add_unknown_word_token(count_y_dict,emission_dict,k=1)
# Bare last expression: the notebook displays the whole dictionary as the cell output.
emission_dict

count(y): 
 {'O': 31627, 'B-positive': 1274, 'B-negative': 429, 'I-negative': 229, 'I-positive': 400, 'B-neutral': 85, 'I-neutral': 44} 

count(y->x): 
 [(('disfrutemos', 'O'), 1), (('de', 'O'), 1091), (('una', 'O'), 238), (('buenísima', 'O'), 4), (('calidad', 'O'), 159)] 5969 

MLE: 
 [(('disfrutemos', 'O'), 3.161855376735068e-05), (('de', 'O'), 0.03449584216017959), (('una', 'O'), 0.007525215796629462), (('buenísima', 'O'), 0.00012647421506940273), (('calidad', 'O'), 0.005027350049008759)] 5969 

#UNK# values:
('#UNK#', 'O') 3.161755406601745e-05
('#UNK#', 'B-positive') 0.000784313725490196
('#UNK#', 'B-negative') 0.002325581395348837
('#UNK#', 'I-negative') 0.004347826086956522
('#UNK#', 'I-positive') 0.0024937655860349127
('#UNK#', 'B-neutral') 0.011627906976744186
('#UNK#', 'I-neutral') 0.022222222222222223


{('disfrutemos', 'O'): 3.161855376735068e-05,
 ('de', 'O'): 0.03449584216017959,
 ('una', 'O'): 0.007525215796629462,
 ('buenísima', 'O'): 0.00012647421506940273,
 ('calidad', 'O'): 0.005027350049008759,
 ('en', 'O'): 0.01799095709362254,
 ('el', 'O'): 0.02219622474468018,
 ('producto', 'B-positive'): 0.009419152276295133,
 ('y', 'O'): 0.03569734720333892,
 ('inmejorable', 'O'): 0.0004742783065102602,
 ('relación', 'O'): 0.001486072027065482,
 ('precio', 'O'): 0.004236886204824992,
 ('.', 'O'): 0.05659721124355772,
 ('Hoy', 'O'): 0.0001897113226041041,
 ('he', 'O'): 0.0012647421506940273,
 ('ido', 'O'): 0.0006007525215796629,
 ('a', 'O'): 0.014038637872703702,
 ('comer', 'O'): 0.0016441647959022354,
 ('con', 'O'): 0.010940019603503336,
 ('mia', 'O'): 3.161855376735068e-05,
 ('padres', 'O'): 3.161855376735068e-05,
 ('salido', 'O'): 3.161855376735068e-05,
 ('muy', 'O'): 0.013785689442564896,
 ('defraudado', 'O'): 0.00012647421506940273,
 ('A', 'O'): 0.000695608182881715,
 ('mejorar', 'O'

# Predicting our sequence labels

In [150]:
def predict_y(emission_dict,test_dir="data/ES/dev.in",output_dir="data/ES/dev.p1.out"):
    ''' Labels every word of a test file with its highest-emission label.

    The input file holds one word per line, with empty lines separating
    sentences. For each word the label y maximising emission_dict[(word, y)]
    is chosen; a word that never appears in ``emission_dict`` is scored with
    the "#UNK#" entries instead. The output file gets one ``word label`` line
    per input word (always the original word, never "#UNK#") and an empty
    line wherever the input had one.

    :param emission_dict: emission values, keys are (word, label) tuples
    :type emission_dict: dict

    :param test_dir: path of the input file to label
    :type test_dir: str

    :param output_dir: path of the prediction file to write
    :type output_dir: str

    :return: None, the predictions are written to ``output_dir``
    :rtype: None

    '''
    known_words = {word for word, _ in emission_dict}
    # Sorted so that ties between labels are broken the same way on every run
    # (iteration order of a set of strings is not stable across interpreter runs).
    labels = sorted({label for _, label in emission_dict})

    # The input is opened first so a missing test file does not leave a
    # freshly truncated output file behind.
    with open(test_dir, "r", encoding="utf-8") as src, \
            open(output_dir, "w", encoding="utf-8") as dst:
        for line in src:
            word = line.rstrip("\n")
            if not word:
                # Keep the sentence boundary.
                dst.write("\n")
                continue

            # Score unseen words through the "#UNK#" entries, but keep `word`
            # intact so the original token is what gets written out.
            lookup_word = word if word in known_words else "#UNK#"

            # Missing (word, label) pairs count as 0; max() returns the first
            # best label, so an all-zero row falls back to labels[0].
            predicted_y = max(
                labels,
                key=lambda label: emission_dict.get((lookup_word, label)) or 0,
            )
            dst.write(f"{word} {predicted_y}\n")
            
def analysis(predicted_df,truth_df):
    ''' Counts the positions at which the prediction equals the ground truth.

    Row i of ``predicted_df`` is compared with row i of ``truth_df`` using
    positional indexing (``.iloc[i]``). Both inputs must have the same length.

    :param predicted_df: predictions, one entry per token
    :type predicted_df: pd.DataFrame or pd.Series

    :param truth_df: ground truth, aligned row by row with ``predicted_df``
    :type truth_df: pd.DataFrame or pd.Series

    :return: correct_predictions, the number of matching rows
    :rtype: int

    :raises ValueError: if the two inputs differ in length
    '''
    if len(predicted_df) != len(truth_df):
        # Misaligned inputs would silently produce a meaningless count.
        raise ValueError(
            f"length mismatch: {len(predicted_df)} predictions vs {len(truth_df)} truth rows"
        )

    correct_predictions = 0

    for i in range(len(truth_df)):
        # .iloc is indexed with [], not called: .iloc(i) treats i as an axis.
        predicted_row = predicted_df.iloc[i]
        truth_row = truth_df.iloc[i]
        if hasattr(predicted_row, "equals"):
            # DataFrame input: each row is a Series, compared as a whole.
            is_match = predicted_row.equals(truth_row)
        else:
            # Series input: each row is a scalar.
            is_match = predicted_row == truth_row
        if is_match:
            correct_predictions += 1

    return correct_predictions

In [151]:
# Path of the ES dev input file to label.
test_dir = "data/ES/dev.in"
# output_dir is not passed, so predictions go to predict_y's default, "data/ES/dev.p1.out".
predict_y(emission_dict,test_dir)