In [207]:
import pandas as pd 
import numpy as np



In [170]:
def  MLE_emission_parameters(train_df):
    ''' Calculates the emission parameters by count(y->x)/count(y)
    
    :param train_df: our train file in either ES or RU
    :type train_df: pd.DataFrame

    :return: emission_df, indexes are tuple of words and output ('!', 'O'), column MLE
    :rtype: pd.DataFrame
    
    '''
    train_df = train_df[0].str.split(" ", expand=True) # Get our words and labels
    train_df.rename(columns={0: "word", 1: "label"},inplace=True) 
    y_labels_counts = train_df['label'].value_counts() # count(y) for each label
    print("count(y): \n", y_labels_counts, "\n")
    y_labels_to_x_counts = pd.DataFrame(train_df.groupby(by=["word",'label']).size(),index=None,columns=["count"]) # count(y->x) for each label and word
    print("count(y->x): \n",y_labels_to_x_counts, "\n")
    emission_df = y_labels_to_x_counts.join(other=y_labels_counts,on="label") 
    emission_df['MLE'] = emission_df['count'] /emission_df['label'] # calculates our MLE
    df = pd.DataFrame(emission_df['MLE'])
    df.reset_index(inplace=True)
    # print(df)
    return df

def  add_unknown_word_token(train_df, k=1):
    ''' Appends the unknown_word_token
    
    :param train_df: our train file in either ES or RU
    :type train_df: pd.DataFrame

    :param k: we assume we have observed that there are k occurrences of such an event.
    :type k: int

    :return: emission_plus_unknown_df, indexes are tuple of words and output ('!', 'O'), column MLE
    :rtype: pd.DataFrame
    
    '''
    y_labels_counts = train_df['label'].value_counts() # count(y) for each label
    for idx in y_labels_counts.index:
        count_y = y_labels_counts.loc[idx]
        train_df = train_df.append({"word":"#UNK#","label":idx,"MLE":k/(count_y+k)},ignore_index=True) 
    return train_df


# Finding Emission Parameter

In [172]:
train_dir = "data/ES/train"

train_df = pd.read_csv(train_dir,sep="/t",index_col=None,header=None) # Fix error of tokenization
df = MLE_emission_parameters(train_df)
df = add_unknown_word_token(df,k=1)
df

count(y): 
 O             31627
B-positive     1274
B-negative      429
I-positive      400
I-negative      229
B-neutral        85
I-neutral        44
Name: label, dtype: int64 

count(y->x): 
                  count
word label            
!    O             162
"    I-negative      2
     I-positive      3
     O              50
%    O              13
...                ...
â€˜  O               1
â€™  O               9
â€�  I-negative      1
     I-positive      1
     O              34

[5969 rows x 1 columns] 



  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,word,label,MLE
0,!,O,0.005122
1,"""",I-negative,0.008734
2,"""",I-positive,0.007500
3,"""",O,0.001581
4,%,O,0.000411
...,...,...,...
5971,#UNK#,B-negative,0.004695
5972,#UNK#,I-positive,0.004831
5973,#UNK#,I-negative,0.007194
5974,#UNK#,B-neutral,0.018519


# Emissions table

In [231]:
emission_table = pd.pivot_table(df, values='MLE', index=['word'],
                    columns=['label'])
emission_table.fillna(0,inplace=True)
emission_table

label,B-negative,B-neutral,B-positive,I-negative,I-neutral,I-positive,O
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
!,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005122
"""",0.000000,0.000000,0.000000,0.008734,0.000000,0.007500,0.001581
#UNK#,0.004695,0.018519,0.003236,0.007194,0.029412,0.004831,0.000199
%,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000411
(,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004395
...,...,...,...,...,...,...,...
â€³,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000032
â€œ,0.000000,0.000000,0.000000,0.000000,0.000000,0.002500,0.000980
â€˜,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000032
â€™,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000285


# Predicting our sequence labels

In [293]:
def predict_y(test_df,emission_table,output_dir="data/ES/dev.p1.out"):
    ''' Finds our predicted_y with our emission_table
    
    :param train_df: our train file in either ES or RU
    :type train_df: pd.DataFrame

    :return: emission_df, indexes are tuple of words and output ('!', 'O'), column MLE
    :rtype: pd.DataFrame
    
    '''

    with open(output_dir,'w', encoding="utf-8") as f:
        emission_set = set(emission_table.index)
        for row in test_df.values: # Finds for each x
            x = row[0]
            if pd.isna(x):
                f.write("\n")
                continue
            if x not in emission_set: # If there is no such word in emission set set x as unknown
                x = "#UNK#"
            predicted_y_idx = np.argmax(emission_table.loc[x])
            predicted_y = emission_table.columns[predicted_y_idx] # Convert argmax index to predicted name
            f.write(f"{row[0]} {predicted_y}\n") # write in our original word
            
def analysis(predicted_df,truth_df):

    correct_predictions = 0

    for i in range(len(truth_df)):
        predicted_df.iloc(i) 
    





In [294]:
test_dir = "data/ES/dev.in"

test_df = pd.read_csv(test_dir,sep="/t",index_col=None,header=None,skip_blank_lines=False) # Fix error of tokenization
test_df

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0
0,La
1,comida
2,estuvo
3,muy
4,sabrosa
...,...
5555,faltara
5556,de
5557,nada
5558,!


In [295]:
predict_y(test_df,emission_table)

['La']
['comida']
['estuvo']
['muy']
['sabrosa']
['.']
[nan]
['Servicio']
['esmerado']
['.']
[nan]
['Para']
['un']
['menu']
['gastronomico']
[',']
['si']
['la']
['comida']
['es']
['solo']
['buena']
[',']
['por']
['no']
['decir']
['discreta']
['(']
['en']
['lo']
['que']
['a']
['paella']
['como']
['plato']
['principal']
['se']
['refiere']
[')']
['el']
['resultado']
['fue']
['algo']
['decepcionante']
[',']
['aunque']
['el']
['precio']
['podria']
['considerarse']
['excepcional']
['.']
[nan]
['Servicio']
['atento']
['y']
['eficiente']
[',']
['ambiente']
['agradable']
['.']
[nan]
['Tienen']
['una']
['carta']
['bastante']
['extensa']
['donde']
['cada']
['plato']
['parece']
['mejor']
['que']
['el']
['anterior']
['.']
[nan]
['Un']
['10']
['en']
['comida']
['mediterranea']
['!']
[nan]
['Una']
['verguenza']
['todo']
['.']
[nan]
['lo']
['tendre']
['en']
['cuenta']
['para']
['las']
['proximas']
['veces']
['.']
[nan]
['Mi']
['opiniÃ³n']
['sobre']
['el']
['restaurante']
['La']
['Poma']
['es']
['que']