### Implementation of LASER to the corpus
### EN-FI

In [1]:
# Imports
import numpy as np
from scipy import spatial
import pandas as pd
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import keras
from keras import models
from keras import layers


Using TensorFlow backend.


Load Dataset

In [2]:
df1 = pd.read_csv("scores.csv")

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6748 entries, 0 to 6747
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       6748 non-null   object 
 1   reference    6748 non-null   object 
 2   translation  6748 non-null   object 
 3   z-score      6748 non-null   float64
 4   avg-score    6748 non-null   float64
 5   annotators   6748 non-null   int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 316.4+ KB


In [4]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"You can turn yourself into a pineapple, a dog ...","Voit muuttaa itsesi ananasta, koirasta tai Roy...","Voit muuttaa itsesi ananakseksi, koiraksi tai ...",-0.286195,34.2,5
1,Also shot were three men: two 29-year-olds and...,Myös ammuttiin kolme miestä: kaksi 29-vuotiait...,Myös kolmea miestä ammuttiin: kahta 29-vuotias...,0.547076,58.4,5
2,The information is stored at the cash register...,Tiedot tallennetaan kassakoneisiin joka tapauk...,Tiedot kuitenkin tallentuvat kassoilla joka ta...,1.122476,74.6,5
3,Xinhua says that there were traces of hydrochl...,"Xinhua kertoo, että Xinyin näytteestä oli sunn...","Xinhua kertoo, että Xinyin sunnuntaina antamas...",0.383095,53.6,5
4,"MacDonald, who was brought on board CBC's comm...",Voitaisiin kuulla CBD: n kommenttitiimin toimi...,"MacDonaldin, joka tuli CBC:n selostajatiimiin ...",-0.493065,32.25,4


Comparison of LASER embeddings

In [5]:
source_arr = np.load("laser.source_embeds.npy")
refer_arr = np.load("laser.reference_embeds.npy")
trans_arr = np.load("laser.translation_embeds.npy")

Cosine Similarity

In [6]:
cos_similarity_ref_hyp = []
cos_similarity_src_ref = []
cos_similarity_src_hyp = []

# for each of the cos similarity, put them into lists.
for i in range(refer_arr.shape[0]):
    cos_similarity_ref_hyp.append((spatial.distance.cosine(refer_arr[i],trans_arr[i])*-1)+1)
    cos_similarity_src_ref.append((spatial.distance.cosine(source_arr[i],refer_arr[i])*-1)+1)
    cos_similarity_src_hyp.append((spatial.distance.cosine(source_arr[i],trans_arr[i])*-1)+1)


# Standardize and transform into series.
scaler = StandardScaler()
cs_rh = pd.Series(scaler.fit_transform(np.array(cos_similarity_ref_hyp).reshape(-1,1)).flatten(),name="cos_sim_ref_hyp")
cs_sr = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_ref).reshape(-1,1)).flatten(),name="cos_sim_src_ref")
cs_sh = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_hyp).reshape(-1,1)).flatten(),name="cos_sim_src_hyp")



In [7]:
cos_df = pd.concat([cs_rh,cs_sr,cs_sh,df1.loc[:,"z-score"]],axis=1)
cos_df.head()

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
0,0.477108,0.201864,0.526518,-0.286195
1,1.095012,0.719836,0.963557,0.547076
2,0.16388,0.209093,-1.019948,1.122476
3,0.608808,-0.279303,-0.384327,0.383095
4,-0.16637,0.129337,0.2895,-0.493065


In [8]:
# Pearson
print("Pairwise Pearson")
cos_df.corr()

Pairwise Pearson


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.845841,0.424134,0.580044
cos_sim_src_ref,0.845841,1.0,0.411307,0.473789
cos_sim_src_hyp,0.424134,0.411307,1.0,0.120887
z-score,0.580044,0.473789,0.120887,1.0


In [9]:
# Kendall
print("Pairwise Kendall")
cos_df.corr(method="kendall")

Pairwise Kendall


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.496232,0.426147,0.396893
cos_sim_src_ref,0.496232,1.0,0.415437,0.285051
cos_sim_src_hyp,0.426147,0.415437,1.0,0.090363
z-score,0.396893,0.285051,0.090363,1.0


In [10]:
# Mean Absolute Deviation of each of these.
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.6917964800619091
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.7733002483540644
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.9760511145354878


---
Linear Regression <br>to try and predict avg score based on the cos similarity 

In [11]:
X = cos_df.drop(columns='z-score')
y = cos_df['z-score']

In [12]:
lin_model = LinearRegression()

In [13]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    lin_model_3_mae = mean_absolute_error(y_val,y_pred)
    print(lin_model_3_mae)

0.6468149724838524
0.5216359161669204
0.5891346545590176
0.5811340058834468
0.5058610726747285
0.6333172450476015
0.6290470524901891
0.6102779526848207
0.604387008203543
0.5502845073983063


In [14]:
lin_model_3_par = [lin_model.intercept_]
lin_model_3_par.extend([coef for coef in lin_model.coef_])
print(lin_model_3_par)

[-0.12910357677916118, 0.6033789040394587, -0.03223887391205409, -0.12863824289315037]


In [15]:
# Check the corr of this model. 
lin_3_pred = cos_df.apply(lambda x: lin_model_3_par[0] + lin_model_3_par[1]*x[0] + lin_model_3_par[2]*x[1] + lin_model_3_par[3]*x[2],axis=1)
lin_3_corr_p = lin_3_pred.corr(df1["z-score"], method="pearson")
lin_3_corr_k = lin_3_pred.corr(df1["z-score"], method="kendall")
print(str(lin_3_corr_p)+" pearson")
print(str(lin_3_corr_k)+" kendall")

0.5964671629760575 pearson
0.4175936670826609 kendall


In [16]:
# A model with src_ref and src_hyp only. 
X = cos_df.drop(columns=['cos_sim_ref_hyp','z-score'])
y = cos_df['z-score']

In [17]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))

0.7050738881793724
0.555527514567338
0.6281727759117081
0.6279054469972988
0.5632306713185548
0.6283650135244989
0.6881284006574842
0.6807746537623373
0.6675518484751372
0.5884071146266736


In [18]:
y_pred

array([-1.35867745e-01, -7.49542407e-01,  5.75584852e-02, -6.14337038e-01,
        1.99056500e-01,  1.49862276e-01, -1.34404671e-01, -2.18168492e-01,
       -2.74559980e-02, -1.37366406e+00, -5.86309862e-01,  5.38773912e-02,
        3.47617470e-02, -4.16060472e-01, -1.73565166e-02, -5.23431664e-02,
        7.05609163e-02,  2.15281024e-01,  5.12335711e-03,  2.33288897e-02,
       -3.16280503e-01, -1.40422029e-01,  1.42377211e-01,  1.32901871e-01,
       -1.26015190e-01,  2.22110656e-01, -3.66982295e-01,  1.02114820e-01,
        6.25822506e-02, -5.92147077e-01,  2.82625365e-01,  1.99466035e-01,
       -5.72518350e-02,  2.18319290e-01, -1.37853419e-01, -1.21254384e-01,
        1.05134856e-01,  2.03434995e-02,  6.78040110e-02, -2.65909374e-01,
       -1.99589275e-01, -1.75841720e-01,  1.88132933e-01,  6.00034354e-02,
       -5.18864424e-01,  1.08209975e-01,  1.27406164e-01, -2.73179516e-02,
        1.00146652e-01, -3.25212911e-01,  1.94734530e-01, -3.43042808e-01,
        2.42561631e-01, -

In [19]:
lin_model_2_par = [lin_model.intercept_]
lin_model_2_par.extend([coef for coef in lin_model.coef_])
print(lin_model_2_par)

[-0.12420079734183924, 0.46567598768565377, -0.08263205778532721]


In [20]:
# Check the corr of this model. 
lin_2_pred = cos_df.apply(lambda x: lin_model_2_par[0] + lin_model_2_par[1]*x[0] + lin_model_2_par[2]*x[1],axis=1)
lin_2_corr_p = lin_2_pred.corr(df1["z-score"])
lin_2_corr_k = lin_2_pred.corr(df1["z-score"],method="kendall")
print(str(lin_2_corr_p)+" pearson")
print(str(lin_2_corr_p)+" kendall")

0.579973394417236 pearson
0.579973394417236 kendall


---
Using a Neural Network

In [21]:
MLP_model = MLPRegressor(hidden_layer_sizes=(2,2),max_iter=250,learning_rate_init=0.0015,solver="lbfgs")

In [22]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [23]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
kf = KFold(5)
for train_index, val_index in kf.split(X_train):
    kf_X_train, X_val = X_train[train_index], X_train[val_index]
    kf_y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]    
    
    MLP_model.fit(kf_X_train,kf_y_train)
    y_val_pred = MLP_model.predict(X_val)
    print(f"Model loss: {MLP_model.loss_}")
    print(f"The MAE is {mean_absolute_error(y_val,y_val_pred)}")

Model loss: 0.29345538877037475
The MAE is 0.6305805977262839
Model loss: 0.3917660954954603
The MAE is 0.7369247136040338
Model loss: 0.28602705696677616
The MAE is 0.6089573017716251
Model loss: 0.291571739540215
The MAE is 0.5898337554168828
Model loss: 0.2972188243189721
The MAE is 0.6101514771694931


---
Using Neural Networks, directly on the embeddings.

In [25]:
# An input has to be of shape (3,1024), because we have 3 embedded vectors of size 1024.
# combined array
#full_arr = np.dstack((source_arr,refer_arr,trans_arr))

# Option 2, make it size (1,2048), by concatenating the arrays. This is what we are using now, according to teacher's indications.
full_arr_c = np.concatenate((refer_arr,trans_arr),axis=1)


In [26]:
X = full_arr_c.copy()
y = df1["z-score"].to_numpy()

# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [27]:
X_train.shape

(6073, 2048)

In [28]:
model = models.Sequential()

model.add(layers.Dense(1024,activation="relu",input_dim=X_train.shape[1]))
model.add(layers.Dense(512,activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(128,activation="relu"))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(64,activation="relu"))

model.add(layers.Dense(1,activation="tanh"))
model.compile(optimizer=keras.optimizers.Adam(lr=0.002),loss="mse",metrics=["mae"])

In [29]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

In [30]:
# Fit. Batch Size < > Learning Rate

model.fit(X_train,y_train,epochs=50,batch_size=64,verbose=0)

<keras.callbacks.callbacks.History at 0x1ff85aa6b70>

In [31]:
# Evaluation and comparison.
test_mse,test_mae = model.evaluate(X_test,y_test)
test_corr = np.corrcoef(model.predict(X_test).flatten(),y_test)[0][1]
print(f"Test MSE: {test_mse:.4f}.\nTest MAE: {test_mae:.4f}.")
print(f"Pearson correlation between y_val_predicted and actual y_val: {test_corr:.4f}")

Test MSE: 0.6503.
Test MAE: 0.6510.
Pearson correlation between y_val_predicted and actual y_val: 0.5006


In [32]:
# For reference, pt.2
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.6917964800619091
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.7733002483540644
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.9760511145354878


In [33]:
print("Pearson correlation of cos_sim")
print(cos_df.corr().iloc[:,3])

Pearson correlation of cos_sim
cos_sim_ref_hyp    0.580044
cos_sim_src_ref    0.473789
cos_sim_src_hyp    0.120887
z-score            1.000000
Name: z-score, dtype: float64


In [34]:
print("Kendall correlation of cos_sim")
print(cos_df.corr(method="kendall").iloc[:,3])

Kendall correlation of cos_sim
cos_sim_ref_hyp    0.396893
cos_sim_src_ref    0.285051
cos_sim_src_hyp    0.090363
z-score            1.000000
Name: z-score, dtype: float64


In [35]:
# Corr of linear models on cos_sim
print(f"Linear Model with the 3 cos_similarities, pearson: {lin_3_corr_p}")
print(f"Linear Model with the 3 cos_similarities, kendall: {lin_3_corr_k}")
print(f"Linear Model with the 2 (src_ref and src_hyp) cos_similarities, pearson: {lin_2_corr_p}")
print(f"Linear Model with the 2 (src_ref and src_hyp) cos_similarities, kendall: {lin_2_corr_k}")

Linear Model with the 3 cos_similarities, pearson: 0.5964671629760575
Linear Model with the 3 cos_similarities, kendall: 0.4175936670826609
Linear Model with the 2 (src_ref and src_hyp) cos_similarities, pearson: 0.579973394417236
Linear Model with the 2 (src_ref and src_hyp) cos_similarities, kendall: 0.3942370453509555


In [36]:
# Save Model.
model.save("en_fi_laser_model__testcorr_{:.4f}_MAE_{:.4f}_time{}.h5".format(test_corr,test_mae,datetime.now().strftime('%d_%m_%H_%M')
))