### Implementation of LASER to the corpus
### CS-EN

In [1]:
# Imports
import numpy as np
from scipy import spatial
import pandas as pd
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import keras
from keras import models
from keras import layers


Using TensorFlow backend.


Load Dataset

In [2]:
df1 = pd.read_csv("scores.csv")

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11585 entries, 0 to 11584
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       11585 non-null  object 
 1   reference    11585 non-null  object 
 2   translation  11585 non-null  object 
 3   z-score      11585 non-null  float64
 4   avg-score    11585 non-null  float64
 5   annotators   11585 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 543.2+ KB


In [4]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,Uchopíte pak zbraň mezi své předloktí a rameno...,You will then grab the weapon between your for...,You then grasp the gun between your forearm an...,-0.675383,60.0,3
1,"Ale je-li New York změna, pak je to také znovu...","But if New York is changed, then it's also a r...","But if New York is change, it is also reinvent...",-0.829403,44.0,2
2,"Dlouho a intenzivně jsem během léta přemýšlel,...",I have been thinking over and over again over ...,I have thought long and hard over the course o...,0.803185,96.5,2
3,"Najdou si jiný způsob, jak někde podvádět.",They find another way to cheat somewhere.,They will find another way how to defraud others.,0.563149,90.5,2
4,Zpráva o výměně v čele prezidentovy administra...,The report on the replacement of the president...,The news of the replacement at the top of the ...,0.021549,74.666667,3


Comparison of LASER embeddings

In [5]:
source_arr = np.load("laser.source_embeds.npy")
refer_arr = np.load("laser.reference_embeds.npy")
trans_arr = np.load("laser.translation_embeds.npy")

Cosine Similarity

In [6]:
cos_similarity_ref_hyp = []
cos_similarity_src_ref = []
cos_similarity_src_hyp = []

# for each of the cos similarity, put them into lists.
for i in range(refer_arr.shape[0]):
    cos_similarity_ref_hyp.append((spatial.distance.cosine(refer_arr[i],trans_arr[i])*-1)+1)
    cos_similarity_src_ref.append((spatial.distance.cosine(source_arr[i],refer_arr[i])*-1)+1)
    cos_similarity_src_hyp.append((spatial.distance.cosine(source_arr[i],trans_arr[i])*-1)+1)


# Standardize and transform into series.
scaler = StandardScaler()
cs_rh = pd.Series(scaler.fit_transform(np.array(cos_similarity_ref_hyp).reshape(-1,1)).flatten(),name="cos_sim_ref_hyp")
cs_sr = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_ref).reshape(-1,1)).flatten(),name="cos_sim_src_ref")
cs_sh = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_hyp).reshape(-1,1)).flatten(),name="cos_sim_src_hyp")



In [7]:
cos_df = pd.concat([cs_rh,cs_sr,cs_sh,df1.loc[:,"z-score"]],axis=1)
cos_df.head()

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
0,0.944591,-0.059577,0.31333,-0.675383
1,-0.310273,-0.69982,0.766601,-0.829403
2,0.610518,0.22953,0.8765,0.803185
3,-0.546027,0.187428,0.142012,0.563149
4,-0.589712,0.53717,-0.51044,0.021549


In [8]:
# Pearson
print("Pairwise Pearson")
cos_df.corr()

Pairwise Pearson


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.604009,0.797262,0.430993
cos_sim_src_ref,0.604009,1.0,0.682677,0.193411
cos_sim_src_hyp,0.797262,0.682677,1.0,0.279916
z-score,0.430993,0.193411,0.279916,1.0


In [9]:
# Kendall
print("Pairwise Kendall")
cos_df.corr(method="kendall")

Pairwise Kendall


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.412618,0.617482,0.293171
cos_sim_src_ref,0.412618,1.0,0.519509,0.134352
cos_sim_src_hyp,0.617482,0.519509,1.0,0.178779
z-score,0.293171,0.134352,0.178779,1.0


In [10]:
# Mean Absolute Deviation of each of these.
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.7505428608912943
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.8685112112028813
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.8435137708922917


---
Linear Regression <br>to try and predict avg score based on the cos similarity 

In [11]:
X = cos_df.drop(columns='z-score')
y = cos_df['z-score']

In [12]:
lin_model = LinearRegression()

In [13]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    lin_model_3_mae = mean_absolute_error(y_val,y_pred)
    print(lin_model_3_mae)

0.5779222658053401
0.5857079428485272
0.6184520926808259
0.6078207010586433
0.7768129689038648
0.5612979583250219
0.5852983467641449
0.6135030256824922
0.5922014643887218
0.6984563470021806


In [14]:
lin_model_3_par = [lin_model.intercept_]
lin_model_3_par.extend([coef for coef in lin_model.coef_])
print(lin_model_3_par)

[-0.017747794267827563, 0.5008850510604128, -0.047184819744446126, -0.1253158772010228]


In [15]:
# Check the corr of this model. 
lin_3_pred = cos_df.apply(lambda x: lin_model_3_par[0] + lin_model_3_par[1]*x[0] + lin_model_3_par[2]*x[1] + lin_model_3_par[3]*x[2],axis=1)
lin_3_corr_p = lin_3_pred.corr(df1["z-score"], method="pearson")
lin_3_corr_k = lin_3_pred.corr(df1["z-score"], method="kendall")
print(str(lin_3_corr_p)+" pearson")
print(str(lin_3_corr_k)+" kendall")

0.44586422378085433 pearson
0.30368876066894357 kendall


In [16]:
# A model with src_ref and src_hyp only. 
X = cos_df.drop(columns=['cos_sim_ref_hyp','z-score'])
y = cos_df['z-score']

In [17]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))

0.6208575338875748
0.6397308062238354
0.6666137729859257
0.658962972318979
0.8436010303091502
0.6241674982128292
0.6509860838716531
0.6721840501001044
0.654924047298013
0.7494366317731082


In [18]:
y_pred

array([0.24608331, 0.09961482, 0.12248068, ..., 0.34918741, 0.20155448,
       0.00794485])

In [19]:
lin_model_2_par = [lin_model.intercept_]
lin_model_2_par.extend([coef for coef in lin_model.coef_])
print(lin_model_2_par)

[-0.01808070832084239, 0.01802516155853481, 0.23009733373077312]


In [20]:
# Check the corr of this model. 
lin_2_pred = cos_df.apply(lambda x: lin_model_2_par[0] + lin_model_2_par[1]*x[0] + lin_model_2_par[2]*x[1],axis=1)
lin_2_corr_p = lin_2_pred.corr(df1["z-score"])
lin_2_corr_k = lin_2_pred.corr(df1["z-score"],method="kendall")
print(str(lin_2_corr_p)+" pearson")
print(str(lin_2_corr_p)+" kendall")

0.21652616256543797 pearson
0.21652616256543797 kendall


---
Using a Neural Network

In [21]:
MLP_model = MLPRegressor(hidden_layer_sizes=(2,2),max_iter=250,learning_rate_init=0.0015,solver="lbfgs")

In [22]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [23]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
kf = KFold(5)
for train_index, val_index in kf.split(X_train):
    kf_X_train, X_val = X_train[train_index], X_train[val_index]
    kf_y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]    
    
    MLP_model.fit(kf_X_train,kf_y_train)
    y_val_pred = MLP_model.predict(X_val)
    print(f"Model loss: {MLP_model.loss_}")
    print(f"The MAE is {mean_absolute_error(y_val,y_val_pred)}")

Model loss: 0.3441028392724287
The MAE is 0.6828216373388843
Model loss: 0.34475682667418267
The MAE is 0.6649259172575942
Model loss: 0.34657736411580903
The MAE is 0.6771364437920069
Model loss: 0.3501870267445633
The MAE is 0.6717719562412185
Model loss: 0.3481556787963551
The MAE is 0.6681649805295131


---
Using Neural Networks, directly on the embeddings.

In [25]:
# An input has to be of shape (3,1024), because we have 3 embedded vectors of size 1024.
# combined array
#full_arr = np.dstack((source_arr,refer_arr,trans_arr))

# Option 2, make it size (1,2048), by concatenating the arrays. This is what we are using now, according to teacher's indications.
full_arr_c = np.concatenate((refer_arr,trans_arr),axis=1)


In [26]:
X = full_arr_c.copy()
y = df1["z-score"].to_numpy()

# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [27]:
X_train.shape

(10426, 2048)

In [28]:
model = models.Sequential()

model.add(layers.Dense(1024,activation="relu",input_dim=X_train.shape[1]))
model.add(layers.Dense(512,activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(128,activation="relu"))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(64,activation="relu"))

model.add(layers.Dense(1,activation="tanh"))
model.compile(optimizer=keras.optimizers.Adam(lr=0.002),loss="mse",metrics=["mae"])

In [29]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

In [30]:
# Fit. Batch Size < > Learning Rate

model.fit(X_train,y_train,epochs=50,batch_size=64,verbose=0)

<keras.callbacks.callbacks.History at 0x29bb66dd4a8>

In [31]:
# Evaluation and comparison.
test_mse,test_mae = model.evaluate(X_test,y_test)
test_corr = np.corrcoef(model.predict(X_test).flatten(),y_test)[0][1]
print(f"Test MSE: {test_mse:.4f}.\nTest MAE: {test_mae:.4f}.")
print(f"Pearson correlation between y_val_predicted and actual y_val: {test_corr:.4f}")

Test MSE: 0.6632.
Test MAE: 0.6200.
Pearson correlation between y_val_predicted and actual y_val: 0.4257


In [32]:
# For reference, pt.2
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.7505428608912943
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.8685112112028813
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.8435137708922917


In [33]:
print("Pearson correlation of cos_sim")
print(cos_df.corr().iloc[:,3])

Pearson correlation of cos_sim
cos_sim_ref_hyp    0.430993
cos_sim_src_ref    0.193411
cos_sim_src_hyp    0.279916
z-score            1.000000
Name: z-score, dtype: float64


In [34]:
print("Kendall correlation of cos_sim")
print(cos_df.corr(method="kendall").iloc[:,3])

Kendall correlation of cos_sim
cos_sim_ref_hyp    0.293171
cos_sim_src_ref    0.134352
cos_sim_src_hyp    0.178779
z-score            1.000000
Name: z-score, dtype: float64


In [35]:
# Corr of linear models on cos_sim
print(f"Linear Model with the 3 cos_similarities, pearson: {lin_3_corr_p}")
print(f"Linear Model with the 3 cos_similarities, kendall: {lin_3_corr_k}")
print(f"Linear Model with the 2 (src_ref and src_hyp) cos_similarities, pearson: {lin_2_corr_p}")
print(f"Linear Model with the 2 (src_ref and src_hyp) cos_similarities, kendall: {lin_2_corr_k}")

Linear Model with the 3 cos_similarities, pearson: 0.44586422378085433
Linear Model with the 3 cos_similarities, kendall: 0.30368876066894357
Linear Model with the 2 (src_ref and src_hyp) cos_similarities, pearson: 0.21652616256543797
Linear Model with the 2 (src_ref and src_hyp) cos_similarities, kendall: 0.15064035891872554


In [36]:
# Save Model.
model.save("cs_en_laser_model__testcorr_{:.4f}_MAE_{:.4f}_time{}.h5".format(test_corr,test_mae,datetime.now().strftime('%d_%m_%H_%M')
))