Application of LASER to the corpus

In [1]:
# Imports
import numpy as np
from scipy import spatial
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error

Load Dataset

In [2]:
# Load the scored corpus (source / reference / translation texts plus their scores) from CSV.
scores_csv_path = "scores.csv"
df1 = pd.read_csv(scores_csv_path)

In [3]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21704 entries, 0 to 21703
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       21704 non-null  object 
 1   reference    21704 non-null  object 
 2   translation  21704 non-null  object 
 3   z-score      21704 non-null  float64
 4   avg-score    21704 non-null  float64
 5   annotators   21704 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 1017.5+ KB


In [4]:
# Preview the first rows to sanity-check the parsed columns.
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2


Comparison of LASER embeddings

In [5]:
# Load the three saved embedding arrays (the file names indicate LASER embeddings of
# the source, reference and translation texts, in that order).
# NOTE(review): presumably row i of each array belongs to row i of df1 -- confirm.
source_arr, refer_arr, trans_arr = (
    np.load(embeds_file)
    for embeds_file in (
        "laser.source_embeds.npy",
        "laser.reference_embeds.npy",
        "laser.translation_embeds.npy",
    )
)

Cosine Similarity

In [6]:
def _rowwise_cosine_similarity(left, right):
    """Return the cosine similarity between corresponding rows of two 2-D arrays.

    Parameters
    ----------
    left, right : array-like of shape (n_rows, n_dims)
        Row ``i`` of ``left`` is compared with row ``i`` of ``right``.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        Similarities as float64, clipped to ``[-1, 1]``. A row whose norm is
        zero in either input yields ``nan`` (0/0).

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their shapes differ. Without this check a
        length mismatch would either fail part-way through or silently drop
        the surplus rows.
    """
    left = np.asarray(left)
    right = np.asarray(right)
    if left.ndim != 2 or left.shape != right.shape:
        raise ValueError(
            "expected two 2-D arrays of identical shape, "
            f"got {left.shape} and {right.shape}"
        )
    # Row-wise dot products; accumulating in float64 keeps the result independent
    # of the precision the embeddings were stored in.
    dots = np.einsum("ij,ij->i", left, right, dtype=np.float64)
    left_sq = np.einsum("ij,ij->i", left, left, dtype=np.float64)
    right_sq = np.einsum("ij,ij->i", right, right, dtype=np.float64)
    # Clip away rounding overshoot so every similarity stays inside [-1, 1].
    return np.clip(dots / np.sqrt(left_sq * right_sq), -1.0, 1.0)


# One similarity per corpus row for each pair of texts. The values are kept as plain
# lists under their original names in case other cells still refer to them.
cos_similarity_ref_hyp = _rowwise_cosine_similarity(refer_arr, trans_arr).tolist()
cos_similarity_src_ref = _rowwise_cosine_similarity(source_arr, refer_arr).tolist()
cos_similarity_src_hyp = _rowwise_cosine_similarity(source_arr, trans_arr).tolist()

cs_rh = pd.Series(cos_similarity_ref_hyp, name="cos_sim_ref_hyp")
cs_sr = pd.Series(cos_similarity_src_ref, name="cos_sim_src_ref")
cs_sh = pd.Series(cos_similarity_src_hyp, name="cos_sim_src_hyp")

In [7]:
# Place the three similarity series next to df1's avg-score column (aligned on the index).
cos_df = pd.concat(
    [cs_rh, cs_sr, cs_sh, df1["avg-score"]],
    axis="columns",
)
cos_df

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,avg-score
0,0.797121,0.882998,0.789376,76.0
1,0.959148,0.933919,0.921730,97.5
2,0.837410,0.912467,0.884474,94.0
3,0.815609,0.920045,0.816867,51.5
4,0.819054,0.947187,0.803535,87.0
...,...,...,...,...
21699,0.961398,0.964125,0.959146,100.0
21700,0.968720,0.885553,0.873433,98.0
21701,0.932501,0.939480,0.915016,76.0
21702,0.936046,0.893743,0.921694,61.0


In [8]:
# Pearson correlation between every pair of columns (pandas' default method, spelled out).
print("Pairwise Pearson")
pearson_corr = cos_df.corr(method="pearson")
pearson_corr

Pairwise Pearson


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,avg-score
cos_sim_ref_hyp,1.0,0.618973,0.79733,0.280546
cos_sim_src_ref,0.618973,1.0,0.692192,0.163928
cos_sim_src_hyp,0.79733,0.692192,1.0,0.1744
avg-score,0.280546,0.163928,0.1744,1.0


In [9]:
# Kendall rank correlation between every pair of columns.
print("Pairwise Kendall")
kendall_corr = cos_df.corr(method="kendall")
kendall_corr

Pairwise Kendall


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,avg-score
cos_sim_ref_hyp,1.0,0.435721,0.608597,0.205159
cos_sim_src_ref,0.435721,1.0,0.540446,0.108857
cos_sim_src_hyp,0.608597,0.540446,1.0,0.120058
avg-score,0.205159,0.108857,0.120058,1.0


In [10]:
# Spearman rank correlation between every pair of columns.
print("Pairwise Spearman")
spearman_corr = cos_df.corr(method="spearman")
spearman_corr

Pairwise Spearman


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,avg-score
cos_sim_ref_hyp,1.0,0.604819,0.787616,0.29984
cos_sim_src_ref,0.604819,1.0,0.723731,0.16077
cos_sim_src_hyp,0.787616,0.723731,1.0,0.176985
avg-score,0.29984,0.16077,0.176985,1.0


In [11]:
## How about training a neural network to predict the avg score from these similarity values?
# We hypothesize that the reference itself is sometimes not a particularly good translation of the source.
# If so, that will affect how good the hypothesis (the candidate translation) appears to be.


---
Linear Regression <br>to try and predict avg score based on the cos similarity 

In [12]:
# Target: the avg-score column. Features: every remaining (similarity) column.
y = cos_df['avg-score']
X = cos_df.drop('avg-score', axis="columns")

In [13]:
# Linear regression with scikit-learn's default settings. The same instance is
# refit on every fold by the cross-validation loops below.
lin_model = LinearRegression()

In [14]:
# 10-fold cross-validation: refit the linear model on each training split and print
# the mean absolute error on the held-out split. KFold is used with its defaults,
# i.e. the rows are split in their existing order.
kf = KFold(n_splits=10)
for train_idx, val_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = lin_model.fit(X_train, y_train).predict(X_val)
    fold_mae = mean_absolute_error(y_val, y_pred)
    print(fold_mae)

20.643857248069885
22.095997530264782
21.374442416477358
19.788936854485776
18.867238070215205
20.24474878509248
18.28611122388313
20.184921857868726
20.14624069430001
19.858331179776346


In [15]:
# NOTE: lin_model was refit inside every fold of the loop above, so these are the
# parameters learned on the LAST fold's training split only, not on the full data.
# The coefficients follow the column order of X.
print(lin_model.coef_)
print(lin_model.intercept_)

[145.27319955  18.57441577 -69.77403636]
-12.814729725824535


In [16]:
# A model with src_ref and src_hyp only. 
X = cos_df.drop(columns=['cos_sim_ref_hyp','avg-score'])
y = cos_df['avg-score']

In [17]:
# 10-fold cross-validation of the linear model on the current X: print the mean
# absolute error of each held-out split.
kf = KFold(n_splits=10)
for train_idx, val_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = lin_model.fit(X_train, y_train).predict(X_val)
    fold_mae = mean_absolute_error(y_val, y_pred)
    print(fold_mae)
# These parameters come from the final fold's fit only (the model is refit per fold).
print(lin_model.coef_)
print(lin_model.intercept_)

21.508638638402555
22.84826415538236
22.175787429356284
20.80840128906012
19.856675719286176
20.79620599379446
19.19960404972948
20.977044763624193
20.64521355746907
20.45475072664573
[44.87181799 43.18620258]
-6.6743358751459


Seems like it is about the same, which I guess does make sense, considering the correlations between the three are likely to be very strong.

---
Using a Neural Network

In [18]:
# Small multi-layer perceptron regressor: two hidden layers with 2 and 1 units.
# random_state fixes the weight initialisation and batch sampling so that the
# cross-validation figures below are reproducible after a kernel restart;
# without it every run reports different losses and MAEs.
MLP_model = MLPRegressor(
    hidden_layer_sizes=(2, 1),
    max_iter=250,
    learning_rate_init=0.05,
    random_state=42,
)

In [19]:
# 5-fold cross-validation of the MLP on the current X: refit on each training split,
# then report the final training loss and the held-out mean absolute error.
kf = KFold(n_splits=5)
for train_idx, val_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = MLP_model.fit(X_train, y_train).predict(X_val)
    print(f"Model loss: {MLP_model.loss_}")
    print(f"The MAE is {mean_absolute_error(y_val,y_pred)}")

Model loss: 324.16727987553236
The MAE is 22.240001356232497
Model loss: 341.7885849862733
The MAE is 21.966627384369616
Model loss: 352.6754747671317
The MAE is 20.917675726150467
Model loss: 355.47480496049286
The MAE is 20.646105247284837
Model loss: 349.7771303162559
The MAE is 21.03146420456577


In [20]:
## Further possible work: 

## Could try Word Movers Distance instead of Cosine Similarity?