Implementation of LASER to the corpus

In [1]:
# Imports
import numpy as np
from scipy import spatial
import pandas as pd
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import keras
from keras import models
from keras import layers


Using TensorFlow backend.


Load Dataset

In [2]:
df1 = pd.read_csv("scores.csv")

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21704 entries, 0 to 21703
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       21704 non-null  object 
 1   reference    21704 non-null  object 
 2   translation  21704 non-null  object 
 3   z-score      21704 non-null  float64
 4   avg-score    21704 non-null  float64
 5   annotators   21704 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 1017.5+ KB


In [4]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2


Comparison of LASER embeddings

In [5]:
source_arr = np.load("laser.source_embeds.npy")
refer_arr = np.load("laser.reference_embeds.npy")
trans_arr = np.load("laser.translation_embeds.npy")

Cosine Similarity

In [6]:
cos_similarity_ref_hyp = []
cos_similarity_src_ref = []
cos_similarity_src_hyp = []

# for each of the cos similarity, put them into lists.
for i in range(refer_arr.shape[0]):
    cos_similarity_ref_hyp.append((spatial.distance.cosine(refer_arr[i],trans_arr[i])*-1)+1)
    cos_similarity_src_ref.append((spatial.distance.cosine(source_arr[i],refer_arr[i])*-1)+1)
    cos_similarity_src_hyp.append((spatial.distance.cosine(source_arr[i],trans_arr[i])*-1)+1)


# Standardize and transform into series.
scaler = StandardScaler()
cs_rh = pd.Series(scaler.fit_transform(np.array(cos_similarity_ref_hyp).reshape(-1,1)).flatten(),name="cos_sim_ref_hyp")
cs_sr = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_ref).reshape(-1,1)).flatten(),name="cos_sim_src_ref")
cs_sh = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_hyp).reshape(-1,1)).flatten(),name="cos_sim_src_hyp")



In [7]:
cos_df = pd.concat([cs_rh,cs_sr,cs_sh,df1.loc[:,"z-score"]],axis=1)
cos_df

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
0,-1.250038,-0.422368,-1.306007,-0.345024
1,1.048153,0.533522,0.774393,0.903800
2,-0.678580,0.130823,0.188789,0.700503
3,-0.987798,0.273078,-0.873894,-1.256572
4,-0.938938,0.782586,-1.083445,0.293909
...,...,...,...,...
21699,1.080071,1.100543,1.362532,1.246459
21700,1.183931,-0.374398,0.015242,0.792878
21701,0.670196,0.637920,0.668873,0.597068
21702,0.720476,-0.220658,0.773841,-0.305719


In [8]:
# Pearson
print("Pairwise Pearson")
cos_df.corr()

Pairwise Pearson


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.618973,0.79733,0.316523
cos_sim_src_ref,0.618973,1.0,0.692192,0.182758
cos_sim_src_hyp,0.79733,0.692192,1.0,0.195875
z-score,0.316523,0.182758,0.195875,1.0


In [9]:
# Kendall
print("Pairwise Kendall")
cos_df.corr(method="kendall")

Pairwise Kendall


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.435721,0.608597,0.223125
cos_sim_src_ref,0.435721,1.0,0.540446,0.118914
cos_sim_src_hyp,0.608597,0.540446,1.0,0.130874
z-score,0.223125,0.118914,0.130874,1.0


In [10]:
# Spearman
print("Pairwise Spearman")
cos_df.corr(method="spearman")

Pairwise Spearman


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.604819,0.787616,0.326655
cos_sim_src_ref,0.604819,1.0,0.723731,0.17615
cos_sim_src_hyp,0.787616,0.723731,1.0,0.193575
z-score,0.326655,0.17615,0.193575,1.0


In [11]:
# Mean Absolute Deviation of each of these.
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.8128335093539766
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.8796000501264563
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.8861561012422285


---
Linear Regression <br>to try and predict avg score based on the cos similarity 

In [12]:
X = cos_df.drop(columns='z-score')
y = cos_df['z-score']

In [13]:
lin_model = LinearRegression()

In [14]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))

0.6527408819767678
0.6757954960873159
0.6668912096850256
0.6452570374362728
0.6109181224425928
0.6440656783167589
0.5899620045423257
0.6368125210787335
0.6355302938729582
0.6398936072186637


In [15]:
print(lin_model.coef_)
print(lin_model.intercept_)

[ 0.37602588  0.03522878 -0.16227454]
-0.004071303295636321


In [16]:
# A model with src_ref and src_hyp only. 
X = cos_df.drop(columns=['cos_sim_ref_hyp','z-score'])
y = cos_df['z-score']

In [17]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))
print(lin_model.coef_)
print(lin_model.intercept_)

0.6872387987829903
0.7074090038492186
0.7017583072988852
0.6806976681436335
0.648423808347745
0.6688137843687497
0.6230586532258409
0.6679940901556314
0.6562495683670608
0.6630161471191572
[0.08666064 0.10156855]
-0.007630570925880059


Seems like it is about the same, which I guess does make sense, consdering the correlations between the three are likely to be very strong.

In [18]:
y_pred

array([-0.11235052, -0.2473391 ,  0.15787498, ...,  0.11558839,
        0.05184496,  0.16605814])

---
Using a Neural Network

In [19]:
MLP_model = MLPRegressor(hidden_layer_sizes=(2,2),max_iter=250,learning_rate_init=0.0015,solver="lbfgs")

In [20]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [21]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
kf = KFold(5)
for train_index, val_index in kf.split(X_train):
    kf_X_train, X_val = X_train[train_index], X_train[val_index]
    kf_y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]    
    
    MLP_model.fit(kf_X_train,kf_y_train)
    y_val_pred = MLP_model.predict(X_val)
    print(f"Model loss: {MLP_model.loss_}")
    print(f"The MAE is {mean_absolute_error(y_val,y_val_pred)}")

Model loss: 0.34808729565010144
The MAE is 0.6624872628836372
Model loss: 0.3617487613701269
The MAE is 0.6927428554099251
Model loss: 0.34688982189492
The MAE is 0.6653113892548552
Model loss: 0.34917679403439633
The MAE is 0.6545163184925111
Model loss: 0.34433538795235197
The MAE is 0.670170779046891


---
Using Neural Networks, directly on the embeddings.

In [23]:
# An input has to be of shape (3,1024), because we have 3 embedded vectors of size 1024.
# combined array
#full_arr = np.dstack((source_arr,refer_arr,trans_arr))

# Option 2, make it size (1,2048), by concatenating the arrays. This is what we are using now, according to teacher's indications.
full_arr_c = np.concatenate((refer_arr,trans_arr),axis=1)


In [32]:
X = full_arr_c.copy()
y = df1["z-score"].to_numpy()

# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=43)

In [33]:
X_train.shape

(19533, 2048)

In [34]:
model = models.Sequential()

model.add(layers.Dense(1024,activation="relu",input_dim=X_train.shape[1]))
model.add(layers.Dense(512,activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(128,activation="relu"))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(64,activation="relu"))

model.add(layers.Dense(1,activation="tanh"))
model.compile(optimizer=keras.optimizers.Adam(lr=0.002),loss="mse",metrics=["mae"])

In [35]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_7 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                

In [59]:
# Fit. Batch Size < > Learning Rate

model.fit(X_train,y_train,epochs=75,batch_size=64,verbose=0)

<keras.callbacks.callbacks.History at 0x20200146a20>

In [60]:
# Evaluation and comparison.
test_mse,test_mae = model.evaluate(X_test,y_test)
print(f"Test MSE: {test_mse:.4f}.\nTest MAE: {test_mae:.4f}.")
print(f"Pearson correlation between y_val_predicted and actual y_val: {np.corrcoef(model.predict(X_test).flatten(),y_test)[0][1]:.4f}")

Test MSE: 0.7975.
Test MAE: 0.6632.
Pearson correlation between y_val_predicted and actual y_val: 0.3114


In [61]:
# Save Model.
model.save("de_en_laser_model_MAE_{:.4f}_time{}.h5".format(val_mae,datetime.now().strftime('%d_%m_%H_%M')
))