Implementation of LASER to the corpus

In [1]:
# Imports
import numpy as np
from scipy import spatial
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from keras import models
from keras import layers
import keras

Using TensorFlow backend.


Load Dataset

In [2]:
df1 = pd.read_csv("scores.csv")

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6748 entries, 0 to 6747
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       6748 non-null   object 
 1   reference    6748 non-null   object 
 2   translation  6748 non-null   object 
 3   z-score      6748 non-null   float64
 4   avg-score    6748 non-null   float64
 5   annotators   6748 non-null   int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 316.4+ KB


In [4]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"You can turn yourself into a pineapple, a dog ...","Voit muuttaa itsesi ananasta, koirasta tai Roy...","Voit muuttaa itsesi ananakseksi, koiraksi tai ...",-0.286195,34.2,5
1,Also shot were three men: two 29-year-olds and...,Myös ammuttiin kolme miestä: kaksi 29-vuotiait...,Myös kolmea miestä ammuttiin: kahta 29-vuotias...,0.547076,58.4,5
2,The information is stored at the cash register...,Tiedot tallennetaan kassakoneisiin joka tapauk...,Tiedot kuitenkin tallentuvat kassoilla joka ta...,1.122476,74.6,5
3,Xinhua says that there were traces of hydrochl...,"Xinhua kertoo, että Xinyin näytteestä oli sunn...","Xinhua kertoo, että Xinyin sunnuntaina antamas...",0.383095,53.6,5
4,"MacDonald, who was brought on board CBC's comm...",Voitaisiin kuulla CBD: n kommenttitiimin toimi...,"MacDonaldin, joka tuli CBC:n selostajatiimiin ...",-0.493065,32.25,4


Comparison of LASER embeddings

In [5]:
source_arr = np.load("laser.source_embeds.npy")
refer_arr = np.load("laser.reference_embeds.npy")
trans_arr = np.load("laser.translation_embeds.npy")

Cosine Similarity

In [6]:
cos_similarity_ref_hyp = []
cos_similarity_src_ref = []
cos_similarity_src_hyp = []

# for each of the cos similarity, put them into lists.
for i in range(refer_arr.shape[0]):
    cos_similarity_ref_hyp.append((spatial.distance.cosine(refer_arr[i],trans_arr[i])*-1)+1)
    cos_similarity_src_ref.append((spatial.distance.cosine(source_arr[i],refer_arr[i])*-1)+1)
    cos_similarity_src_hyp.append((spatial.distance.cosine(source_arr[i],trans_arr[i])*-1)+1)


# Standardize and transform into series.
scaler = StandardScaler()
cs_rh = pd.Series(scaler.fit_transform(np.array(cos_similarity_ref_hyp).reshape(-1,1)).flatten(),name="cos_sim_ref_hyp")
cs_sr = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_ref).reshape(-1,1)).flatten(),name="cos_sim_src_ref")
cs_sh = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_hyp).reshape(-1,1)).flatten(),name="cos_sim_src_hyp")



In [7]:
cos_df = pd.concat([cs_rh,cs_sr,cs_sh,df1.loc[:,"z-score"]],axis=1)
cos_df

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
0,0.477108,0.201864,0.526518,-0.286195
1,1.095012,0.719836,0.963557,0.547076
2,0.163880,0.209093,-1.019948,1.122476
3,0.608808,-0.279303,-0.384327,0.383095
4,-0.166370,0.129337,0.289500,-0.493065
...,...,...,...,...
6743,0.518537,0.987485,0.842505,-0.293103
6744,0.528176,0.721836,0.667015,-0.548929
6745,0.078825,-0.725582,-1.813669,0.463936
6746,0.372191,0.558403,0.875841,-0.482206


In [8]:
# Pearson
print("Pairwise Pearson")
cos_df.corr()

Pairwise Pearson


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.845841,0.424134,0.580044
cos_sim_src_ref,0.845841,1.0,0.411307,0.473789
cos_sim_src_hyp,0.424134,0.411307,1.0,0.120887
z-score,0.580044,0.473789,0.120887,1.0


In [9]:
# Kendall
print("Pairwise Kendall")
cos_df.corr(method="kendall")

Pairwise Kendall


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.496232,0.426147,0.396893
cos_sim_src_ref,0.496232,1.0,0.415437,0.285051
cos_sim_src_hyp,0.426147,0.415437,1.0,0.090363
z-score,0.396893,0.285051,0.090363,1.0


In [10]:
# Spearman
print("Pairwise Spearman")
cos_df.corr(method="spearman")

Pairwise Spearman


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.661866,0.576545,0.555323
cos_sim_src_ref,0.661866,1.0,0.569308,0.407616
cos_sim_src_hyp,0.576545,0.569308,1.0,0.13353
z-score,0.555323,0.407616,0.13353,1.0


---
Linear Regression <br>to try and predict avg score based on the cos similarity 

In [11]:
X = cos_df.drop(columns='z-score')
y = cos_df['z-score']

In [12]:
lin_model = LinearRegression()

In [13]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))

0.6468149724838524
0.5216359161669204
0.5891346545590176
0.5811340058834468
0.5058610726747285
0.6333172450476015
0.6290470524901891
0.6102779526848207
0.604387008203543
0.5502845073983063


In [14]:
print(lin_model.coef_)
print(lin_model.intercept_)

[ 0.6033789  -0.03223887 -0.12863824]
-0.12910357677916118


In [15]:
# A model with src_ref and src_hyp only. 
X = cos_df.drop(columns=['cos_sim_ref_hyp','z-score'])
y = cos_df['z-score']

In [16]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))
print(lin_model.coef_)
print(lin_model.intercept_)

0.7050738881793724
0.555527514567338
0.6281727759117081
0.6279054469972988
0.5632306713185548
0.6283650135244989
0.6881284006574842
0.6807746537623373
0.6675518484751372
0.5884071146266736
[ 0.46567599 -0.08263206]
-0.12420079734183924


In [17]:
y_pred

array([-1.35867745e-01, -7.49542407e-01,  5.75584852e-02, -6.14337038e-01,
        1.99056500e-01,  1.49862276e-01, -1.34404671e-01, -2.18168492e-01,
       -2.74559980e-02, -1.37366406e+00, -5.86309862e-01,  5.38773912e-02,
        3.47617470e-02, -4.16060472e-01, -1.73565166e-02, -5.23431664e-02,
        7.05609163e-02,  2.15281024e-01,  5.12335711e-03,  2.33288897e-02,
       -3.16280503e-01, -1.40422029e-01,  1.42377211e-01,  1.32901871e-01,
       -1.26015190e-01,  2.22110656e-01, -3.66982295e-01,  1.02114820e-01,
        6.25822506e-02, -5.92147077e-01,  2.82625365e-01,  1.99466035e-01,
       -5.72518350e-02,  2.18319290e-01, -1.37853419e-01, -1.21254384e-01,
        1.05134856e-01,  2.03434995e-02,  6.78040110e-02, -2.65909374e-01,
       -1.99589275e-01, -1.75841720e-01,  1.88132933e-01,  6.00034354e-02,
       -5.18864424e-01,  1.08209975e-01,  1.27406164e-01, -2.73179516e-02,
        1.00146652e-01, -3.25212911e-01,  1.94734530e-01, -3.43042808e-01,
        2.42561631e-01, -

---
Using a Neural Network

In [18]:
MLP_model = MLPRegressor(hidden_layer_sizes=(2,2),max_iter=250,learning_rate_init=0.0015,solver="lbfgs")

In [19]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [20]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
kf = KFold(5)
for train_index, val_index in kf.split(X_train):
    kf_X_train, X_val = X_train[train_index], X_train[val_index]
    kf_y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]    
    
    MLP_model.fit(kf_X_train,kf_y_train)
    y_val_pred = MLP_model.predict(X_val)
    print(f"Model loss: {MLP_model.loss_}")
    print(f"The MAE is {mean_absolute_error(y_val,y_val_pred)}")

Model loss: 0.39336673252947535
The MAE is 0.7237142698424986
Model loss: 0.3917661392010335
The MAE is 0.7369257450656819
Model loss: 0.29476381584394856
The MAE is 0.6238666923183857
Model loss: 0.28847260129958563
The MAE is 0.5895409414595201
Model loss: 0.2881154124796711
The MAE is 0.6018808064580092


---
Using Neural Networks, directly on the embeddings.

In [22]:
## Maybe don't run this if you care about time.

In [23]:
# An input has to be of shape (3,1024), because we have 3 embedded vectors of size 1024.
# combined array
full_arr = np.dstack((source_arr,refer_arr,trans_arr))

# Option 2, make it size (1,2048), by concatenating the arrays. This is what we are using now, according to teacher's indications.
full_arr_c = np.concatenate((refer_arr,trans_arr),axis=1)


In [24]:
X = full_arr_c.copy()
y = df1["z-score"].to_numpy()

# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [25]:
X_train.shape

(6073, 2048)

In [26]:
model = models.Sequential()

model.add(layers.Dense(1024,activation="relu",input_dim=X_train.shape[1]))
model.add(layers.Dense(512,activation="relu"))
model.add(layers.Dense(128,activation="relu"))
model.add(layers.Dense(64,activation="relu"))
model.add(layers.Dense(1,activation="tanh"))
model.compile(optimizer=keras.optimizers.Adam(lr=0.001),loss="mse",metrics=["mae"])

In [27]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 2,696,961
Trainable params: 2,696,961
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Fit. Batch Size < > Learning Rate

model.fit(X_train,y_train,epochs=10,batch_size=128,verbose=1)
val_mse,val_mae = model.evaluate(X_test,y_test)
print(f"This is val MSE: {val_mse}.\nThis the MAE:{val_mae}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
This is val MSE: 0.6698645572309141.
This the MAE:0.6685957908630371


In [29]:
test = model.predict(X_test)

In [30]:
test

array([[ 1.00085482e-01],
       [ 6.35345221e-01],
       [ 8.56472015e-01],
       [-9.99899268e-01],
       [ 9.64676499e-01],
       [-9.81116176e-01],
       [ 3.57894421e-01],
       [-8.91184866e-01],
       [ 7.99853444e-01],
       [-6.82983771e-02],
       [ 8.74893606e-01],
       [ 7.79064953e-01],
       [ 7.53711045e-01],
       [-9.09449518e-01],
       [-9.66547132e-01],
       [ 1.52477473e-01],
       [ 4.43318069e-01],
       [-8.26700151e-01],
       [-6.18386984e-01],
       [-9.89880800e-01],
       [ 9.05480087e-01],
       [-9.16115463e-01],
       [ 8.50535572e-01],
       [ 8.95236015e-01],
       [ 8.33286405e-01],
       [ 4.02354032e-01],
       [ 1.58951029e-01],
       [ 6.14271581e-01],
       [ 9.72830117e-01],
       [-1.95220366e-01],
       [ 8.15298498e-01],
       [-4.55351561e-01],
       [-9.33613539e-01],
       [-6.03345096e-01],
       [-2.17038244e-01],
       [ 1.53425723e-01],
       [ 3.22465658e-01],
       [ 9.26541388e-01],
       [ 4.8

In [31]:
df1["z-score"].corr(pd.Series(model.predict(X).flatten()))
# The results are much better, and indicative of a good metric.

0.797005621303713

In [None]:
model.save("en-fi_laser_model")