Implementation of LASER to the corpus

In [3]:
# Imports
import numpy as np
from scipy import spatial
import numpy as np
import pandas as pd
import time

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import keras
from keras import models
from keras import layers


Using TensorFlow backend.


In [4]:
# Parameters

# laser model to use
laser_model = "de_en_laser_model__testcorr_0.3574_MAE_0.6445_time28_05_14_10.h5"

# lin model to use
lin_model_par = [0.0034786935062026455, 0.38461416565797896, 0.03342224180059689, -0.13847138238775347]

Load Dataset

In [5]:
df1 = pd.read_csv("scores.csv")

In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13157 entries, 0 to 13156
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source       13157 non-null  object
 1   reference    13156 non-null  object
 2   translation  13157 non-null  object
dtypes: object(3)
memory usage: 308.5+ KB


In [7]:
df1.head()

Unnamed: 0,source,reference,translation
0,Через полчаса обуглившийся клубень достают и п...,"After half an hour, the charred tuber is taken...","After half-an-hour, the charred tuber is retri..."
1,"Здесь никто не думает отменять смертную казнь,...","Here, no one thinks to abolish the death penal...","Here, no one is concerned with abolishing the ..."
2,"Собеседники ""Известий"" в ОНФ отмечают, что док...","The interlocutors of"" Izvestiya ""in the onf no...",Izvestia’s sources in the ONF note that the re...
3,На древней Венере могли существовать океаны.,On the ancient Venus could exist in the oceans.,Oceans could have existed on ancient Venus.
4,До этого момента убийства оставались лишь исто...,"Up to this point, the murders were just a stor...","Up until this point, the murders have remained..."


In [8]:
# Verification
# Check for empty or sparse reference / translation, and drop them.
for column in ["source","reference","translation"]:
    print(column)
    bad_idx = [idx for idx in np.where(df1[column].str.len()<=2)[0]]
    if bad_idx != []:
        print(df1.iloc[bad_idx])
    print(f"Bad idx: {bad_idx}")
#    df1 = df1.drop(index=bad_idx)

source
Bad idx: []
reference
     source reference translation
7193   Мда.         .        Yep.
Bad idx: [7193]
translation
Bad idx: []


Loading LASER embeddings

In [9]:
source_arr = np.load("laser.source_embeds.npy")
refer_arr = np.load("laser.reference_embeds.npy")
trans_arr = np.load("laser.translation_embeds.npy")

Cosine Similarity

In [10]:
cos_similarity_ref_hyp = []
cos_similarity_src_ref = []
cos_similarity_src_hyp = []

# for each of the cos similarity, put them into lists.
for i in range(refer_arr.shape[0]):
    cos_similarity_ref_hyp.append((spatial.distance.cosine(refer_arr[i],trans_arr[i])*-1)+1)
    cos_similarity_src_ref.append((spatial.distance.cosine(source_arr[i],refer_arr[i])*-1)+1)
    cos_similarity_src_hyp.append((spatial.distance.cosine(source_arr[i],trans_arr[i])*-1)+1)


# Standardize and transform into series.
scaler = StandardScaler()
cs_rh = pd.Series(scaler.fit_transform(np.array(cos_similarity_ref_hyp).reshape(-1,1)).flatten(),name="cos_sim_ref_hyp")
cs_sr = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_ref).reshape(-1,1)).flatten(),name="cos_sim_src_ref")
cs_sh = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_hyp).reshape(-1,1)).flatten(),name="cos_sim_src_hyp")



In [11]:
# Z scores of Cosine Similarity.
cos_df = pd.concat([cs_rh,cs_sr,cs_sh],axis=1)
cos_df

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp
0,0.275654,-0.530827,-1.112059
1,1.199890,0.684188,0.874728
2,0.177493,0.858142,0.572896
3,-0.650409,0.231983,-0.054596
4,-0.436737,-0.194828,-0.396830
...,...,...,...
13152,1.297018,-0.182575,0.373908
13153,0.238635,0.308089,0.003420
13154,0.518615,0.353747,0.573046
13155,0.354592,0.143956,0.988300


---
Using Neural Networks, directly on the embeddings.

In [11]:
# Loading the model.
model = keras.models.load_model(laser_model)

In [12]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

In [13]:
# Prepare the input layer.
full_arr_c = np.concatenate((refer_arr,trans_arr),axis=1)
X = full_arr_c.copy()

In [None]:
# If using the linear model,
# cos_df.apply(lambda x: lin_model_3_par[0] + lin_model_3_par[1]*x[0] + lin_model_3_par[2]*x[1] + lin_model_3_par[3]*x[2],axis=1)

In [12]:
df1["metric"] = cos_df.apply(lambda x: lin_model_par[0] + lin_model_par[1]*x[0] + lin_model_par[2]*x[1] + lin_model_par[3]*x[2],axis=1)

In [13]:
df1.to_csv("scores.csv",index=False)