Applying LASER embeddings to the corpus

In [3]:
# Imports
import numpy as np
from scipy import spatial
import numpy as np
import pandas as pd
import time

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import keras
from keras import models
from keras import layers


Using TensorFlow backend.


In [13]:
# Parameters

# Saved model handed to keras.models.load_model further down.
# None means "not configured yet" and must be replaced before the model is loaded.
laser_model = None

# Linear model selection (indices 0..3).
# NOTE(review): not referenced in the visible part of the notebook - confirm it is still needed.
lin_model_par = list(range(4))

Load Dataset

In [5]:
# Load the corpus into a DataFrame. The same "scores.csv" path is written back
# at the end of the notebook, so keep a copy of the raw file if it must survive a run.
df1 = pd.read_csv("scores.csv")

In [6]:
# Sanity check after loading: column names, non-null counts and dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22128 entries, 0 to 22127
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source       22128 non-null  object
 1   reference    22128 non-null  object
 2   translation  22128 non-null  object
dtypes: object(3)
memory usage: 518.8+ KB


In [7]:
# Preview the first rows of the corpus.
df1.head()

Unnamed: 0,source,reference,translation
0,The future and the destinies of the citizens o...,世界上每个国家公民的未来和命运日益联系在一起。,世界各国人民前途命运越来越紧密地联系在一起。
1,"After all that hard work, the finished result ...",经过那么多的努力，最终的结果现在已经可以揭晓了。,经过这么艰辛的工作，最终的结果现在才得以公布。
2,Author: researcher of Suning Institute of Fina...,作者：苏宁金融研究所研究员，财经专栏作家，财经评论员。,作者：苏宁金融研究院特约研究员，财经专栏作家，财经评论员。
3,“The Great Wall” tells the story of a Chinese ...,《长城》讲述了古代一支中国精锐部队在世界著名的中国长城上与怪物桃蒂英勇作战的故事。,《长城》讲述了在古代，一支中国精英部队为保卫人类，在举世闻名的长城上与怪兽饕餮进行生死决战的故事。
4,Our comrades from the Political Bureau should ...,政治局同志要学习历史，讲道理，不能混淆公、私利益，叫白黑，模糊义与利的界限，处理基于裙带关系...,中央政治局的同志都应该明史知理，不能颠倒了公私、混淆了是非、模糊了义利、放纵了亲情，要带头树...


In [9]:
# Verification
# Report rows whose source / reference / translation text is missing or blank.
# Rows are only reported, never dropped: the model predictions computed from the
# embedding arrays are assigned back to df1 by position later on, so removing rows
# here would break that row-for-row correspondence.
for column in ["source", "reference", "translation"]:
    print(column)
    text = df1[column]
    # isna() is required in addition to the emptiness test: a missing cell is NaN,
    # and a plain length comparison on NaN evaluates to False, hiding the row.
    is_blank = text.astype(str).str.strip().eq("")
    is_bad = (text.isna() | is_blank).to_numpy()
    # Positional indices (plain ints) so they can be fed straight to .iloc.
    bad_idx = np.flatnonzero(is_bad).tolist()
    if bad_idx:
        print(df1.iloc[bad_idx])
    print(f"Bad idx: {bad_idx}")

source
Bad idx: []
reference
Bad idx: []
translation
Bad idx: []


Loading LASER embeddings

In [10]:
# Read the three saved embedding matrices (source, reference, translation) in one pass.
source_arr, refer_arr, trans_arr = (
    np.load(path)
    for path in (
        "laser.source_embeds.npy",
        "laser.reference_embeds.npy",
        "laser.translation_embeds.npy",
    )
)

Cosine Similarity

In [11]:
def _rowwise_cosine_similarity(left, right):
    """Return the cosine similarity between matching rows of two 2-D arrays.

    Parameters
    ----------
    left, right : array-like of shape (n_rows, n_features)
        Row i of ``left`` is compared with row i of ``right``.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        ``dot(l_i, r_i) / (||l_i|| * ||r_i||)`` for every row. A row with zero
        norm on either side yields NaN.

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their shapes differ, which would mean the
        rows cannot be paired one-to-one.
    """
    # Compute in float64 so the result does not depend on the stored precision.
    left = np.asarray(left, dtype=np.float64)
    right = np.asarray(right, dtype=np.float64)
    if left.ndim != 2 or left.shape != right.shape:
        raise ValueError(
            f"Embedding arrays must be 2-D with identical shapes, got {left.shape} and {right.shape}"
        )
    # Row-wise dot product without materialising the full (n_rows x n_rows) product.
    dots = np.einsum("ij,ij->i", left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Zero-norm rows divide by zero; let them become NaN quietly instead of warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return dots / norms


# Cosine similarity for each pairing, kept as lists (one value per corpus row).
cos_similarity_ref_hyp = _rowwise_cosine_similarity(refer_arr, trans_arr).tolist()
cos_similarity_src_ref = _rowwise_cosine_similarity(source_arr, refer_arr).tolist()
cos_similarity_src_hyp = _rowwise_cosine_similarity(source_arr, trans_arr).tolist()


# Standardize and transform into series.
scaler = StandardScaler()


def _standardized_series(values, name):
    """Z-score ``values`` with the shared ``scaler`` and wrap them in a named Series.

    The scaler is re-fitted on every call, so each similarity column is
    standardized against its own mean and standard deviation.
    """
    # StandardScaler expects a 2-D (n_samples, n_features) input.
    column = np.asarray(values, dtype=np.float64).reshape(-1, 1)
    return pd.Series(scaler.fit_transform(column).ravel(), name=name)


cs_rh = _standardized_series(cos_similarity_ref_hyp, "cos_sim_ref_hyp")
cs_sr = _standardized_series(cos_similarity_src_ref, "cos_sim_src_ref")
cs_sh = _standardized_series(cos_similarity_src_hyp, "cos_sim_src_hyp")



In [12]:
# Place the three standardized (z-scored) cosine-similarity series side by side,
# one column per pairing, and display the resulting frame.
cos_df = pd.concat(objs=[cs_rh, cs_sr, cs_sh], axis="columns")
cos_df

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp
0,-0.329423,-0.941544,-0.468860
1,-0.176342,-0.773642,-0.255267
2,1.525802,0.300737,0.757324
3,0.841361,-0.018431,0.120806
4,0.366930,0.781779,0.603072
...,...,...,...
22123,-0.706582,1.194561,-0.378290
22124,-0.155772,0.916058,0.499427
22125,0.371621,0.370761,0.125973
22126,-1.950879,-0.959012,-2.042749


---
Scoring with a neural network applied directly to the embeddings.

In [11]:
# Loading the model.
# The Parameters cell initialises laser_model to None as a placeholder; fail early
# with an actionable message instead of an opaque error from inside Keras.
if laser_model is None:
    raise ValueError(
        "laser_model is not set: assign the saved Keras model to load in the Parameters cell."
    )
model = keras.models.load_model(laser_model)

In [12]:
# Print the loaded network's layers, output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

In [13]:
# Prepare the input layer.
# Each input row is the reference embedding followed by the translation embedding,
# joined along the feature axis.
full_arr_c = np.concatenate([refer_arr, trans_arr], axis=1)
# Work on an independent C-ordered copy so the concatenated array stays untouched.
X = np.copy(full_arr_c, order="C")

In [14]:
# Score every row with the loaded network and store the result as the "metric" column.
# The model summary shows a single-unit final layer, so predict() yields one value per
# row in a 2-D (n_rows, 1) array; flatten it explicitly rather than relying on pandas
# to accept a 2-D array as a column.
predictions = np.asarray(model.predict(X)).reshape(-1)
# The scores are matched to df1 purely by position, so the counts must agree.
if predictions.shape[0] != len(df1):
    raise ValueError(
        f"Model produced {predictions.shape[0]} scores for {len(df1)} corpus rows"
    )
df1["metric"] = predictions

In [16]:
# Write the frame (now including the "metric" column) to CSV without the index.
# NOTE(review): "scores.csv" is also the file df1 was read from, so this overwrites
# the notebook's own input - confirm that is intended before re-running.
df1.to_csv("scores.csv",index=False)