### Implementation of LASER to the corpus
### EN-ZH

In [1]:
# Imports
import numpy as np
from scipy import spatial
import pandas as pd
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import keras
from keras import models
from keras import layers


Using TensorFlow backend.


Load Dataset

In [2]:
df1 = pd.read_csv("scores.csv")

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10221 entries, 0 to 10220
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       10221 non-null  object 
 1   reference    10221 non-null  object 
 2   translation  10221 non-null  object 
 3   z-score      10221 non-null  float64
 4   avg-score    10221 non-null  float64
 5   annotators   10221 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 479.2+ KB


In [4]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"""In the GISS model's simulation, Venus' slow s...",GSIS的科学家AnthonyDelGenio在新闻稿中解释说：“在GISS模型的模拟模型中...,戈达德太空研究所科学家安东尼·德尔·杰尼奥在新闻发布会上解释说：“在戈达德太空研究所的模型模...,-1.171867,50.0,1
1,Ai Yanhan of China in the Women's 4 x 200m Fre...,中国在英国女性4x200mFreestreyWTE中的最后被称为：“中国14岁的孩子从球下降...,参加女子4x200米自由泳接力赛决赛的中国小将艾衍含被这样描述：“那名14岁的中国小姑娘犯了...,-2.255403,26.5,2
2,"Then came 2012, when nothing much went right f...",然后来到2012年，当她和她的队友们没有什么好处。,2012年，她和她的队友都不被看好。,-2.508996,21.0,1
3,"Since last year, Guodian Group has exported a ...",自去年以来，GoudianGroup从南非通过南非港口出口了163套风力发电项目。,自去年以来，国电集团共计有163套风电项目陆续从连云港港出口南非。,-2.41678,23.0,1
4,"Some alleged that the Kempinski hotel simply ""...","一些人指称，Kempinski旅馆只是""被捕""，以满足阿拉伯客户的要求。",有人认为凯宾斯基酒店简直是为了满足阿拉伯客户的要求而“卑躬屈膝”。,-1.489676,45.0,7


Comparison of LASER embeddings

In [5]:
source_arr = np.load("laser.source_embeds.npy")
refer_arr = np.load("laser.reference_embeds.npy")
trans_arr = np.load("laser.translation_embeds.npy")

Cosine Similarity

In [6]:
cos_similarity_ref_hyp = []
cos_similarity_src_ref = []
cos_similarity_src_hyp = []

# for each of the cos similarity, put them into lists.
for i in range(refer_arr.shape[0]):
    cos_similarity_ref_hyp.append((spatial.distance.cosine(refer_arr[i],trans_arr[i])*-1)+1)
    cos_similarity_src_ref.append((spatial.distance.cosine(source_arr[i],refer_arr[i])*-1)+1)
    cos_similarity_src_hyp.append((spatial.distance.cosine(source_arr[i],trans_arr[i])*-1)+1)


# Standardize and transform into series.
scaler = StandardScaler()
cs_rh = pd.Series(scaler.fit_transform(np.array(cos_similarity_ref_hyp).reshape(-1,1)).flatten(),name="cos_sim_ref_hyp")
cs_sr = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_ref).reshape(-1,1)).flatten(),name="cos_sim_src_ref")
cs_sh = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_hyp).reshape(-1,1)).flatten(),name="cos_sim_src_hyp")



In [7]:
cos_df = pd.concat([cs_rh,cs_sr,cs_sh,df1.loc[:,"z-score"]],axis=1)
cos_df.head()

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
0,0.506499,-0.22607,0.134138,-1.171867
1,-0.00075,0.306927,0.277149,-2.255403
2,0.283278,-0.423836,-1.636946,-2.508996
3,-0.167969,-0.195989,0.802346,-2.41678
4,-0.649535,0.919665,-0.351194,-1.489676


In [36]:
# Pearson
print("Pairwise Pearson")
cos_df.corr()

Pairwise Pearson


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.549882,0.678549,0.409383
cos_sim_src_ref,0.549882,1.0,0.64472,0.170944
cos_sim_src_hyp,0.678549,0.64472,1.0,0.224158
z-score,0.409383,0.170944,0.224158,1.0


In [9]:
# Kendall
print("Pairwise Kendall")
cos_df.corr(method="kendall")

Pairwise Kendall


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.303052,0.457667,0.281366
cos_sim_src_ref,0.303052,1.0,0.470356,0.100637
cos_sim_src_hyp,0.457667,0.470356,1.0,0.124009
z-score,0.281366,0.100637,0.124009,1.0


In [10]:
# Mean Absolute Deviation of each of these.
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.7785663920526503
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.9397270535360308
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.9254213676156057


---
Linear Regression <br>to try and predict avg score based on the cos similarity 

In [11]:
X = cos_df.drop(columns='z-score')
y = cos_df['z-score']

In [12]:
lin_model = LinearRegression()

In [13]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    lin_model_3_mae = mean_absolute_error(y_val,y_pred)
    print(lin_model_3_mae)

0.7318015561163422
0.7078267577059839
0.6823247554210009
0.7415446409532402
0.6582688283043029
0.6841192224089204
0.6878611778508932
0.6571855823984333
0.6821971881379517
0.6358732738906558


In [14]:
lin_model_3_par = [lin_model.intercept_]
lin_model_3_par.extend([coef for coef in lin_model.coef_])
print(lin_model_3_par)

[-0.07425558325657274, 0.44863739230268007, -0.037769510766578744, -0.07451250543866782]


In [15]:
# Check the corr of this model. 
lin_3_pred = cos_df.apply(lambda x: lin_model_3_par[0] + lin_model_3_par[1]*x[0] + lin_model_3_par[2]*x[1] + lin_model_3_par[3]*x[2],axis=1)
lin_3_corr_p = lin_3_pred.corr(df1["z-score"], method="pearson")
lin_3_corr_k = lin_3_pred.corr(df1["z-score"], method="kendall")
print(str(lin_3_corr_p)+" pearson")
print(str(lin_3_corr_k)+" kendall")

0.4173818446793282 pearson
0.2894802208478055 kendall


In [16]:
# A model with src_ref and src_hyp only. 
X = cos_df.drop(columns=['cos_sim_ref_hyp','z-score'])
y = cos_df['z-score']

In [17]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))

0.8034793625698244
0.7561748589369753
0.7227276933222807
0.7949766138367033
0.7106186214155411
0.7499394138269524
0.7318171319851264
0.7405564579943311
0.7523008423723003
0.7360701835835555


In [18]:
y_pred

array([ 0.07185793, -0.11050848, -0.21701867, ...,  0.19712859,
       -0.54550928,  0.18215215])

In [19]:
lin_model_2_par = [lin_model.intercept_]
lin_model_2_par.extend([coef for coef in lin_model.coef_])
print(lin_model_2_par)

[-0.08491266375244289, 0.04634152696063723, 0.17725944328727994]


In [20]:
# Check the corr of this model. 
lin_2_pred = cos_df.apply(lambda x: lin_model_2_par[0] + lin_model_2_par[1]*x[0] + lin_model_2_par[2]*x[1],axis=1)
lin_2_corr_p = lin_2_pred.corr(df1["z-score"])
lin_2_corr_k = lin_2_pred.corr(df1["z-score"],method="kendall")
print(str(lin_2_corr_p)+" pearson")
print(str(lin_2_corr_p)+" kendall")

0.23872130278013967 pearson
0.23872130278013967 kendall


---
Using a Neural Network

In [21]:
MLP_model = MLPRegressor(hidden_layer_sizes=(2,2),max_iter=250,learning_rate_init=0.0015,solver="lbfgs")

In [22]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [23]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
kf = KFold(5)
for train_index, val_index in kf.split(X_train):
    kf_X_train, X_val = X_train[train_index], X_train[val_index]
    kf_y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]    
    
    MLP_model.fit(kf_X_train,kf_y_train)
    y_val_pred = MLP_model.predict(X_val)
    print(f"Model loss: {MLP_model.loss_}")
    print(f"The MAE is {mean_absolute_error(y_val,y_val_pred)}")

Model loss: 0.40855744359855206
The MAE is 0.7412479970650153
Model loss: 0.4145916720302242
The MAE is 0.7288228802303076
Model loss: 0.4098581552866674
The MAE is 0.7533073456775995
Model loss: 0.4020278203609589
The MAE is 0.7490238051296397
Model loss: 0.4033525914424843
The MAE is 0.7418247295362148


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


---
Using Neural Networks, directly on the embeddings.

In [25]:
# An input has to be of shape (3,1024), because we have 3 embedded vectors of size 1024.
# combined array
#full_arr = np.dstack((source_arr,refer_arr,trans_arr))

# Option 2, make it size (1,2048), by concatenating the arrays. This is what we are using now, according to teacher's indications.
full_arr_c = np.concatenate((refer_arr,trans_arr),axis=1)


In [26]:
X = full_arr_c.copy()
y = df1["z-score"].to_numpy()

# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [27]:
X_train.shape

(9198, 2048)

In [28]:
model = models.Sequential()

model.add(layers.Dense(1024,activation="relu",input_dim=X_train.shape[1]))
model.add(layers.Dense(512,activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(128,activation="relu"))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(64,activation="relu"))

model.add(layers.Dense(1,activation="tanh"))
model.compile(optimizer=keras.optimizers.Adam(lr=0.002),loss="mse",metrics=["mae"])

In [29]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

In [30]:
# Fit. Batch Size < > Learning Rate

model.fit(X_train,y_train,epochs=50,batch_size=64,verbose=0)

<keras.callbacks.callbacks.History at 0x1d7b96aec88>

In [31]:
# Evaluation and comparison.
test_mse,test_mae = model.evaluate(X_test,y_test)
test_corr = np.corrcoef(model.predict(X_test).flatten(),y_test)[0][1]
print(f"Test MSE: {test_mse:.4f}.\nTest MAE: {test_mae:.4f}.")
print(f"Pearson correlation between y_val_predicted and actual y_val: {test_corr:.4f}")

Test MSE: 0.8380.
Test MAE: 0.6916.
Pearson correlation between y_val_predicted and actual y_val: 0.3862


In [32]:
# For reference, pt.2
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.7785663920526503
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.9397270535360308
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.9254213676156057


In [38]:
print("Pearson correlation of cos_sim")
print(cos_df.corr().iloc[:,3])

Pearson correlation of cos_sim
cos_sim_ref_hyp    0.409383
cos_sim_src_ref    0.170944
cos_sim_src_hyp    0.224158
z-score            1.000000
Name: z-score, dtype: float64


In [39]:
print("Kendall correlation of cos_sim")
print(cos_df.corr(method="kendall").iloc[:,3])

Kendall correlation of cos_sim
cos_sim_ref_hyp    0.281366
cos_sim_src_ref    0.100637
cos_sim_src_hyp    0.124009
z-score            1.000000
Name: z-score, dtype: float64


In [33]:
# Corr of linear models on cos_sim
print(f"Linear Model with the 3 cos_similarities, pearson: {lin_3_corr_p}")
print(f"Linear Model with the 3 cos_similarities, kendall: {lin_3_corr_k}")
print(f"Linear Model with the 2 (src_ref and src_hyp) cos_similarities, pearson: {lin_2_corr_p}")
print(f"Linear Model with the 2 (src_ref and src_hyp) cos_similarities, kendall: {lin_2_corr_k}")

Linear Model with the 3 cos_similarities, pearson: 0.4173818446793282
Linear Model with the 3 cos_similarities, kendall: 0.2894802208478055
Linear Model with the 2 (src_ref and src_hyp) cos_similarities, pearson: 0.23872130278013967
Linear Model with the 2 (src_ref and src_hyp) cos_similarities, kendall: 0.15022675587311157


In [34]:
# Save Model.
model.save("en_zh_laser_model__testcorr_{:.4f}_MAE_{:.4f}_time{}.h5".format(test_corr,test_mae,datetime.now().strftime('%d_%m_%H_%M')
))