### Implementation of LASER to the corpus
### ZH-EN

In [1]:
# Imports
import numpy as np
from scipy import spatial
import pandas as pd
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import keras
from keras import models
from keras import layers


Using TensorFlow backend.


Load Dataset

In [2]:
df1 = pd.read_csv("scores.csv")

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26419 entries, 0 to 26418
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       26419 non-null  object 
 1   reference    26419 non-null  object 
 2   translation  26419 non-null  object 
 3   z-score      26419 non-null  float64
 4   avg-score    26419 non-null  float64
 5   annotators   26419 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 1.2+ MB


In [4]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,他性格活泼，这对英国赛马来说是好事，但是除此之外，他还是一位不可思议的骑师。,"His character is good for the British horse, b...",He's a lively character which is good for Brit...,0.625559,92.75,4
1,近日刚搬至旧金山的一位28岁厨师本周被发现死于当地一家商场的楼梯间。,"A 28 chef, who has just moved to San Francisco...",A 28-year-old chef who had recently moved to S...,0.550952,92.0,4
2,去年，有官员表示，胡克先生的团队所得出的结论是针对伊斯兰国炼油厂的空袭并未大幅削减恐怖组织的...,"Last year, officials said Mr. Hooker's team ha...","Last year, officials said, Mr. Hooker's team c...",0.540814,89.0,5
3,尤其值得玩味的是政府对于饥饿民众们的回应，比如总统市民赫伯特·胡佛“别人的事我可管不了”的态度。,It is particularly interesting to note the gov...,Especially savory are the accounts of the gove...,-0.793944,49.5,4
4,不过，从20世纪90年代至今，人类共进行了18次火星探测，而月球探测只进行了14次。,"However, from the 1990s to the present, human ...","However, ever since the 1990s, a total of 18 h...",0.046532,77.5,4


Comparison of LASER embeddings

In [5]:
source_arr = np.load("laser.source_embeds.npy")
refer_arr = np.load("laser.reference_embeds.npy")
trans_arr = np.load("laser.translation_embeds.npy")

Cosine Similarity

In [6]:
cos_similarity_ref_hyp = []
cos_similarity_src_ref = []
cos_similarity_src_hyp = []

# for each of the cos similarity, put them into lists.
for i in range(refer_arr.shape[0]):
    cos_similarity_ref_hyp.append((spatial.distance.cosine(refer_arr[i],trans_arr[i])*-1)+1)
    cos_similarity_src_ref.append((spatial.distance.cosine(source_arr[i],refer_arr[i])*-1)+1)
    cos_similarity_src_hyp.append((spatial.distance.cosine(source_arr[i],trans_arr[i])*-1)+1)


# Standardize and transform into series.
scaler = StandardScaler()
cs_rh = pd.Series(scaler.fit_transform(np.array(cos_similarity_ref_hyp).reshape(-1,1)).flatten(),name="cos_sim_ref_hyp")
cs_sr = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_ref).reshape(-1,1)).flatten(),name="cos_sim_src_ref")
cs_sh = pd.Series(scaler.fit_transform(np.array(cos_similarity_src_hyp).reshape(-1,1)).flatten(),name="cos_sim_src_hyp")



In [7]:
cos_df = pd.concat([cs_rh,cs_sr,cs_sh,df1.loc[:,"z-score"]],axis=1)
cos_df.head()

Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
0,0.173155,-0.026448,-0.237776,0.625559
1,1.256537,-0.232695,0.371036,0.550952
2,0.735595,1.418661,0.834117,0.540814
3,-0.476458,-0.265515,0.399952,-0.793944
4,0.000247,0.160120,0.187092,0.046532
...,...,...,...,...
26414,0.218580,0.754503,0.055343,0.563658
26415,-0.135794,-0.383519,0.677467,-0.358579
26416,-0.523033,0.242605,-0.527547,0.554093
26417,-1.972187,-2.763365,-2.246804,-1.724330


In [8]:
# Pearson
print("Pairwise Pearson")
cos_df.corr()

Pairwise Pearson


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.466661,0.62979,0.327467
cos_sim_src_ref,0.466661,1.0,0.73086,0.153644
cos_sim_src_hyp,0.62979,0.73086,1.0,0.157394
z-score,0.327467,0.153644,0.157394,1.0


In [9]:
# Kendall
print("Pairwise Kendall")
cos_df.corr(method="kendall")

Pairwise Kendall


Unnamed: 0,cos_sim_ref_hyp,cos_sim_src_ref,cos_sim_src_hyp,z-score
cos_sim_ref_hyp,1.0,0.29253,0.4039,0.224135
cos_sim_src_ref,0.29253,1.0,0.566472,0.094201
cos_sim_src_hyp,0.4039,0.566472,1.0,0.087771
z-score,0.224135,0.094201,0.087771,1.0


In [11]:
# Mean Absolute Deviation of each of these.
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.8199876472669945
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.9308549534144837
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.9328463761261162


---
Linear Regression <br>to try and predict avg score based on the cos similarity 

In [12]:
X = cos_df.drop(columns='z-score')
y = cos_df['z-score']

In [13]:
lin_model = LinearRegression()

In [14]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    lin_model_3_mae = mean_absolute_error(y_val,y_pred)
    print(lin_model_3_mae)

0.6776633492222478
0.6370079351004929
0.6324220639194772
0.6392171464008425
0.6444498893488928
0.6753647773244216
0.6620867984504463
0.650010768748836
0.6580107630612602
0.6589659304575401


In [34]:
lin_model_3_par = [lin_model.intercept_]
lin_model_3_par.extend([coef for coef in lin_model.coef_])
print(lin_model_3_par)

[-0.04074718536387313, 0.0680790974291972, 0.08563648855431708]

In [16]:
# A model with src_ref and src_hyp only. 
X = cos_df.drop(columns=['cos_sim_ref_hyp','z-score'])
y = cos_df['z-score']

In [17]:
kf = KFold(n_splits = 10)
for train_index, test_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    lin_model.fit(X_train,y_train)
    y_pred = lin_model.predict(X_val)
    print(mean_absolute_error(y_val,y_pred))

0.7141006029620198
0.6770994441870142
0.6861429193735801
0.6784437905018458
0.6954732339954339
0.7070602092647513
0.6922977112566656
0.6922487836041701
0.690739371662775
0.7027441103540882
[0.0680791  0.08563649]
-0.04074718536387313


In [18]:
y_pred

array([-0.18367885, -0.02664304,  0.08770364, ..., -0.0694081 ,
       -0.42128295,  0.06045823])

In [None]:
lin_model_2_par = [lin_model.intercept_]
lin_model_2_par.extend([coef for coef in lin_model.coef_])
print(lin_model_2_par)

---
Using a Neural Network

In [19]:
MLP_model = MLPRegressor(hidden_layer_sizes=(2,2),max_iter=250,learning_rate_init=0.0015,solver="lbfgs")

In [20]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [21]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
kf = KFold(5)
for train_index, val_index in kf.split(X_train):
    kf_X_train, X_val = X_train[train_index], X_train[val_index]
    kf_y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]    
    
    MLP_model.fit(kf_X_train,kf_y_train)
    y_val_pred = MLP_model.predict(X_val)
    print(f"Model loss: {MLP_model.loss_}")
    print(f"The MAE is {mean_absolute_error(y_val,y_val_pred)}")

Model loss: 0.3836637227911214
The MAE is 0.7006824626349987
Model loss: 0.38257539605733826
The MAE is 0.7068243028013327
Model loss: 0.37064695228275224
The MAE is 0.685056750911303
Model loss: 0.3768615280343884
The MAE is 0.7011582901772277
Model loss: 0.3704462311087519
The MAE is 0.6865049238463271


---
Using Neural Networks, directly on the embeddings.

In [23]:
# An input has to be of shape (3,1024), because we have 3 embedded vectors of size 1024.
# combined array
#full_arr = np.dstack((source_arr,refer_arr,trans_arr))

# Option 2, make it size (1,2048), by concatenating the arrays. This is what we are using now, according to teacher's indications.
full_arr_c = np.concatenate((refer_arr,trans_arr),axis=1)


In [24]:
X = full_arr_c.copy()
y = df1["z-score"].to_numpy()

# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [25]:
X_train.shape

(23777, 2048)

In [27]:
model = models.Sequential()

model.add(layers.Dense(1024,activation="relu",input_dim=X_train.shape[1]))
model.add(layers.Dense(512,activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(128,activation="relu"))
model.add(layers.Dropout(0.25))
model.add(layers.Dense(64,activation="relu"))

model.add(layers.Dense(1,activation="tanh"))
model.compile(optimizer=keras.optimizers.Adam(lr=0.002),loss="mse",metrics=["mae"])

In [28]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_3 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                

In [29]:
# Fit. Batch Size < > Learning Rate

model.fit(X_train,y_train,epochs=50,batch_size=64,verbose=0)

<keras.callbacks.callbacks.History at 0x2c03ac2c3c8>

In [37]:
# Evaluation and comparison.
test_mse,test_mae = model.evaluate(X_test,y_test)
test_corr = np.corrcoef(model.predict(X_test).flatten(),y_test)[0][1]
print(f"Test MSE: {test_mse:.4f}.\nTest MAE: {test_mae:.4f}.")
print(f"Pearson correlation between y_val_predicted and actual y_val: {test_corr:.4f}")

Test MSE: 0.7155.
Test MAE: 0.6557.
Pearson correlation between y_val_predicted and actual y_val: 0.3936


In [38]:
# For reference, pt.2
for col in cos_df.columns[:3]:
    print(f"Mean Absolute Error of {col} in regards to the z-score: {mean_absolute_error(cos_df[col],cos_df['z-score'])}")

Mean Absolute Error of cos_sim_ref_hyp in regards to the z-score: 0.8199876472669945
Mean Absolute Error of cos_sim_src_ref in regards to the z-score: 0.9308549534144837
Mean Absolute Error of cos_sim_src_hyp in regards to the z-score: 0.9328463761261162


In [40]:
# Save Model.
model.save("zh_en_laser_model__testcorr_{:.4f}_MAE_{:.4f}_time{}.h5".format(test_corr,test_mae,datetime.now().strftime('%d_%m_%H_%M')
))