In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Define the path to folder with embedded corpus
path_to_embedding_corpus = '/Users/henriquevaz/NOVA IMS/YEAR 1/SPRING SEMESTER/TM/Project/My Version/corpus 2/'

# List folders inside it
folders_in_embedding_corpus = ['ru-en/', 'de-en/', 'cs-en/', 'zh-en/', 'en-zh/', 'en-fi/']

# Files inside the folders
embedding_files_in_folders = ['laser.reference_embeds.npy', 'laser.source_embeds.npy', 'laser.translation_embeds.npy']
scores_in_folders = 'scores.csv'

# Read files inside it
embedding = np.load(path_to_embedding_corpus+folders_in_embedding_corpus[4]+embedding_files_in_folders[1])
scores = pd.read_csv(path_to_embedding_corpus+folders_in_embedding_corpus[0]+scores_in_folders)

In [3]:
embedding.shape

(10221, 1024)

In [4]:
np.asarray(scores['z-score']).shape

(17980,)

In [5]:
# Add embeddings to a new dataframe

# The first 5 lines are used to initialize empty instances which we will fill later

# Create a DF with the columns we excpect it to have
df_embedded = pd.DataFrame(columns=['Pair', 'Source', 'Reference', 'Translation', 'z-score'])

# For the arrays we need to force a specific shape in order for the concatenate function to work
array_source = np.asarray([]).reshape(0, 1024)
array_reference = np.asarray([]).reshape(0, 1024)
array_translation = np.asarray([]).reshape(0, 1024)
array_zscore = np.asarray([]).reshape(0,)
array_pair = np.asarray([]).reshape(0,)

# Loop over the different language pairs
for i in range(len(folders_in_embedding_corpus)-1):
    
    # Using np.load we can load every language pair embedding and store it in a variable.
    to_add_source = np.load(path_to_embedding_corpus+folders_in_embedding_corpus[i]+'laser.source_embeds.npy')
    to_add_reference = np.load(path_to_embedding_corpus+folders_in_embedding_corpus[i]+'laser.reference_embeds.npy')
    to_add_translation = np.load(path_to_embedding_corpus+folders_in_embedding_corpus[i]+'laser.translation_embeds.npy')
    # In the case of the scores, these are stored in csv files
    to_add_zscore = pd.read_csv(path_to_embedding_corpus+folders_in_embedding_corpus[i]+scores_in_folders)['z-score']
    # For the pair of languages we want an array that simply repeats the language pair N times (N = number of sentences of each embedding)
    to_add_pair = [folders_in_embedding_corpus[i] for j in range (0, to_add_source.shape[0])]
    
    # Using concatenate we can be constatly adding to our arrays what we are loading and storing in variables
    array_pair = np.concatenate((array_pair, to_add_pair))
    array_source = np.concatenate((array_source, to_add_source))
    array_reference = np.concatenate((array_reference, to_add_reference))
    array_translation = np.concatenate((array_translation, to_add_translation))
    array_zscore = np.concatenate((array_zscore, to_add_zscore))
    
# End by assigning the respective values to each column
df_embedded['Pair'] = array_pair.tolist()
df_embedded['Source'] = array_source.tolist()
df_embedded['Reference'] = array_reference.tolist()
df_embedded['Translation'] = array_translation.tolist()
df_embedded['z-score'] = array_zscore.tolist()

In [6]:
# Take a look at the dataframe
df_embedded

Unnamed: 0,Pair,Source,Reference,Translation,z-score
0,ru-en/,"[0.006561310961842537, 0.0010309879435226321, ...","[0.002888732822611928, 0.003092581406235695, -...","[0.007184059824794531, 0.00223751668818295, -0...",0.878043
1,ru-en/,"[0.010224870406091213, -0.0008201345917768776,...","[0.011173672042787075, -0.0008004391565918922,...","[0.018348928540945053, -6.17729892837815e-05, ...",0.511473
2,ru-en/,"[0.00929456390440464, 0.004119107499718666, -0...","[0.016088047996163368, 0.01641150191426277, -0...","[0.031711727380752563, 0.01690896600484848, -0...",0.947866
3,ru-en/,"[0.034939173609018326, 0.013700926676392555, 0...","[0.016898855566978455, 0.018298206850886345, 0...","[0.024940183386206627, 0.017219029366970062, 0...",1.052601
4,ru-en/,"[0.004986530169844627, 0.01191420666873455, -0...","[0.002971083391457796, 0.005342863965779543, -...","[0.0010912771103903651, 0.006155983079224825, ...",0.738397
...,...,...,...,...,...
87904,en-zh/,"[0.004945721477270126, -0.00010818930604727939...","[0.012050468474626541, 0.001634744112379849, 0...","[0.01212915126234293, 0.0030880619306117296, 0...",0.077860
87905,en-zh/,"[0.024269072338938713, -0.00048510648775845766...","[0.022689804434776306, 0.0009008642518892884, ...","[0.008289525285363197, -0.0009707113495096564,...",0.762374
87906,en-zh/,"[0.00892164371907711, -0.0002100776182487607, ...","[0.0008965639863163233, 0.004110783338546753, ...","[0.0030187279917299747, 0.0032132624182850122,...",0.199586
87907,en-zh/,"[0.022005783393979073, 0.06224991753697395, 0....","[0.0025298059917986393, 0.02254778891801834, 0...","[0.008136971853673458, 0.01679421029984951, 0....",-2.382576


In [7]:
np.asarray(df_embedded['Source'][0])

array([ 0.00656131,  0.00103099, -0.00391787, ...,  0.03454931,
        0.0105996 ,  0.02284318])

In [8]:
# Create columns with differences between embeddings
df_embedded['s-r'] = df_embedded.apply(lambda row : np.asarray(row['Source'])-np.asarray(row['Reference']), axis=1)
df_embedded['s-t'] = df_embedded.apply(lambda row : np.asarray(row['Source'])-np.asarray(row['Translation']), axis=1)
df_embedded['r-t'] = df_embedded.apply(lambda row : np.asarray(row['Reference'])-np.asarray(row['Translation']), axis=1)

In [9]:
df_embedded

Unnamed: 0,Pair,Source,Reference,Translation,z-score,s-r,s-t,r-t
0,ru-en/,"[0.006561310961842537, 0.0010309879435226321, ...","[0.002888732822611928, 0.003092581406235695, -...","[0.007184059824794531, 0.00223751668818295, -0...",0.878043,"[0.003672578139230609, -0.0020615934627130628,...","[-0.0006227488629519939, -0.001206528744660318...","[-0.004295327002182603, 0.0008550647180527449,..."
1,ru-en/,"[0.010224870406091213, -0.0008201345917768776,...","[0.011173672042787075, -0.0008004391565918922,...","[0.018348928540945053, -6.17729892837815e-05, ...",0.511473,"[-0.0009488016366958618, -1.96954351849854e-05...","[-0.00812405813485384, -0.0007583616024930961,...","[-0.007175256498157978, -0.0007386661673081107..."
2,ru-en/,"[0.00929456390440464, 0.004119107499718666, -0...","[0.016088047996163368, 0.01641150191426277, -0...","[0.031711727380752563, 0.01690896600484848, -0...",0.947866,"[-0.006793484091758728, -0.012292394414544106,...","[-0.022417163476347923, -0.012789858505129814,...","[-0.015623679384589195, -0.0004974640905857086..."
3,ru-en/,"[0.034939173609018326, 0.013700926676392555, 0...","[0.016898855566978455, 0.018298206850886345, 0...","[0.024940183386206627, 0.017219029366970062, 0...",1.052601,"[0.01804031804203987, -0.00459728017449379, 0....","[0.009998990222811699, -0.003518102690577507, ...","[-0.008041327819228172, 0.0010791774839162827,..."
4,ru-en/,"[0.004986530169844627, 0.01191420666873455, -0...","[0.002971083391457796, 0.005342863965779543, -...","[0.0010912771103903651, 0.006155983079224825, ...",0.738397,"[0.0020154467783868313, 0.0065713427029550076,...","[0.0038952530594542623, 0.005758223589509726, ...","[0.001879806281067431, -0.000813119113445282, ..."
...,...,...,...,...,...,...,...,...
87904,en-zh/,"[0.004945721477270126, -0.00010818930604727939...","[0.012050468474626541, 0.001634744112379849, 0...","[0.01212915126234293, 0.0030880619306117296, 0...",0.077860,"[-0.007104746997356415, -0.0017429334184271283...","[-0.0071834297850728035, -0.003196251236659009...","[-7.86827877163887e-05, -0.0014533178182318807..."
87905,en-zh/,"[0.024269072338938713, -0.00048510648775845766...","[0.022689804434776306, 0.0009008642518892884, ...","[0.008289525285363197, -0.0009707113495096564,...",0.762374,"[0.001579267904162407, -0.001385970739647746, ...","[0.015979547053575516, 0.00048560486175119877,...","[0.014400279149413109, 0.0018715756013989449, ..."
87906,en-zh/,"[0.00892164371907711, -0.0002100776182487607, ...","[0.0008965639863163233, 0.004110783338546753, ...","[0.0030187279917299747, 0.0032132624182850122,...",0.199586,"[0.008025079732760787, -0.004320860956795514, ...","[0.0059029157273471355, -0.003423340036533773,...","[-0.0021221640054136515, 0.0008975209202617407..."
87907,en-zh/,"[0.022005783393979073, 0.06224991753697395, 0....","[0.0025298059917986393, 0.02254778891801834, 0...","[0.008136971853673458, 0.01679421029984951, 0....",-2.382576,"[0.019475977402180433, 0.03970212861895561, 0....","[0.013868811540305614, 0.04545570723712444, 0....","[-0.005607165861874819, 0.005753578618168831, ..."


In [10]:
# Creating the column names for each value of the sentence embedding vectors
source_cols = ['s'+str(i) for i in range(1, 1025)]
ref_cols = ['r'+str(i) for i in range(1, 1025)]
trans_cols = ['t'+str(i) for i in range(1, 1025)]
source_minus_ref_cols = ['s-r'+str(i) for i in range(1, 1025)]
source_minus_trans_cols = ['s-'+str(i) for i in range(1, 1025)]
ref_minus_trans_cols = ['r-t'+str(i) for i in range(1, 1025)]

In [11]:
# Disagreggate the values in each embedding vector and place them in the respective columns
df_embedded[source_cols] = pd.DataFrame(df_embedded.Source.tolist(), index= df_embedded.index)
df_embedded[ref_cols] = pd.DataFrame(df_embedded.Reference.tolist(), index= df_embedded.index)
df_embedded[trans_cols] = pd.DataFrame(df_embedded.Translation.tolist(), index= df_embedded.index)
df_embedded[source_minus_ref_cols] = pd.DataFrame(df_embedded['s-r'].tolist(), index= df_embedded.index)
df_embedded[source_minus_trans_cols] = pd.DataFrame(df_embedded['s-t'].tolist(), index= df_embedded.index)
df_embedded[ref_minus_trans_cols] = pd.DataFrame(df_embedded['r-t'].tolist(), index= df_embedded.index)

In [12]:
# Some cleaning-up and rearrangement
change_position = df_embedded['z-score']
df_embedded.drop(columns=['z-score'], inplace=True)
df_embedded['z-score'] = change_position

In [13]:
# Check final df
df_embedded

Unnamed: 0,Pair,Source,Reference,Translation,s-r,s-t,r-t,s1,s2,s3,...,r-t1016,r-t1017,r-t1018,r-t1019,r-t1020,r-t1021,r-t1022,r-t1023,r-t1024,z-score
0,ru-en/,"[0.006561310961842537, 0.0010309879435226321, ...","[0.002888732822611928, 0.003092581406235695, -...","[0.007184059824794531, 0.00223751668818295, -0...","[0.003672578139230609, -0.0020615934627130628,...","[-0.0006227488629519939, -0.001206528744660318...","[-0.004295327002182603, 0.0008550647180527449,...",0.006561,0.001031,-0.003918,...,-0.003302,0.001217,-0.007953,0.011058,-0.004098,-0.002394,-0.012562,0.007248,0.006608,0.878043
1,ru-en/,"[0.010224870406091213, -0.0008201345917768776,...","[0.011173672042787075, -0.0008004391565918922,...","[0.018348928540945053, -6.17729892837815e-05, ...","[-0.0009488016366958618, -1.96954351849854e-05...","[-0.00812405813485384, -0.0007583616024930961,...","[-0.007175256498157978, -0.0007386661673081107...",0.010225,-0.000820,-0.004223,...,-0.000374,-0.002683,-0.006574,0.002717,-0.001447,0.001296,0.002994,-0.003468,-0.009072,0.511473
2,ru-en/,"[0.00929456390440464, 0.004119107499718666, -0...","[0.016088047996163368, 0.01641150191426277, -0...","[0.031711727380752563, 0.01690896600484848, -0...","[-0.006793484091758728, -0.012292394414544106,...","[-0.022417163476347923, -0.012789858505129814,...","[-0.015623679384589195, -0.0004974640905857086...",0.009295,0.004119,-0.002359,...,0.003351,0.000316,-0.004499,0.000484,-0.002732,-0.030968,-0.001269,0.003250,-0.000684,0.947866
3,ru-en/,"[0.034939173609018326, 0.013700926676392555, 0...","[0.016898855566978455, 0.018298206850886345, 0...","[0.024940183386206627, 0.017219029366970062, 0...","[0.01804031804203987, -0.00459728017449379, 0....","[0.009998990222811699, -0.003518102690577507, ...","[-0.008041327819228172, 0.0010791774839162827,...",0.034939,0.013701,0.000403,...,0.006583,-0.000600,0.003449,-0.000309,0.000467,-0.012154,-0.001654,-0.007504,0.009270,1.052601
4,ru-en/,"[0.004986530169844627, 0.01191420666873455, -0...","[0.002971083391457796, 0.005342863965779543, -...","[0.0010912771103903651, 0.006155983079224825, ...","[0.0020154467783868313, 0.0065713427029550076,...","[0.0038952530594542623, 0.005758223589509726, ...","[0.001879806281067431, -0.000813119113445282, ...",0.004987,0.011914,-0.000958,...,-0.002526,-0.000117,0.004955,-0.023320,-0.007464,-0.010163,-0.015542,-0.001703,0.013226,0.738397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87904,en-zh/,"[0.004945721477270126, -0.00010818930604727939...","[0.012050468474626541, 0.001634744112379849, 0...","[0.01212915126234293, 0.0030880619306117296, 0...","[-0.007104746997356415, -0.0017429334184271283...","[-0.0071834297850728035, -0.003196251236659009...","[-7.86827877163887e-05, -0.0014533178182318807...",0.004946,-0.000108,0.003769,...,0.007170,0.000926,0.003359,0.000967,0.000268,0.007270,0.007344,0.002728,-0.004646,0.077860
87905,en-zh/,"[0.024269072338938713, -0.00048510648775845766...","[0.022689804434776306, 0.0009008642518892884, ...","[0.008289525285363197, -0.0009707113495096564,...","[0.001579267904162407, -0.001385970739647746, ...","[0.015979547053575516, 0.00048560486175119877,...","[0.014400279149413109, 0.0018715756013989449, ...",0.024269,-0.000485,-0.003670,...,0.029545,-0.003317,0.022342,-0.001587,-0.015162,-0.017379,0.006170,-0.004643,-0.001515,0.762374
87906,en-zh/,"[0.00892164371907711, -0.0002100776182487607, ...","[0.0008965639863163233, 0.004110783338546753, ...","[0.0030187279917299747, 0.0032132624182850122,...","[0.008025079732760787, -0.004320860956795514, ...","[0.0059029157273471355, -0.003423340036533773,...","[-0.0021221640054136515, 0.0008975209202617407...",0.008922,-0.000210,-0.001784,...,0.011963,-0.002550,-0.005985,-0.000947,-0.000300,-0.041854,0.004252,-0.002070,-0.010786,0.199586
87907,en-zh/,"[0.022005783393979073, 0.06224991753697395, 0....","[0.0025298059917986393, 0.02254778891801834, 0...","[0.008136971853673458, 0.01679421029984951, 0....","[0.019475977402180433, 0.03970212861895561, 0....","[0.013868811540305614, 0.04545570723712444, 0....","[-0.005607165861874819, 0.005753578618168831, ...",0.022006,0.062250,0.026397,...,-0.000818,0.001726,-0.022708,0.000343,-0.000838,0.005927,-0.006286,-0.001494,0.004483,-2.382576


In [14]:
# Create our training data and target instances
X = df_embedded.drop(columns=['Pair', 'Source', 'Reference', 'Translation', 'z-score', 's-r', 's-t', 'r-t'])
y = df_embedded.iloc[:, -1]

In [15]:
# Look at X
X

Unnamed: 0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,...,r-t1015,r-t1016,r-t1017,r-t1018,r-t1019,r-t1020,r-t1021,r-t1022,r-t1023,r-t1024
0,0.006561,0.001031,-0.003918,0.004773,-0.011933,0.005685,0.016794,0.003516,-0.001156,0.000746,...,0.002334,-0.003302,0.001217,-0.007953,0.011058,-0.004098,-0.002394,-0.012562,0.007248,0.006608
1,0.010225,-0.000820,-0.004223,0.031240,0.014482,0.025482,0.000782,0.020936,0.008846,-0.000037,...,0.015266,-0.000374,-0.002683,-0.006574,0.002717,-0.001447,0.001296,0.002994,-0.003468,-0.009072
2,0.009295,0.004119,-0.002359,0.024323,-0.006884,0.005067,-0.001138,0.001637,0.004946,0.000050,...,0.001151,0.003351,0.000316,-0.004499,0.000484,-0.002732,-0.030968,-0.001269,0.003250,-0.000684
3,0.034939,0.013701,0.000403,0.024354,0.002640,0.002479,0.004500,0.002206,0.008627,0.013264,...,-0.015675,0.006583,-0.000600,0.003449,-0.000309,0.000467,-0.012154,-0.001654,-0.007504,0.009270
4,0.004987,0.011914,-0.000958,0.023801,-0.036809,0.001782,0.002315,0.000142,0.001726,0.000035,...,0.039268,-0.002526,-0.000117,0.004955,-0.023320,-0.007464,-0.010163,-0.015542,-0.001703,0.013226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87904,0.004946,-0.000108,0.003769,0.060858,0.017561,0.079226,0.003803,0.011656,0.004564,0.040759,...,-0.020613,0.007170,0.000926,0.003359,0.000967,0.000268,0.007270,0.007344,0.002728,-0.004646
87905,0.024269,-0.000485,-0.003670,0.024572,0.013432,0.004812,0.013294,0.006269,0.026511,0.000968,...,0.003380,0.029545,-0.003317,0.022342,-0.001587,-0.015162,-0.017379,0.006170,-0.004643,-0.001515
87906,0.008922,-0.000210,-0.001784,0.007576,0.014360,0.004480,-0.000313,0.004667,0.016907,0.065814,...,-0.001531,0.011963,-0.002550,-0.005985,-0.000947,-0.000300,-0.041854,0.004252,-0.002070,-0.010786
87907,0.022006,0.062250,0.026397,0.012046,0.001776,0.001734,0.015076,0.014784,0.015981,-0.000029,...,-0.010983,-0.000818,0.001726,-0.022708,0.000343,-0.000838,0.005927,-0.006286,-0.001494,0.004483


In [16]:
# Check the datatypes in it
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87909 entries, 0 to 87908
Columns: 6144 entries, s1 to r-t1024
dtypes: float64(6144)
memory usage: 4.0 GB


In [17]:
# Look at the labels
y

0        0.878043
1        0.511473
2        0.947866
3        1.052601
4        0.738397
           ...   
87904    0.077860
87905    0.762374
87906    0.199586
87907   -2.382576
87908    1.217896
Name: z-score, Length: 87909, dtype: float64

In [18]:
# Simple train test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

#### Linear Regression

In [19]:
# Initialize a linear regression
lin_model = LinearRegression()

In [20]:
# fit our training data to it
lin_model.fit(X_train, y_train)

LinearRegression()

In [21]:
# Make predictions and store them
predictions = lin_model.predict(X_val)
predictions

array([ 0.32663386, -0.29719265,  0.66463433, ..., -0.3365093 ,
        0.00839607,  0.51702206])

In [22]:
# Look at the actual values
y_val

16941    0.506271
50520   -0.671202
83278    1.300690
7716     0.424194
55457    0.451568
           ...   
30458   -0.211432
10308   -0.168753
41887   -0.751480
11       1.008142
60104    0.601312
Name: z-score, Length: 17582, dtype: float64

In [26]:
# Save the actual values and predicted values in a dataframe
df_linreg_results = pd.DataFrame()
df_linreg_results['True'] = y_val
#df_linreg_results.reset_index(inplace=True)
df_linreg_results['Predicted'] = predictions.tolist()

In [27]:
# Take a look a it
df_linreg_results

Unnamed: 0,True,Predicted
16941,0.506271,0.326634
50520,-0.671202,-0.297193
83278,1.300690,0.664634
7716,0.424194,-0.390418
55457,0.451568,-0.387602
...,...,...
30458,-0.211432,-0.103754
10308,-0.168753,0.198749
41887,-0.751480,-0.336509
11,1.008142,0.008396


In [31]:
# Create a df only with the indexes and languages (So that we dont have to use the huge dataframe)
df_lpair_index = pd.DataFrame(df_embedded['Pair'])
df_lpair_index

Unnamed: 0,Pair
0,ru-en/
1,ru-en/
2,ru-en/
3,ru-en/
4,ru-en/
...,...
87904,en-zh/
87905,en-zh/
87906,en-zh/
87907,en-zh/


In [32]:
# Join both to have scores associated with each language pair
result_with_pairs = df_linreg_results.join(df_lpair_index)
result_with_pairs

Unnamed: 0,True,Predicted,Pair
16941,0.506271,0.326634,ru-en/
50520,-0.671202,-0.297193,cs-en/
83278,1.300690,0.664634,en-zh/
7716,0.424194,-0.390418,ru-en/
55457,0.451568,-0.387602,zh-en/
...,...,...,...
30458,-0.211432,-0.103754,de-en/
10308,-0.168753,0.198749,ru-en/
41887,-0.751480,-0.336509,cs-en/
11,1.008142,0.008396,ru-en/


In [47]:
# Get pearson correlaion coefficient between ground truth and predicted values
total_corr_pearson = result_with_pairs['True'].corr(result_with_pairs['Predicted'], method='pearson')
total_corr_pearson

0.33557607903328657

In [48]:
# Get kendall coefficient between ground truth and predicted values
total_corr_kendall = result_with_pairs['True'].corr(result_with_pairs['Predicted'], method='kendall')
total_corr_kendall

0.22516217580182993

In [49]:
# Get pearson correlaion for each language pair
pearson_corr_pairs = {}

for i in range(len(folders_in_embedding_corpus)-1):
    key = folders_in_embedding_corpus[i]
    pearson_corr_pairs[key] = result_with_pairs[result_with_pairs['Pair'] == key]['True'].corr(result_with_pairs[result_with_pairs['Pair'] == key]['Predicted'], method='pearson')
    
pearson_corr_pairs['total'] = total_corr_pearson

pearson_corr_pairs

{'ru-en/': 0.3039987332964856,
 'de-en/': 0.30123151814327315,
 'cs-en/': 0.3459202083330235,
 'zh-en/': 0.3606701749607575,
 'en-zh/': 0.3663260996258273,
 'total': 0.33557607903328657}

In [53]:
# Create a df from the dict
pearson_corr_df = pd.DataFrame(pearson_corr_pairs.items(), columns=['Pairs', 'Pearson'])
pearson_corr_df

Unnamed: 0,Pairs,Pearson
0,ru-en/,0.303999
1,de-en/,0.301232
2,cs-en/,0.34592
3,zh-en/,0.36067
4,en-zh/,0.366326
5,total,0.335576


In [50]:
# Get kendall correlaion for each language pair
kendall_corr_pairs = {}

for i in range(len(folders_in_embedding_corpus)-1):
    key = folders_in_embedding_corpus[i]
    kendall_corr_pairs[key] = result_with_pairs[result_with_pairs['Pair'] == key]['True'].corr(result_with_pairs[result_with_pairs['Pair'] == key]['Predicted'], method='kendall')

kendall_corr_pairs['total'] = total_corr_kendall

kendall_corr_pairs

{'ru-en/': 0.20361546769543043,
 'de-en/': 0.2000772043333,
 'cs-en/': 0.23594423179404264,
 'zh-en/': 0.24323030995782718,
 'en-zh/': 0.24060863197461663,
 'total': 0.22516217580182993}

In [58]:
# Create a df from the dict
kendall_corr_df = pd.DataFrame(kendall_corr_pairs.items(), columns=['Pairs', 'Kendall'])
kendall_corr_df

Unnamed: 0,Pairs,Kendall
0,ru-en/,0.203615
1,de-en/,0.200077
2,cs-en/,0.235944
3,zh-en/,0.24323
4,en-zh/,0.240609
5,total,0.225162


In [59]:
results_correlations = pd.merge(pearson_corr_df, kendall_corr_df, how='left', on=['Pairs'])
results_correlations

Unnamed: 0,Pairs,Pearson,Kendall
0,ru-en/,0.303999,0.203615
1,de-en/,0.301232,0.200077
2,cs-en/,0.34592,0.235944
3,zh-en/,0.36067,0.24323
4,en-zh/,0.366326,0.240609
5,total,0.335576,0.225162
