# TM Project

### Importing the data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the development corpora: one scores.csv per language pair.

path_corpus = "/Users/franz/Desktop/TM Project/corpus/"

# Read the six files in a fixed order and bind each frame to its own name.
ru_en, de_en, cs_en, zh_en, en_zh, en_fi = (
    pd.read_csv(path_corpus + relative_file)
    for relative_file in (
        "ru-en/scores.csv",
        "de-en/scores.csv",
        "cs-en/scores.csv",
        "zh-en/scores.csv",
        "en-zh/scores.csv",
        "en-fi/scores.csv",
    )
)

# Independent working copies; the metric cells add their score columns to these.
ru_en_, de_en_, cs_en_, zh_en_, en_zh_, en_fi_ = (
    corpus.copy() for corpus in (ru_en, de_en, cs_en, zh_en, en_zh, en_fi)
)

In [None]:
# Preview the first rows of the de-en corpus to see which columns scores.csv provides.
de_en.head()

### Data exploration

In [None]:
# Human-readable labels for the language pairs, in the order in which the
# corpora are iterated throughout the notebook (ru-en, de-en, cs-en, zh-en, en-zh, en-fi).
descriptions = [
    "Russian into English",
    "German into English",
    "Czech into English",
    "Chinese into English",
    "English into Chinese",
    "English into Finnish",
]

In [None]:
# Summary per language pair: corpus size and the means of the human-judgement columns.
corpora = [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]

rows = [corpus.shape[0] for corpus in corpora]
zscores = [np.round(corpus["z-score"].mean(), 2) for corpus in corpora]
avgscores = [np.round(corpus["avg-score"].mean(), 2) for corpus in corpora]
annots = [np.round(corpus["annotators"].mean(), 2) for corpus in corpora]

# Build the table column by column so that every column keeps its own dtype
# (row counts stay integers instead of being coerced to float).
exploration_df = pd.DataFrame(
    {
        "rows": rows,
        "avg z-score": zscores,
        "avg avg-score": avgscores,
        "avg annotators": annots,
    },
    index=pd.Index(descriptions, name="description"),
)
exploration_df

In [None]:
# Pairwise correlations between the summary columns; each column has one value per language pair.
exploration_df.corr()

As there are only six language pairs (i.e. six data points per column), these correlations might not be very meaningful!

# Lexical metrics

## BLEU Score - Part 1

In [None]:
from collections import Counter

# A more "pythonic" way to compute BLEU_star

def BLEU_star_compact(refs, candidate):
    """Clipped unigram precision of ``candidate`` with respect to ``refs``.

    Both arguments are plain strings that are tokenised on whitespace.
    Every candidate word is credited at most as often as it occurs in the
    reference; the credited total is divided by the candidate length.

    Parameters
    ----------
    refs : str
        The (single) reference sentence.
    candidate : str
        The sentence to score.

    Returns
    -------
    float
        A value in [0, 1]. A candidate without any token scores 0.0
        instead of raising ``ZeroDivisionError``.
    """
    # Count the reference once; a Counter returns 0 for words it has not seen.
    reference_counts = Counter(refs.split())
    candidate_counts = Counter(candidate.split())

    candidate_length = sum(candidate_counts.values())
    if candidate_length == 0:
        return 0.0

    clipped_matches = sum(
        min(count, reference_counts[word])
        for word, count in candidate_counts.items()
    )
    return clipped_matches / candidate_length

In [None]:
overall_results = {}

correlations_p = []
correlations_k = []

# Walk the corpora together with the working copies that collect the metric
# columns, so no position counter / if-chain is needed to find the right copy.
for element, scored_copy in zip([ru_en, de_en, cs_en, zh_en, en_zh, en_fi],
                                [ru_en_, de_en_, cs_en_, zh_en_, en_zh_, en_fi_]):

    # BLEU* of every translation in comparison to its respective reference
    bleu_scores = [
        BLEU_star_compact(reference, translation)
        for reference, translation in zip(element["reference"], element["translation"])
    ]

    # add the bleu scores to a scratch copy so the corpus frame itself stays untouched
    development_df = element.copy()
    development_df["BLEU"] = bleu_scores

    # Correlate by column name: this does not depend on the column order and does
    # not ask pandas to correlate the text columns (which raises in pandas >= 2.0).
    correlations_p.append(development_df["z-score"].corr(development_df["BLEU"], method="pearson"))
    correlations_k.append(development_df["z-score"].corr(development_df["BLEU"], method="kendall"))

    scored_copy["BLEU"] = bleu_scores


print("\033[1mCorrelation between z-score and BLEU score\n")
for description, pearson, kendall in zip(descriptions, correlations_p, correlations_k):
    print("\033[1m", description + ":",  "\033[0mPearson:", np.round(pearson,4), "| Kendall:", np.round(kendall,4))

print("\n\033[1mOverall:\033[0m Average Pearson:", np.round(sum(correlations_p)/len(correlations_p),4),
         "| Average Kendall:", np.round(sum(correlations_k)/len(correlations_k),4))

overall_results["BLEU Star Pearson"] = correlations_p
overall_results["BLEU Star Kendall"] = correlations_k

## BLEU Score - Part 2

### 1st Try (sentence_bleu)

In [None]:
from nltk.translate.bleu_score import sentence_bleu

correlations_p = []
correlations_k = []

# Walk the corpora together with the working copies that collect the metric
# columns, so no position counter / if-chain is needed to find the right copy.
for element, scored_copy in zip([ru_en, de_en, cs_en, zh_en, en_zh, en_fi],
                                [ru_en_, de_en_, cs_en_, zh_en_, en_zh_, en_fi_]):

    # sentence-level BLEU (uniform 1- to 4-gram weights) of every translation
    # against its single, whitespace-tokenised reference
    bleu_scores = [
        sentence_bleu([reference.split()], translation.split(), weights=(0.25, 0.25, 0.25, 0.25))
        for reference, translation in zip(element["reference"], element["translation"])
    ]

    # add the bleu scores to a scratch copy so the corpus frame itself stays untouched
    development_df = element.copy()
    development_df["BLEU"] = bleu_scores

    # Correlate by column name: this does not depend on the column order and does
    # not ask pandas to correlate the text columns (which raises in pandas >= 2.0).
    correlations_p.append(development_df["z-score"].corr(development_df["BLEU"], method="pearson"))
    correlations_k.append(development_df["z-score"].corr(development_df["BLEU"], method="kendall"))

    scored_copy["BLEU_s"] = bleu_scores

print("\033[1mCorrelation between z-score and BLEU score\n")
for description, pearson, kendall in zip(descriptions, correlations_p, correlations_k):
    print("\033[1m", description + ":",  "\033[0mPearson:", np.round(pearson,4), "| Kendall:", np.round(kendall,4))

print("\n\033[1mOverall:\033[0m Average Pearson:", np.round(sum(correlations_p)/len(correlations_p),4),
         "| Average Kendall:", np.round(sum(correlations_k)/len(correlations_k),4))

overall_results["BLEU Sentence Pearson"] = correlations_p
overall_results["BLEU Sentence Kendall"] = correlations_k

### 2nd Try (corpus_bleu)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

correlations_p = []
correlations_k = []

# Walk the corpora together with the working copies that collect the metric
# columns, so no position counter / if-chain is needed to find the right copy.
for element, scored_copy in zip([ru_en, de_en, cs_en, zh_en, en_zh, en_fi],
                                [ru_en_, de_en_, cs_en_, zh_en_, en_zh_, en_fi_]):

    bleu_scores = []

    # calculating the bleu scores for the translations in comparison to their respective reference
    for reference_text, translation_text in zip(element["reference"], element["translation"]):
        reference = [reference_text.split()]
        translation = translation_text.split()
        # Pad the shorter list with " " until both lists have the same length.
        # NOTE(review): corpus_bleu is documented as taking (list_of_references, hypotheses)
        # for a whole corpus; here it receives the token lists of one sentence pair, so the
        # individual tokens appear to be scored as if they were sentences - confirm that
        # this is the intended metric before relying on these numbers.
        while len(reference) < len(translation):
            reference.append(" ")
        while len(reference) > len(translation):
            translation.append(" ")
        bleu_scores.append(corpus_bleu(reference, translation))

    # add the bleu scores to a scratch copy so the corpus frame itself stays untouched
    development_df = element.copy()
    development_df["BLEU"] = bleu_scores

    # Correlate by column name: this does not depend on the column order and does
    # not ask pandas to correlate the text columns (which raises in pandas >= 2.0).
    correlations_p.append(development_df["z-score"].corr(development_df["BLEU"], method="pearson"))
    correlations_k.append(development_df["z-score"].corr(development_df["BLEU"], method="kendall"))

    scored_copy["BLEU_c"] = bleu_scores


print("\033[1mCorrelation between z-score and BLEU score\n")
for description, pearson, kendall in zip(descriptions, correlations_p, correlations_k):
    print("\033[1m", description + ":",  "\033[0mPearson:", np.round(pearson,4), "| Kendall:", np.round(kendall,4))

print("\n\033[1mOverall:\033[0m Average Pearson:", np.round(sum(correlations_p)/len(correlations_p),4),
         "| Average Kendall:", np.round(sum(correlations_k)/len(correlations_k),4))

overall_results["BLEU Corpus Pearson"] = correlations_p
overall_results["BLEU Corpus Kendall"] = correlations_k

## ROUGE Score

### ROUGE 1

In [None]:
from rouge_score import rouge_scorer

results_p = pd.DataFrame()
results_k = pd.DataFrame()

correlations_p = []
correlations_k = []

# Building the scorer does not depend on the sentence, so it is created once;
# only the ROUGE variant that is actually read below is requested.
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
rouge_columns = ["ROUGE precision", "ROUGE recall", "ROUGE fmeasure"]

for col_name, element, scored_copy in zip(descriptions,
                                          [ru_en, de_en, cs_en, zh_en, en_zh, en_fi],
                                          [ru_en_, de_en_, cs_en_, zh_en_, en_zh_, en_fi_]):

    precisions = []
    recalls = []
    fmeasures = []

    # calculating the rouge scores for the translations in comparison to their respective reference
    for reference, translation in zip(element["reference"], element["translation"]):
        rouge = scorer.score(reference, translation)["rouge1"]
        precisions.append(rouge.precision)
        recalls.append(rouge.recall)
        fmeasures.append(rouge.fmeasure)

    # add the rouge scores to a scratch copy so the corpus frame itself stays untouched
    development_df = element.copy()
    development_df["ROUGE precision"] = precisions
    development_df["ROUGE recall"] = recalls
    development_df["ROUGE fmeasure"] = fmeasures

    # One correlation per ROUGE column against the human z-score, looked up by name
    # (no positional indexing into a full correlation matrix, no text columns involved).
    human_scores = development_df["z-score"]
    pearson = pd.Series({column: human_scores.corr(development_df[column], method="pearson")
                         for column in rouge_columns})
    kendall = pd.Series({column: human_scores.corr(development_df[column], method="kendall")
                         for column in rouge_columns})

    correlations_p.append(pearson["ROUGE fmeasure"])
    correlations_k.append(kendall["ROUGE fmeasure"])
    results_p[col_name] = pearson
    results_k[col_name] = kendall

    scored_copy["ROUGE1_precision"] = precisions
    scored_copy["ROUGE1_recall"] = recalls
    scored_copy["ROUGE1_fmeasure"] = fmeasures


print("\033[1mPearson Correlation between z-score and ROUGE measures (ROUGE 1) \n")


# rows of results_p: precision, recall, fmeasure; first six columns: the language pairs
overall_results["ROUGE 1 Precision Pearson"] = results_p.iloc[0,:6].values.tolist()
overall_results["ROUGE 1 Recall Pearson"] = results_p.iloc[1,:6].values.tolist()
overall_results["ROUGE 1 Fmeasure Pearson"] = results_p.iloc[2,:6].values.tolist()

results_p["Average"] = results_p.T.mean()
results_p

In [None]:
print("\033[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE 1) \n")

# The first three rows of results_k hold precision, recall and F-measure;
# the first six columns are the language pairs.
kendall_rows = results_k.iloc[:3, :6].values.tolist()
(overall_results["ROUGE 1 Precision Kendall"],
 overall_results["ROUGE 1 Recall Kendall"],
 overall_results["ROUGE 1 Fmeasure Kendall"]) = kendall_rows

# Row-wise mean over the language pairs.
results_k["Average"] = results_k.mean(axis=1)
results_k

### ROUGE 2

In [None]:
from rouge_score import rouge_scorer

results_p = pd.DataFrame()
results_k = pd.DataFrame()

correlations_p = []
correlations_k = []

# Building the scorer does not depend on the sentence, so it is created once;
# only the ROUGE variant that is actually read below is requested.
scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
rouge_columns = ["ROUGE precision", "ROUGE recall", "ROUGE fmeasure"]

for col_name, element, scored_copy in zip(descriptions,
                                          [ru_en, de_en, cs_en, zh_en, en_zh, en_fi],
                                          [ru_en_, de_en_, cs_en_, zh_en_, en_zh_, en_fi_]):

    precisions = []
    recalls = []
    fmeasures = []

    # calculating the rouge scores for the translations in comparison to their respective reference
    for reference, translation in zip(element["reference"], element["translation"]):
        rouge = scorer.score(reference, translation)["rouge2"]
        precisions.append(rouge.precision)
        recalls.append(rouge.recall)
        fmeasures.append(rouge.fmeasure)

    # add the rouge scores to a scratch copy so the corpus frame itself stays untouched
    development_df = element.copy()
    development_df["ROUGE precision"] = precisions
    development_df["ROUGE recall"] = recalls
    development_df["ROUGE fmeasure"] = fmeasures

    # One correlation per ROUGE column against the human z-score, looked up by name
    # (no positional indexing into a full correlation matrix, no text columns involved).
    human_scores = development_df["z-score"]
    pearson = pd.Series({column: human_scores.corr(development_df[column], method="pearson")
                         for column in rouge_columns})
    kendall = pd.Series({column: human_scores.corr(development_df[column], method="kendall")
                         for column in rouge_columns})

    correlations_p.append(pearson["ROUGE fmeasure"])
    correlations_k.append(kendall["ROUGE fmeasure"])
    results_p[col_name] = pearson
    results_k[col_name] = kendall

    scored_copy["ROUGE2_precision"] = precisions
    scored_copy["ROUGE2_recall"] = recalls
    scored_copy["ROUGE2_fmeasure"] = fmeasures


print("\033[1mPearson Correlation between z-score and ROUGE measures (ROUGE 2) \n")

# rows of results_p: precision, recall, fmeasure; first six columns: the language pairs
overall_results["ROUGE 2 Precision Pearson"] = results_p.iloc[0,:6].values.tolist()
overall_results["ROUGE 2 Recall Pearson"] = results_p.iloc[1,:6].values.tolist()
overall_results["ROUGE 2 Fmeasure Pearson"] = results_p.iloc[2,:6].values.tolist()

results_p["Average"] = results_p.T.mean()
results_p

In [None]:
print("\033[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE 2) \n")

# The first three rows of results_k hold precision, recall and F-measure;
# the first six columns are the language pairs.
kendall_rows = results_k.iloc[:3, :6].values.tolist()
(overall_results["ROUGE 2 Precision Kendall"],
 overall_results["ROUGE 2 Recall Kendall"],
 overall_results["ROUGE 2 Fmeasure Kendall"]) = kendall_rows

# Row-wise mean over the language pairs.
results_k["Average"] = results_k.mean(axis=1)
results_k

### ROUGE L

In [None]:
from rouge_score import rouge_scorer

results_p = pd.DataFrame()
results_k = pd.DataFrame()

correlations_p = []
correlations_k = []

# Building the scorer does not depend on the sentence, so it is created once;
# only the ROUGE variant that is actually read below is requested.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_columns = ["ROUGE precision", "ROUGE recall", "ROUGE fmeasure"]

for col_name, element, scored_copy in zip(descriptions,
                                          [ru_en, de_en, cs_en, zh_en, en_zh, en_fi],
                                          [ru_en_, de_en_, cs_en_, zh_en_, en_zh_, en_fi_]):

    precisions = []
    recalls = []
    fmeasures = []

    # calculating the rouge scores for the translations in comparison to their respective reference
    for reference, translation in zip(element["reference"], element["translation"]):
        rouge = scorer.score(reference, translation)["rougeL"]
        precisions.append(rouge.precision)
        recalls.append(rouge.recall)
        fmeasures.append(rouge.fmeasure)

    # add the rouge scores to a scratch copy so the corpus frame itself stays untouched
    development_df = element.copy()
    development_df["ROUGE precision"] = precisions
    development_df["ROUGE recall"] = recalls
    development_df["ROUGE fmeasure"] = fmeasures

    # One correlation per ROUGE column against the human z-score, looked up by name
    # (no positional indexing into a full correlation matrix, no text columns involved).
    human_scores = development_df["z-score"]
    pearson = pd.Series({column: human_scores.corr(development_df[column], method="pearson")
                         for column in rouge_columns})
    kendall = pd.Series({column: human_scores.corr(development_df[column], method="kendall")
                         for column in rouge_columns})

    correlations_p.append(pearson["ROUGE fmeasure"])
    correlations_k.append(kendall["ROUGE fmeasure"])
    results_p[col_name] = pearson
    results_k[col_name] = kendall

    scored_copy["ROUGEL_precision"] = precisions
    scored_copy["ROUGEL_recall"] = recalls
    scored_copy["ROUGEL_fmeasure"] = fmeasures


print("\033[1mPearson Correlation between z-score and ROUGE measures (ROUGE L) \n")

# rows of results_p: precision, recall, fmeasure; first six columns: the language pairs
overall_results["ROUGE L Precision Pearson"] = results_p.iloc[0,:6].values.tolist()
overall_results["ROUGE L Recall Pearson"] = results_p.iloc[1,:6].values.tolist()
overall_results["ROUGE L Fmeasure Pearson"] = results_p.iloc[2,:6].values.tolist()

results_p["Average"] = results_p.T.mean()
results_p

In [None]:
print("\033[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE L) \n")

# The first three rows of results_k hold precision, recall and F-measure;
# the first six columns are the language pairs.
kendall_rows = results_k.iloc[:3, :6].values.tolist()
(overall_results["ROUGE L Precision Kendall"],
 overall_results["ROUGE L Recall Kendall"],
 overall_results["ROUGE L Fmeasure Kendall"]) = kendall_rows

# Row-wise mean over the language pairs.
results_k["Average"] = results_k.mean(axis=1)
results_k

## RESULTS TABLE

In [None]:
# Split the collected result names by correlation type, which is encoded in the key suffix.
pearsons = [name for name in overall_results if name.endswith("Pearson")]
kendalls = [name for name in overall_results if name.endswith("Kendall")]

dict_pearson = {name: overall_results[name] for name in pearsons}
dict_kendall = {name: overall_results[name] for name in kendalls}

# One row per metric, one column per language pair.
pearson_df = pd.DataFrame(
    list(dict_pearson.values()),
    index=list(dict_pearson.keys()),
    columns=descriptions,
)
pearson_df

In [None]:
# For every language pair (column of pearson_df): the metric (row label) with the
# highest Pearson correlation, and that correlation value.
pearson_evaluation = pd.DataFrame(pearson_df.idxmax(), columns=["Metric with highest correlation"])
pearson_evaluation["Value"] = pearson_df.max()
pearson_evaluation

In [None]:
# Kendall counterpart of pearson_df: one row per metric, one column per language pair.
kendall_df = pd.DataFrame(list(dict_kendall.values()), index=list(dict_kendall.keys()), columns=descriptions)
kendall_df

In [None]:
# For every language pair (column of kendall_df): the metric (row label) with the
# highest Kendall correlation, and that correlation value.
kendall_evaluation = pd.DataFrame(kendall_df.idxmax(), columns=["Metric with highest correlation"])
kendall_evaluation["Value"] = kendall_df.max()
kendall_evaluation

## COMBINATION - Predicting the scores for the testset with the best respective metric

In [None]:
# Load the test sets: one scores.csv per language pair.

path_corpus_test = "/Users/franz/Downloads/testset/"

# Read the six files in a fixed order and bind each frame to its own name.
ru_en_test, de_en_test, cs_en_test, zh_en_test, en_zh_test, en_fi_test = (
    pd.read_csv(path_corpus_test + relative_file)
    for relative_file in (
        "ru-en/scores.csv",
        "de-en/scores.csv",
        "cs-en/scores.csv",
        "zh-en/scores.csv",
        "en-zh/scores.csv",
        "en-fi/scores.csv",
    )
)

Across the whole test set, only one field contains a NaN: `ru_en_test.iloc[9191, 1]`. As the metric score cannot be computed without a reference, this row has to be deleted!

In [None]:
# Show every test row whose reference is missing. A boolean mask replaces the manual
# index scan, so the result is an empty frame (instead of silently showing row 0)
# when there is no such row, e.g. after the row has already been dropped.
ru_en_test[ru_en_test["reference"].isna()]


In [None]:
# Drop every row that contains a NaN in any column and renumber the index from 0,
# so that the label lookups `.loc[i, ...]` for i in range(n) used below keep working.
ru_en_test = ru_en_test.dropna().reset_index(drop=True)

In [None]:
from langdetect import detect

# Every test frame is paired explicitly with its language pair. The pair is already
# known from the frame that is being iterated, so it does not have to be guessed from
# the first row with langdetect (whose result is not guaranteed to be stable).
test_sets = [
    ("ru_en", ru_en_test),
    ("de_en", de_en_test),
    ("cs_en", cs_en_test),
    ("zh_en", zh_en_test),
    ("en_zh", en_zh_test),
    ("en_fi", en_fi_test),
]

# Best single ROUGE measure per language pair: (ROUGE variant, field of its score).
best_rouge_measure = {
    "ru_en": ("rougeL", "precision"),
    "cs_en": ("rougeL", "precision"),
    "de_en": ("rougeL", "fmeasure"),
    "zh_en": ("rouge1", "precision"),
    "en_fi": ("rouge1", "precision"),
}

# The scorer does not depend on the sentence, so it is created once.
scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)

for pair, element in test_sets:

    predicted_scores = []

    if pair in best_rouge_measure:
        variant, field = best_rouge_measure[pair]
        for reference, translation in zip(element["reference"], element["translation"]):
            scores = scorer.score(reference, translation)
            predicted_scores.append(getattr(scores[variant], field))

    elif pair == "en_zh":
        # Same scoring as in the "2nd Try (corpus_bleu)" cell.
        for reference_text, translation_text in zip(element["reference"], element["translation"]):
            reference = [reference_text.split()]
            translation = translation_text.split()
            while len(reference) < len(translation):
                reference.append(" ")
            while len(reference) > len(translation):
                translation.append(" ")
            predicted_scores.append(corpus_bleu(reference, translation))

    else:
        # Fail with a clear message instead of a length mismatch on the assignment below.
        raise ValueError("no metric configured for language pair " + pair)

    element["predicted_score"] = predicted_scores

In [None]:
# Preview the first rows of every test frame, one table per language pair.
for element in [ru_en_test, de_en_test, cs_en_test, zh_en_test, en_zh_test, en_fi_test]:
    display(element.head())

## LINEAR REGRESSION ON RESPECTIVE TOP METRICS

In [None]:
from sklearn import linear_model

In [None]:
path_corpus = "/Users/franz/Desktop/TM Project/corpus/"

# Re-read the six development corpora from disk, in a fixed order, into fresh frames.
ru_en, de_en, cs_en, zh_en, en_zh, en_fi = (
    pd.read_csv(path_corpus + relative_file)
    for relative_file in (
        "ru-en/scores.csv",
        "de-en/scores.csv",
        "cs-en/scores.csv",
        "zh-en/scores.csv",
        "en-zh/scores.csv",
        "en-fi/scores.csv",
    )
)

In [None]:
# Top 5 metrics (highest Pearson correlation first) for every language pair.
# Iterating the columns avoids hard-coding the number of language pairs.
for description in pearson_df.columns:
    display(pd.DataFrame(pearson_df[description].sort_values(ascending = False).head(5)))

### Russian into English

In [None]:
l_precision = []
l_fmeasure = []
precision_1 = []
fmeasure_1 = []
bleu_star = []

# The scorer does not depend on the sentence, so it is created once instead of once per row.
scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)

for reference, translation in zip(ru_en["reference"], ru_en["translation"]):
    scores = scorer.score(reference, translation)
    l_precision.append(scores["rougeL"].precision)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    precision_1.append(scores["rouge1"].precision)
    fmeasure_1.append(scores["rouge1"].fmeasure)

    bleu_star.append(BLEU_star_compact(reference, translation))

# Attach the five candidate features to the corpus frame.
ru_en["l_precision"] = l_precision
ru_en["l_fmeasure"] = l_fmeasure
ru_en["precision_1"] = precision_1
ru_en["fmeasure_1"] = fmeasure_1
ru_en["bleu_star"] = bleu_star


In [None]:
# Features first, target (z-score) last. The explicit copy makes ru_en_scores an
# independent frame, so later column assignments on it do not trigger pandas'
# SettingWithCopyWarning.
ru_en_scores = ru_en[["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star", "z-score"]].copy()

In [None]:
from scipy.stats import zscore

# Standardise every feature column with scipy's zscore; the z-score target column is left as it is.
for element in ["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star"]:
    ru_en_scores[element] = zscore(ru_en_scores[element])


In [None]:
# Ordered 70:30 split: the first 70 % of the rows train the model, the rest is held out.
# NOTE(review): this rebinds `ru_en_test`, which earlier in the notebook holds the
# test-set frame with its `predicted_score` column - confirm the overwrite is intended.
ru_en_train = ru_en_scores.iloc[:int(ru_en_scores.shape[0]*0.7),:]
ru_en_test = ru_en_scores.iloc[int(ru_en_scores.shape[0]*0.7):,:]

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Every column but the last is a feature; the last column (z-score) is the target.
X, Y = ru_en_train.iloc[:, :-1], ru_en_train.iloc[:, -1]
X_test, Y_test = ru_en_test.iloc[:, :-1], ru_en_test.iloc[:, -1]

model_ru_en = LinearRegression()
model_ru_en.fit(X, Y)

# Held-out targets side by side with the model's predictions for them.
result = Y_test.to_frame()
result["regression_values"] = model_ru_en.predict(X_test)

In [None]:
# Correlation between the held-out z-scores and the regression predictions.
result.corr()

### German into English

In [None]:
l_fmeasure = []
fmeasure_1 = []
l_precision = []
l_recall = []
precision_1 = []

# The scorer does not depend on the sentence, so it is created once instead of once per row.
scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)

for reference, translation in zip(de_en["reference"], de_en["translation"]):
    scores = scorer.score(reference, translation)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    fmeasure_1.append(scores["rouge1"].fmeasure)
    l_precision.append(scores["rougeL"].precision)
    l_recall.append(scores["rougeL"].recall)
    precision_1.append(scores["rouge1"].precision)

de_en["l_fmeasure"] = l_fmeasure
de_en["fmeasure_1"] = fmeasure_1
de_en["l_precision"] = l_precision
de_en["l_recall"] = l_recall
de_en["precision_1"] = precision_1

# Features first, target (z-score) last. The explicit copy makes this an independent
# frame, so the column assignments below do not trigger SettingWithCopyWarning.
de_en_scores = de_en[["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1", "z-score"]].copy()

# Standardise the feature columns; the target stays as it is.
for element in ["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1"]:
    de_en_scores[element] = zscore(de_en_scores[element])

# Ordered 70:30 split.
# NOTE(review): this rebinds `de_en_test`, which earlier in the notebook holds the
# test-set frame with its `predicted_score` column - confirm the overwrite is intended.
de_en_train = de_en_scores.iloc[:int(de_en_scores.shape[0]*0.7),:]
de_en_test = de_en_scores.iloc[int(de_en_scores.shape[0]*0.7):,:]

X = de_en_train.iloc[:,:-1]
Y = de_en_train.iloc[:,-1]
X_test = de_en_test.iloc[:,:-1]
Y_test = de_en_test.iloc[:,-1]

model_de_en = LinearRegression()
model_de_en.fit(X,Y)

# Held-out targets side by side with the model's predictions for them.
result = pd.DataFrame(Y_test)
result["regression_values"] = model_de_en.predict(X_test)

In [None]:
# NOTE(review): this cell repeats the code of the preceding German-into-English cell
# line for line; it recomputes the same columns and refits the same model.
# Consider removing one of the two cells.
l_fmeasure = []
fmeasure_1 = []
l_precision = []
l_recall = []
precision_1 = []


# ROUGE-L and ROUGE-1 scores of every translation against its reference
for i in range(de_en.shape[0]):
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
    scores = scorer.score(de_en.loc[i,"reference"], de_en.loc[i,"translation"])
    l_fmeasure.append(scores["rougeL"].fmeasure)
    fmeasure_1.append(scores["rouge1"].fmeasure)
    l_precision.append(scores["rougeL"].precision)
    l_recall.append(scores["rougeL"].recall)
    precision_1.append(scores["rouge1"].precision)
    
de_en["l_fmeasure"] = l_fmeasure
de_en["fmeasure_1"] = fmeasure_1
de_en["l_precision"] = l_precision
de_en["l_recall"] = l_recall
de_en["precision_1"] = precision_1

# features first, target (z-score) last
de_en_scores = de_en[["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1", "z-score"]]

# standardise the feature columns; the target stays as it is
for element in ["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1"]:
    de_en_scores[element] = zscore(de_en_scores[element])
    
# ordered 70:30 split
de_en_train = de_en_scores.iloc[:int(de_en_scores.shape[0]*0.7),:]
de_en_test = de_en_scores.iloc[int(de_en_scores.shape[0]*0.7):,:]

X = de_en_train.iloc[:,:-1]
Y = de_en_train.iloc[:,-1]
X_test = de_en_test.iloc[:,:-1]
Y_test = de_en_test.iloc[:,-1]

model_de_en = LinearRegression()
model_de_en.fit(X,Y)

# held-out targets side by side with the model's predictions for them
result = pd.DataFrame(Y_test)
result["regression_values"] = model_de_en.predict(X_test)

In [None]:
# Correlation between the held-out z-scores and the regression predictions.
result.corr()

### Czech into English

In [None]:
l_precision = []
l_fmeasure = []
precision_1 = []
fmeasure_1 = []
bleu_star = []

# The scorer does not depend on the sentence, so it is created once instead of once per row.
scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)

for reference, translation in zip(cs_en["reference"], cs_en["translation"]):
    scores = scorer.score(reference, translation)
    l_precision.append(scores["rougeL"].precision)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    precision_1.append(scores["rouge1"].precision)
    fmeasure_1.append(scores["rouge1"].fmeasure)

    bleu_star.append(BLEU_star_compact(reference, translation))

cs_en["l_precision"] = l_precision
cs_en["l_fmeasure"] = l_fmeasure
cs_en["precision_1"] = precision_1
cs_en["fmeasure_1"] = fmeasure_1
cs_en["bleu_star"] = bleu_star

# Features first, target (z-score) last. The explicit copy makes this an independent
# frame, so the column assignments below do not trigger SettingWithCopyWarning.
cs_en_scores = cs_en[["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star", "z-score"]].copy()

# Standardise the feature columns; the target stays as it is.
for element in ["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star"]:
    cs_en_scores[element] = zscore(cs_en_scores[element])

# Ordered 70:30 split.
# NOTE(review): this rebinds `cs_en_test`, which earlier in the notebook holds the
# test-set frame with its `predicted_score` column - confirm the overwrite is intended.
cs_en_train = cs_en_scores.iloc[:int(cs_en_scores.shape[0]*0.7),:]
cs_en_test = cs_en_scores.iloc[int(cs_en_scores.shape[0]*0.7):,:]

X = cs_en_train.iloc[:,:-1]
Y = cs_en_train.iloc[:,-1]
X_test = cs_en_test.iloc[:,:-1]
Y_test = cs_en_test.iloc[:,-1]
X_full = cs_en_scores.iloc[:,:-1]
Y_full = cs_en_scores.iloc[:,-1]

model_cs_en = LinearRegression()
model_cs_en.fit(X,Y)

# Held-out targets side by side with the model's predictions for them.
result = pd.DataFrame(Y_test)
result["regression_values"] = model_cs_en.predict(X_test)

In [None]:
# Correlation between the held-out z-scores and the regression predictions.
result.corr()

### Chinese into English

In [None]:
l_precision = []
l_fmeasure = []
precision_1 = []
fmeasure_1 = []
bleu_star = []

# The scorer does not depend on the sentence, so it is created once instead of once per row.
scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)

for reference, translation in zip(zh_en["reference"], zh_en["translation"]):
    scores = scorer.score(reference, translation)
    l_precision.append(scores["rougeL"].precision)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    precision_1.append(scores["rouge1"].precision)
    fmeasure_1.append(scores["rouge1"].fmeasure)

    bleu_star.append(BLEU_star_compact(reference, translation))

zh_en["l_precision"] = l_precision
zh_en["l_fmeasure"] = l_fmeasure
zh_en["precision_1"] = precision_1
zh_en["fmeasure_1"] = fmeasure_1
zh_en["bleu_star"] = bleu_star

# Features first, target (z-score) last. The explicit copy makes this an independent
# frame, so the column assignments below do not trigger SettingWithCopyWarning.
zh_en_scores = zh_en[["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star", "z-score"]].copy()

# Standardise the feature columns; the target stays as it is.
for element in ["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star"]:
    zh_en_scores[element] = zscore(zh_en_scores[element])

# Ordered 70:30 split.
# NOTE(review): this rebinds `zh_en_test`, which earlier in the notebook holds the
# test-set frame with its `predicted_score` column - confirm the overwrite is intended.
zh_en_train = zh_en_scores.iloc[:int(zh_en_scores.shape[0]*0.7),:]
zh_en_test = zh_en_scores.iloc[int(zh_en_scores.shape[0]*0.7):,:]

X = zh_en_train.iloc[:,:-1]
Y = zh_en_train.iloc[:,-1]
X_test = zh_en_test.iloc[:,:-1]
Y_test = zh_en_test.iloc[:,-1]

model_zh_en = LinearRegression()
model_zh_en.fit(X,Y)

# Held-out targets side by side with the model's predictions for them.
result = pd.DataFrame(Y_test)
result["regression_values"] = model_zh_en.predict(X_test)

In [None]:
# Correlation between the held-out z-scores and the regression predictions.
result.corr()

### English into Chinese 

As only the BLEU Corpus yields acceptable results, there's no need for a Regression for this language pair!

### English into Finnish 

In [None]:

l_fmeasure = []
fmeasure_1 = []
l_precision = []
recall_1 = []
precision_1 = []

# The scorer does not depend on the sentence, so it is created once instead of once per row.
scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)

for reference, translation in zip(en_fi["reference"], en_fi["translation"]):
    scores = scorer.score(reference, translation)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    fmeasure_1.append(scores["rouge1"].fmeasure)
    l_precision.append(scores["rougeL"].precision)
    recall_1.append(scores["rouge1"].recall)
    precision_1.append(scores["rouge1"].precision)

en_fi["l_fmeasure"] = l_fmeasure
en_fi["fmeasure_1"] = fmeasure_1
en_fi["l_precision"] = l_precision
en_fi["recall_1"] = recall_1
en_fi["precision_1"] = precision_1

# Features first, target (z-score) last. The explicit copy makes this an independent
# frame, so the column assignments below do not trigger SettingWithCopyWarning.
en_fi_scores = en_fi[["l_fmeasure", "fmeasure_1", "l_precision", "recall_1", "precision_1", "z-score"]].copy()

# Standardise the feature columns; the target stays as it is.
for element in ["l_fmeasure", "fmeasure_1", "l_precision", "recall_1", "precision_1"]:
    en_fi_scores[element] = zscore(en_fi_scores[element])

# Ordered 70:30 split.
# NOTE(review): this rebinds `en_fi_test`, which earlier in the notebook holds the
# test-set frame with its `predicted_score` column - confirm the overwrite is intended.
en_fi_train = en_fi_scores.iloc[:int(en_fi_scores.shape[0]*0.7),:]
en_fi_test = en_fi_scores.iloc[int(en_fi_scores.shape[0]*0.7):,:]

X = en_fi_train.iloc[:,:-1]
Y = en_fi_train.iloc[:,-1]
X_test = en_fi_test.iloc[:,:-1]
Y_test = en_fi_test.iloc[:,-1]

model_en_fi = LinearRegression()
model_en_fi.fit(X,Y)

# Held-out targets side by side with the model's predictions for them.
result = pd.DataFrame(Y_test)
result["regression_values"] = model_en_fi.predict(X_test)

In [None]:
# Correlation between the held-out z-scores and the regression predictions.
result.corr()

# REGRESSION ON TOP OF ALL LEXICAL METRICS

From this point on, a consistent train/dev split of 80:20 is used:

In [None]:
from sklearn.model_selection import train_test_split

### RU EN


In [None]:
# Drop the text columns and the unused human-score columns before the regression.
ru_en_ = ru_en_.drop(["source", "reference", "translation", "avg-score", "annotators"], axis=1)

In [None]:
# Standardise every lexical metric column, then regress column 0 (presumably the
# human z-score) on the remaining columns using a shuffled 80:20 split.
lexical_metrics = [
    'BLEU', 'BLEU_s', 'BLEU_c',
    'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure',
    'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure',
    'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure',
]
for column in lexical_metrics:
    ru_en_[column] = zscore(ru_en_[column])

ru_en__train, ru_en__test = train_test_split(ru_en_, test_size=0.2, random_state=0, shuffle=True)

X, Y = ru_en__train.iloc[:, 1:], ru_en__train.iloc[:, 0]
X_test, Y_test = ru_en__test.iloc[:, 1:], ru_en__test.iloc[:, 0]
X_full, Y_full = ru_en_.iloc[:, 1:], ru_en_.iloc[:, 0]

# Model fitted on the 80% training part only; evaluated below.
model_ru_en_ = LinearRegression().fit(X, Y)

# Model fitted on all rows.
final_model_ru_en = LinearRegression().fit(X_full, Y_full)

# Held-out targets next to the predictions of the 80% model.
result = pd.DataFrame(Y_test).assign(regression_values=model_ru_en_.predict(X_test))

In [None]:
# Correlation matrix between the held-out target values and the regression predictions (DataFrame.corr defaults to Pearson).
result.corr()

### DE EN

In [None]:
# Drop the text columns and the unused human-score columns before the regression.
de_en_ = de_en_.drop(["source", "reference", "translation", "avg-score", "annotators"], axis=1)

# Standardise every lexical metric column.
lexical_metrics = [
    'BLEU', 'BLEU_s', 'BLEU_c',
    'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure',
    'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure',
    'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure',
]
for column in lexical_metrics:
    de_en_[column] = zscore(de_en_[column])

de_en__train, de_en__test = train_test_split(de_en_, test_size=0.2, random_state=0, shuffle=True)

# Column 0 is the regression target, the remaining columns are the features.
X, Y = de_en__train.iloc[:, 1:], de_en__train.iloc[:, 0]
X_test, Y_test = de_en__test.iloc[:, 1:], de_en__test.iloc[:, 0]

model_de_en_ = LinearRegression().fit(X, Y)

# Held-out targets next to the model's predictions.
result = pd.DataFrame(Y_test).assign(regression_values=model_de_en_.predict(X_test))

In [None]:
# Correlation matrix between the held-out target values and the regression predictions (DataFrame.corr defaults to Pearson).
result.corr()

### CS EN

In [None]:
# Drop the text columns and the unused human-score columns before the regression.
cs_en_ = cs_en_.drop(["source", "reference", "translation", "avg-score", "annotators"], axis=1)

# Standardise every lexical metric column.
lexical_metrics = [
    'BLEU', 'BLEU_s', 'BLEU_c',
    'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure',
    'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure',
    'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure',
]
for column in lexical_metrics:
    cs_en_[column] = zscore(cs_en_[column])

cs_en__train, cs_en__test = train_test_split(cs_en_, test_size=0.2, random_state=0, shuffle=True)

# Column 0 is the regression target, the remaining columns are the features.
X, Y = cs_en__train.iloc[:, 1:], cs_en__train.iloc[:, 0]
X_test, Y_test = cs_en__test.iloc[:, 1:], cs_en__test.iloc[:, 0]
X_full, Y_full = cs_en_.iloc[:, 1:], cs_en_.iloc[:, 0]

# Model fitted on the 80% training part only; evaluated below.
model_cs_en_ = LinearRegression().fit(X, Y)

# Model fitted on all rows.
final_model_cs_en = LinearRegression().fit(X_full, Y_full)

# Held-out targets next to the predictions of the 80% model.
result = pd.DataFrame(Y_test).assign(regression_values=model_cs_en_.predict(X_test))

In [None]:
# Correlation matrix between the held-out target values and the regression predictions (DataFrame.corr defaults to Pearson).
result.corr()

### ZH EN

In [None]:
# Drop the text columns and the unused human-score columns before the regression.
zh_en_ = zh_en_.drop(["source", "reference", "translation", "avg-score", "annotators"], axis=1)

# Standardise every lexical metric column.
lexical_metrics = [
    'BLEU', 'BLEU_s', 'BLEU_c',
    'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure',
    'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure',
    'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure',
]
for column in lexical_metrics:
    zh_en_[column] = zscore(zh_en_[column])

zh_en__train, zh_en__test = train_test_split(zh_en_, test_size=0.2, random_state=0, shuffle=True)

# Column 0 is the regression target, the remaining columns are the features.
X, Y = zh_en__train.iloc[:, 1:], zh_en__train.iloc[:, 0]
X_test, Y_test = zh_en__test.iloc[:, 1:], zh_en__test.iloc[:, 0]
X_full, Y_full = zh_en_.iloc[:, 1:], zh_en_.iloc[:, 0]

# Model fitted on the 80% training part only; evaluated below.
model_zh_en_ = LinearRegression().fit(X, Y)

# Model fitted on all rows.
final_model_zh_en = LinearRegression().fit(X_full, Y_full)

# Held-out targets next to the predictions of the 80% model.
result = pd.DataFrame(Y_test).assign(regression_values=model_zh_en_.predict(X_test))

In [None]:
# Correlation matrix between the held-out target values and the regression predictions (DataFrame.corr defaults to Pearson).
result.corr()

### EN ZH

In [None]:
# Drop the text columns and the unused human-score columns before the regression.
en_zh_ = en_zh_.drop(["source", "reference", "translation", "avg-score", "annotators"], axis=1)

# Standardise every lexical metric column.
lexical_metrics = [
    'BLEU', 'BLEU_s', 'BLEU_c',
    'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure',
    'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure',
    'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure',
]
for column in lexical_metrics:
    en_zh_[column] = zscore(en_zh_[column])

en_zh__train, en_zh__test = train_test_split(en_zh_, test_size=0.2, random_state=0, shuffle=True)

# Column 0 is the regression target, the remaining columns are the features.
X, Y = en_zh__train.iloc[:, 1:], en_zh__train.iloc[:, 0]
X_test, Y_test = en_zh__test.iloc[:, 1:], en_zh__test.iloc[:, 0]

model_en_zh_ = LinearRegression().fit(X, Y)

# Held-out targets next to the model's predictions.
result = pd.DataFrame(Y_test).assign(regression_values=model_en_zh_.predict(X_test))

In [None]:
# Correlation matrix between the held-out target values and the regression predictions (DataFrame.corr defaults to Pearson).
result.corr()

### EN FI

In [None]:
# Drop the text columns and the unused human-score columns before the regression.
en_fi_ = en_fi_.drop(columns=["source", "reference", "translation", "avg-score", "annotators"])

# Standardise every lexical metric column.
for element in ['BLEU', 'BLEU_s', 'BLEU_c', 'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure', 'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure', 'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure']:
    en_fi_[element] = zscore(en_fi_[element])

en_fi__train, en_fi__test = train_test_split(en_fi_, test_size=0.2, random_state=0, shuffle=True)

# Column 0 is the regression target, the remaining columns are the features.
X = en_fi__train.iloc[:,1:]
Y = en_fi__train.iloc[:,0]
X_test = en_fi__test.iloc[:,1:]
Y_test = en_fi__test.iloc[:,0]

# Full-data features/target for the final model. They must be rebuilt from
# en_fi_ here: without these two lines X_full / Y_full would still hold the
# data of whichever language pair last assigned them in an earlier cell, and
# final_model_en_fi would be fitted on the wrong corpus.
X_full = en_fi_.iloc[:,1:]
Y_full = en_fi_.iloc[:,0]

# Model fitted on the 80% training part only; evaluated below.
model_en_fi_ = LinearRegression()
model_en_fi_.fit(X,Y)

# Model fitted on all English -> Finnish rows.
final_model_en_fi = LinearRegression()
final_model_en_fi.fit(X_full, Y_full)

# Held-out targets next to the predictions of the 80% model.
result = pd.DataFrame(Y_test)
result["regression_values"] = model_en_fi_.predict(X_test)

In [None]:
# Correlation matrix between the held-out target values and the regression predictions (DataFrame.corr defaults to Pearson).
result.corr()

## THE OVERALL BEST CORRELATIONS PER LANGUAGE PAIR 

##### RU EN
* Regression with all 12 metrics included (0.368)

##### DE EN
* ROUGE L Fmeasure Pearson (0.328)

##### CS EN
* Regression with all 12 metrics included (0.483)

##### ZH EN
* Regression with all 12 metrics included (0.360)

##### EN ZH
* BLEU Corpus Pearson (0.424)

##### EN FI
* ROUGE 1 Precision Pearson (0.549)

# Final Metric Function

In [None]:
from langdetect import detect

# Load the test-set files: one scores.csv per language pair.

path_corpus_test = "/Users/franz/Downloads/testset/"

(ru_en_test, de_en_test, cs_en_test,
 zh_en_test, en_zh_test, en_fi_test) = (
    pd.read_csv(path_corpus_test + relative_path)
    for relative_path in (
        "ru-en/scores.csv",
        "de-en/scores.csv",
        "cs-en/scores.csv",
        "zh-en/scores.csv",
        "en-zh/scores.csv",
        "en-fi/scores.csv",
    )
)

In [None]:
# Fill the NaN: overwrite the cell at row position 9191, column position 1 of the
# Russian -> English test frame with a placeholder string.
# NOTE(review): column 1 is presumably the reference text and 9191 the only missing
# row — confirm against the CSV, since the position is hard-coded.
ru_en_test.iloc[9191,1] = "This is a dummy text, as the nan has to be filled!"

In [None]:
#this is the final function of this metric

def _bleu_corpus_row(reference_text, translation_text):
    """Return the per-row "BLEU corpus" value for one reference/translation pair.

    NOTE(review): the reference list is padded with " " until it has as many
    entries as the translation has tokens, and the token list itself is passed
    as the hypotheses, so corpus_bleu is called in an unusual way. The
    computation is kept exactly as it was; presumably the BLEU_c training
    column was produced the same way — confirm before changing it.
    """
    reference = [reference_text.split()]
    translation = translation_text.split()
    while len(reference) < len(translation):
        reference.append(" ")
    while len(reference) > len(translation):
        translation.append(" ")
    return corpus_bleu(reference, translation)


def _all_12_features(references, translations):
    """Build the 12-column lexical feature frame that is handed to predict()."""
    # Insertion order of this dict is the column order of the resulting frame.
    columns = {name: [] for name in (
        "BLEU", "BLEU_s", "BLEU_c",
        "ROUGE1_precision", "ROUGE1_recall", "ROUGE1_fmeasure",
        "ROUGE2_precision", "ROUGE2_recall", "ROUGE2_fmeasure",
        "ROUGEL_precision", "ROUGEL_recall", "ROUGEL_fmeasure",
    )}

    # One scorer per call instead of one per row; its configuration is constant.
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1', 'rouge2'], use_stemmer=True)

    for reference, translation in zip(references, translations):
        # BLEU star, sentence-level BLEU and the per-row "corpus" BLEU
        columns["BLEU"].append(BLEU_star_compact(reference, translation))
        columns["BLEU_s"].append(
            sentence_bleu([reference.split()], translation.split(), weights=(0.25, 0.25, 0.25, 0.25))
        )
        columns["BLEU_c"].append(_bleu_corpus_row(reference, translation))

        # ROUGE-1 / ROUGE-2 / ROUGE-L precision, recall and F-measure
        scores = scorer.score(reference, translation)
        for rouge_key, prefix in (("rouge1", "ROUGE1"), ("rouge2", "ROUGE2"), ("rougeL", "ROUGEL")):
            columns[prefix + "_precision"].append(scores[rouge_key].precision)
            columns[prefix + "_recall"].append(scores[rouge_key].recall)
            columns[prefix + "_fmeasure"].append(scores[rouge_key].fmeasure)

    return pd.DataFrame(columns)


def metric(element):
    """Score every translation in ``element`` with the best method for its language pair.

    The language pair is detected from the first row only (column 0 and
    column 1), then one of three strategies is applied to all rows:

    * ru_en, cs_en, zh-cn_en: linear regression on all 12 lexical metrics
    * de_en: ROUGE-L F-measure; en_fi: ROUGE-1 precision
    * en_zh-cn: per-row "BLEU corpus" value

    Parameters
    ----------
    element : pandas.DataFrame
        Needs "reference" and "translation" string columns; the first row of
        columns 0 and 1 is passed to the language detector.

    Returns
    -------
    numpy.ndarray or list of float
        One score per row, in row order (an array for the regression pairs,
        a list otherwise).

    Raises
    ------
    ValueError
        If the detected language pair is not one of the six supported pairs.
    """
    # detect the language pair contained in the dataframe
    pair = detect(element.iloc[0,0]) + "_" + detect(element.iloc[0,1])

    # Work on the column values positionally, so the function does not depend
    # on the frame having a 0..n-1 index.
    references = element["reference"].tolist()
    translations = element["translation"].tolist()

    # Pairs for which all 12 metrics are computed and fed to the respective model.
    if pair in ["ru_en", "cs_en", "zh-cn_en"]:
        all_12_df = _all_12_features(references, translations)

        # NOTE(review): the models used below are fitted elsewhere in this
        # notebook on z-scored metric columns, whereas the features built here
        # are passed to predict() unstandardised — confirm this is intended.
        # To be switched to the full-data models when applied to the test set,
        # as that is expected to yield better results.
        if pair == "ru_en":
            return model_ru_en_.predict(all_12_df)
        elif pair == "cs_en":
            return model_cs_en_.predict(all_12_df)
        return model_zh_en_.predict(all_12_df)

    if pair in ["de_en", "en_fi"]:
        scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1', 'rouge2'], use_stemmer=True)
        rouge_rows = [
            scorer.score(reference, translation)
            for reference, translation in zip(references, translations)
        ]
        if pair == "de_en":
            return [row["rougeL"].fmeasure for row in rouge_rows]
        return [row["rouge1"].precision for row in rouge_rows]

    if pair == "en_zh-cn":
        return [
            _bleu_corpus_row(reference, translation)
            for reference, translation in zip(references, translations)
        ]

    # Previously this case fell through and returned None without any error.
    raise ValueError("Unsupported or misdetected language pair: " + repr(pair))
        
    

## Testing on Development set

In [None]:
# Re-read the corpus and, for each language pair, keep only the rows whose index
# is in that pair's held-out split; the kept rows are renumbered from 0.

path_corpus = "/Users/franz/Desktop/TM Project/corpus/"

ru_en__ = pd.read_csv(path_corpus + "ru-en/scores.csv")
ru_en__ = ru_en__.loc[ru_en__.index.isin(ru_en__test.index)].reset_index(drop=True)

de_en__ = pd.read_csv(path_corpus + "de-en/scores.csv")
de_en__ = de_en__.loc[de_en__.index.isin(de_en__test.index)].reset_index(drop=True)

cs_en__ = pd.read_csv(path_corpus + "cs-en/scores.csv")
cs_en__ = cs_en__.loc[cs_en__.index.isin(cs_en__test.index)].reset_index(drop=True)

zh_en__ = pd.read_csv(path_corpus + "zh-en/scores.csv")
zh_en__ = zh_en__.loc[zh_en__.index.isin(zh_en__test.index)].reset_index(drop=True)

en_zh__ = pd.read_csv(path_corpus + "en-zh/scores.csv")
en_zh__ = en_zh__.loc[en_zh__.index.isin(en_zh__test.index)].reset_index(drop=True)

en_fi__ = pd.read_csv(path_corpus + "en-fi/scores.csv")
en_fi__ = en_fi__.loc[en_fi__.index.isin(en_fi__test.index)].reset_index(drop=True)

In [None]:
# Score every dev frame with the final metric; the result is stored in a new "scores" column.
for dev_frame in (ru_en__, de_en__, cs_en__, zh_en__, en_zh__, en_fi__):
    dev_frame["scores"] = metric(dev_frame)

In [None]:
# Pearson (p) and Kendall (k) correlation between the metric's scores and the
# human z-scores, one value per language pair.
p = []
k = []
for element in [ru_en__, de_en__, cs_en__, zh_en__, en_zh__, en_fi__]:
    # Correlate the two named columns directly. DataFrame.corr() on the whole
    # frame would also have to handle the text columns (an error on recent
    # pandas versions) and needs a positional lookup into the result.
    p.append(element["scores"].corr(element["z-score"]))
    k.append(element["scores"].corr(element["z-score"], method="kendall"))

In [None]:
# Final dev-set scores: one column per language pair, one row per correlation
# type, plus the row-wise mean over the six pairs.
r_lex = pd.DataFrame.from_dict(
    {"Pearson": p, "Kendall": k},
    orient="index",
    columns=["ru_en", "de_en","cs_en","zh_en", "en_zh", "en_fi"],
)
r_lex["mean"] = r_lex.mean(axis=1)
r_lex




## Exporting scores for the testset

In [None]:
# Score every test-set frame with the final metric; the result is stored in a new "scores" column.
for test_frame in (ru_en_test, de_en_test, cs_en_test, zh_en_test, en_zh_test, en_fi_test):
    test_frame["scores"] = metric(test_frame)

In [None]:
# Write each scored test frame next to its input file as scores_added.csv.
for relative_path, scored_frame in (
    ("ru-en/scores_added.csv", ru_en_test),
    ("de-en/scores_added.csv", de_en_test),
    ("cs-en/scores_added.csv", cs_en_test),
    ("zh-en/scores_added.csv", zh_en_test),
    ("en-zh/scores_added.csv", en_zh_test),
    ("en-fi/scores_added.csv", en_fi_test),
):
    scored_frame.to_csv(path_corpus_test + relative_path)