# TM Project

### Importing the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#importing the data

path_corpus = "/Users/franz/Desktop/TM Project/corpus/"

ru_en = pd.read_csv(path_corpus + "ru-en/scores.csv")
de_en = pd.read_csv(path_corpus + "de-en/scores.csv")
cs_en = pd.read_csv(path_corpus + "cs-en/scores.csv")
zh_en = pd.read_csv(path_corpus + "zh-en/scores.csv")
en_zh = pd.read_csv(path_corpus + "en-zh/scores.csv")
en_fi = pd.read_csv(path_corpus + "en-fi/scores.csv")

ru_en_ = ru_en.copy()
de_en_ = de_en.copy()
cs_en_ = cs_en.copy()
zh_en_ = zh_en.copy()
en_zh_ = en_zh.copy()
en_fi_ = en_fi.copy()

In [3]:
de_en.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2


### Data exploration

In [4]:
descriptions = ["Russian into English", "German into English", "Czech into English", "Chinese into English", "English into Chinese", "English into Finish"]

In [5]:
rows = []
zscores = []
avgscores = []
annots = []

i = 0

for element in [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]:
    rows.append(element.shape[0])
    zscores.append(np.round(element["z-score"].mean(),2))
    avgscores.append(np.round(element["avg-score"].mean(), 2))
    annots.append(np.round(element["annotators"].mean(),2))
    i += 1                   
    
exploration_df = pd.DataFrame([rows, zscores, avgscores, annots]).T.rename(columns={0:"rows", 1:"avg z-score", 2:"avg avg-score", 3:"avg annotators"})
exploration_df["description"] = descriptions
exploration_df = exploration_df.set_index("description")
exploration_df

Unnamed: 0_level_0,rows,avg z-score,avg avg-score,avg annotators
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Russian into English,17980.0,0.01,74.5,1.3
German into English,21704.0,0.0,71.85,1.5
Czech into English,11585.0,-0.03,69.24,1.89
Chinese into English,26419.0,-0.05,66.06,1.42
English into Chinese,10221.0,-0.06,65.98,1.58
English into Finish,6748.0,-0.14,45.12,1.23


In [6]:
exploration_df.corr()

Unnamed: 0,rows,avg z-score,avg avg-score,avg annotators
rows,1.0,0.597505,0.579839,-0.105454
avg z-score,0.597505,1.0,0.975645,0.310459
avg avg-score,0.579839,0.975645,1.0,0.41711
avg annotators,-0.105454,0.310459,0.41711,1.0


As there are only 6 different types of translations, these correlations might be not very meaningful!

# Lexical metrics

## BLEU Score - Part 1

In [7]:
from collections import Counter

# a more "pythonic" way to compute BLUE_star 

def BLEU_star_compact(refs, candidate):
    refs = [refs.split()]
    candidate = candidate.split()

    return sum([min(count, max([ref[word] for ref in [Counter(ref) for ref in refs]])) for word, count in Counter(candidate).items()])/len(candidate)

In [8]:
overall_results = {}

correlations_p = []
correlations_k = []

co = 0
for element in [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]:
    
    bleu_scores = []

    #calculating the bleu scores for the translations in comparison to their respective reference
    for i in range(element.shape[0]): #element.shape[0]
        reference = element.loc[i,"reference"]
        translation = element.loc[i,"translation"]
        bleu_scores.append(BLEU_star_compact(reference, translation))

    #add the bleu scores to the dataframe
    development_df = element.copy() #element.shape[0]
    development_df["BLEU"] = bleu_scores
    correlations_p.append(development_df.corr(method="pearson").iloc[-1:,0].values[0])
    correlations_k.append(development_df.corr(method="kendall").iloc[-1:,0].values[0])
    
    if co == 0:
        ru_en_["BLEU"] = bleu_scores
    elif co == 1:
        de_en_["BLEU"] = bleu_scores
    elif co == 2:
        cs_en_["BLEU"] = bleu_scores
    elif co == 3:
        zh_en_["BLEU"] = bleu_scores
    elif co == 4:
        en_zh_["BLEU"] = bleu_scores
    elif co == 5:
        en_fi_["BLEU"] = bleu_scores
    co += 1
        


print("\033[1mCorrelation between z-score and BLEU score\n")
i = 0
for element in correlations_p:
    print("\033[1m", descriptions[i] + ":",  "\033[0mPearson:", np.round(element,4), "| Kendall:", np.round(correlations_k[i],4))
    i += 1

print("\n\033[1mOverall:\033[0m Average Pearson:", np.round(sum(correlations_p)/len(correlations_p),4),
         "| Average Kendall:", np.round(sum(correlations_k)/len(correlations_k),4))

overall_results["BLEU Star Pearson"] = correlations_p
overall_results["BLEU Star Kendall"] = correlations_k

[1mCorrelation between z-score and BLEU score

[1m Russian into English: [0mPearson: 0.3337 | Kendall: 0.2284
[1m German into English: [0mPearson: 0.2987 | Kendall: 0.2104
[1m Czech into English: [0mPearson: 0.4252 | Kendall: 0.2886
[1m Chinese into English: [0mPearson: 0.3132 | Kendall: 0.2118
[1m English into Chinese: [0mPearson: 0.0256 | Kendall: 0.0046
[1m English into Finish: [0mPearson: 0.5084 | Kendall: 0.3383

[1mOverall:[0m Average Pearson: 0.3174 | Average Kendall: 0.2137


## BLEU Score - Part 2

### 1st Try (sentence_bleu)

In [9]:
from nltk.translate.bleu_score import sentence_bleu

correlations_p = []
correlations_k = []

co = 0
for element in [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]:
    
    bleu_scores = []

    #calculating the bleu scores for the translations in comparison to their respective reference
    for i in range(element.shape[0]): #element.shape[0]
        reference = [element.loc[i,"reference"].split()]
        translation = element.loc[i,"translation"].split()
        bleu_scores.append(sentence_bleu(reference, translation,weights=(0.25, 0.25, 0.25, 0.25)))

    #add the bleu scores to the dataframe
    development_df = element.copy() #element.shape[0]
    development_df["BLEU"] = bleu_scores
    correlations_p.append(development_df.corr(method="pearson").iloc[-1:,0].values[0])
    correlations_k.append(development_df.corr(method="kendall").iloc[-1:,0].values[0])
    
    if co == 0:
        ru_en_["BLEU_s"] = bleu_scores
    elif co == 1:
        de_en_["BLEU_s"] = bleu_scores
    elif co == 2:
        cs_en_["BLEU_s"] = bleu_scores
    elif co == 3:
        zh_en_["BLEU_s"] = bleu_scores
    elif co == 4:
        en_zh_["BLEU_s"] = bleu_scores
    elif co == 5:
        en_fi_["BLEU_s"] = bleu_scores

    co += 1
    
print("\033[1mCorrelation between z-score and BLEU score\n")
i = 0
for element in correlations_p:
    print("\033[1m", descriptions[i] + ":",  "\033[0mPearson:", np.round(element,4), "| Kendall:", np.round(correlations_k[i],4))
    i += 1

print("\n\033[1mOverall:\033[0m Average Pearson:", np.round(sum(correlations_p)/len(correlations_p),4),
         "| Average Kendall:", np.round(sum(correlations_k)/len(correlations_k),4))

overall_results["BLEU Sentence Pearson"] = correlations_p
overall_results["BLEU Sentence Kendall"] = correlations_k

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[1mCorrelation between z-score and BLEU score

[1m Russian into English: [0mPearson: 0.2541 | Kendall: 0.1899
[1m German into English: [0mPearson: 0.2419 | Kendall: 0.1769
[1m Czech into English: [0mPearson: 0.2924 | Kendall: 0.2256
[1m Chinese into English: [0mPearson: 0.2458 | Kendall: 0.1791
[1m English into Chinese: [0mPearson: 0.0141 | Kendall: 0.0045
[1m English into Finish: [0mPearson: 0.2623 | Kendall: 0.2896

[1mOverall:[0m Average Pearson: 0.2184 | Average Kendall: 0.1776


### 2nd Try (corpus_bleu)

In [10]:
from nltk.translate.bleu_score import corpus_bleu

correlations_p = []
correlations_k = []

co = 0
for element in [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]:
    
    bleu_scores = []

    #calculating the bleu scores for the translations in comparison to their respective reference
    for i in range(element.shape[0]): #element.shape[0]
        reference = [element.loc[i,"reference"].split()]
        translation = element.loc[i,"translation"].split()
        while len(reference) < len(translation):
            reference.append(" ")
        while len(reference) > len(translation):
            translation.append(" ")
        bleu_scores.append(corpus_bleu(reference, translation))

    #add the bleu scores to the dataframe
    development_df = element.copy() #element.shape[0]
    development_df["BLEU"] = bleu_scores
    correlations_p.append(development_df.corr(method="pearson").iloc[-1:,0].values[0])
    correlations_k.append(development_df.corr(method="kendall").iloc[-1:,0].values[0])
    
    if co == 0:
        ru_en_["BLEU_c"] = bleu_scores
    elif co == 1:
        de_en_["BLEU_c"] = bleu_scores
    elif co == 2:
        cs_en_["BLEU_c"] = bleu_scores
    elif co == 3:
        zh_en_["BLEU_c"] = bleu_scores
    elif co == 4:
        en_zh_["BLEU_c"] = bleu_scores
    elif co == 5:
        en_fi_["BLEU_c"] = bleu_scores
    co += 1
    i += 1


print("\033[1mCorrelation between z-score and BLEU score\n")
i = 0
for element in correlations_p:
    print("\033[1m", descriptions[i] + ":",  "\033[0mPearson", np.round(element,4), "| Kendall:", np.round(correlations_k[i],4))
    i += 1

print("\n\033[1mOverall:\033[0m Average Pearson:", np.round(sum(correlations_p)/len(correlations_p),4),
         "| Average Kendall:", np.round(sum(correlations_k)/len(correlations_k),4))

overall_results["BLEU Corpus Pearson"] = correlations_p
overall_results["BLEU Corpus Kendall"] = correlations_k

[1mCorrelation between z-score and BLEU score

[1m Russian into English: [0mPearson 0.0539 | Kendall: 0.0628
[1m German into English: [0mPearson 0.0065 | Kendall: 0.0394
[1m Czech into English: [0mPearson 0.0923 | Kendall: 0.0982
[1m Chinese into English: [0mPearson 0.0547 | Kendall: 0.0484
[1m English into Chinese: [0mPearson 0.4243 | Kendall: 0.2997
[1m English into Finish: [0mPearson 0.2355 | Kendall: 0.1723

[1mOverall:[0m Average Pearson: 0.1445 | Average Kendall: 0.1201


## ROUGE Score

### ROUGE 1

In [11]:
from rouge_score import rouge_scorer

results_p = pd.DataFrame()
results_k = pd.DataFrame()

correlations_p = []
correlations_k = []
j = 0
co = 0
for element in [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]:
    
    precisions = []
    recalls = []
    fmeasures = []

    #calculating the rouge scores for the translations in comparison to their respective reference
    for i in range(element.shape[0]): #element.shape[0]
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(element.loc[i,"reference"], element.loc[i,"translation"])
        precisions.append(scores["rouge1"].precision)
        recalls.append(scores["rouge1"].recall)
        fmeasures.append(scores["rouge1"].fmeasure)

    #add the rouge scores to the dataframe
    development_df = element.copy() #element.shape[0]
    development_df["ROUGE precision"] = precisions
    development_df["ROUGE recall"] = recalls
    development_df["ROUGE fmeasure"] = fmeasures
    correlations_p.append(development_df.corr(method="pearson").iloc[-1:,0].values[0])
    correlations_k.append(development_df.corr(method="kendall").iloc[-1:,0].values[0])
    col_name = descriptions[j]
    j += 1
    results_p[col_name] = pd.Series(development_df.corr(method="pearson").iloc[0,3:])
    results_k[col_name] = pd.Series(development_df.corr(method="kendall").iloc[0,3:])
    
    if co == 0:
        ru_en_["ROUGE1_precision"] = precisions
        ru_en_["ROUGE1_recall"] = recalls
        ru_en_["ROUGE1_fmeasure"] = fmeasures
    elif co == 1:
        de_en_["ROUGE1_precision"] = precisions
        de_en_["ROUGE1_recall"] = recalls
        de_en_["ROUGE1_fmeasure"] = fmeasures
    elif co == 2:
        cs_en_["ROUGE1_precision"] = precisions
        cs_en_["ROUGE1_recall"] = recalls
        cs_en_["ROUGE1_fmeasure"] = fmeasures
    elif co == 3:
        zh_en_["ROUGE1_precision"] = precisions
        zh_en_["ROUGE1_recall"] = recalls
        zh_en_["ROUGE1_fmeasure"] = fmeasures
    elif co == 4:
        en_zh_["ROUGE1_precision"] = precisions
        en_zh_["ROUGE1_recall"] = recalls
        en_zh_["ROUGE1_fmeasure"] = fmeasures
    elif co == 5:
        en_fi_["ROUGE1_precision"] = precisions
        en_fi_["ROUGE1_recall"] = recalls
        en_fi_["ROUGE1_fmeasure"] = fmeasures
    co += 1


print("\033[1mPearson Correlation between z-score and ROUGE measures (ROUGE 1) \n")


overall_results["ROUGE 1 Precision Pearson"] = results_p.iloc[0,:6].values.tolist()
overall_results["ROUGE 1 Recall Pearson"] = results_p.iloc[1,:6].values.tolist()
overall_results["ROUGE 1 Fmeasure Pearson"] = results_p.iloc[2,:6].values.tolist()

results_p["Average"] = results_p.T.mean()
results_p

[1mPearson Correlation between z-score and ROUGE measures (ROUGE 1) 



Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish,Average
ROUGE precision,0.344109,0.316624,0.456718,0.331985,0.074533,0.549302,0.345545
ROUGE recall,0.294337,0.294263,0.390546,0.277619,0.075061,0.51322,0.307508
ROUGE fmeasure,0.341309,0.326557,0.450511,0.328195,0.080697,0.54454,0.345302


In [12]:
print("\033[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE 1) \n")

overall_results["ROUGE 1 Precision Kendall"] = results_k.iloc[0,:6].values.tolist()
overall_results["ROUGE 1 Recall Kendall"] = results_k.iloc[1,:6].values.tolist()
overall_results["ROUGE 1 Fmeasure Kendall"] = results_k.iloc[2,:6].values.tolist()

results_k["Average"] = results_k.T.mean()
results_k

[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE 1) 



Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish,Average
ROUGE precision,0.233533,0.219705,0.307255,0.220957,0.050288,0.363198,0.232489
ROUGE recall,0.200764,0.203551,0.260756,0.181468,0.050626,0.332239,0.204901
ROUGE fmeasure,0.233055,0.225082,0.302951,0.216623,0.054556,0.354801,0.231178


### ROUGE 2

In [13]:
from rouge_score import rouge_scorer

results_p = pd.DataFrame()
results_k = pd.DataFrame()

correlations_p = []
correlations_k = []
j = 0
co = 0
for element in [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]:
    
    precisions = []
    recalls = []
    fmeasures = []

    #calculating the rouge scores for the translations in comparison to their respective reference
    for i in range(element.shape[0]): #element.shape[0]
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(element.loc[i,"reference"], element.loc[i,"translation"])
        precisions.append(scores["rouge2"].precision)
        recalls.append(scores["rouge2"].recall)
        fmeasures.append(scores["rouge2"].fmeasure)

    #add the rouge scores to the dataframe
    development_df = element.copy() #element.shape[0]
    development_df["ROUGE precision"] = precisions
    development_df["ROUGE recall"] = recalls
    development_df["ROUGE fmeasure"] = fmeasures
    correlations_p.append(development_df.corr(method="pearson").iloc[-1:,0].values[0])
    correlations_k.append(development_df.corr(method="kendall").iloc[-1:,0].values[0])
    col_name = descriptions[j]
    j += 1
    results_p[col_name] = pd.Series(development_df.corr(method="pearson").iloc[0,3:])
    results_k[col_name] = pd.Series(development_df.corr(method="kendall").iloc[0,3:])
    
    if co == 0:
        ru_en_["ROUGE2_precision"] = precisions
        ru_en_["ROUGE2_recall"] = recalls
        ru_en_["ROUGE2_fmeasure"] = fmeasures
    elif co == 1:
        de_en_["ROUGE2_precision"] = precisions
        de_en_["ROUGE2_recall"] = recalls
        de_en_["ROUGE2_fmeasure"] = fmeasures
    elif co == 2:
        cs_en_["ROUGE2_precision"] = precisions
        cs_en_["ROUGE2_recall"] = recalls
        cs_en_["ROUGE2_fmeasure"] = fmeasures
    elif co == 3:
        zh_en_["ROUGE2_precision"] = precisions
        zh_en_["ROUGE2_recall"] = recalls
        zh_en_["ROUGE2_fmeasure"] = fmeasures
    elif co == 4:
        en_zh_["ROUGE2_precision"] = precisions
        en_zh_["ROUGE2_recall"] = recalls
        en_zh_["ROUGE2_fmeasure"] = fmeasures
    elif co == 5:
        en_fi_["ROUGE2_precision"] = precisions
        en_fi_["ROUGE2_recall"] = recalls
        en_fi_["ROUGE2_fmeasure"] = fmeasures
    co += 1


print("\033[1mPearson Correlation between z-score and ROUGE measures (ROUGE 2) \n")

overall_results["ROUGE 2 Precision Pearson"] = results_p.iloc[0,:6].values.tolist()
overall_results["ROUGE 2 Recall Pearson"] = results_p.iloc[1,:6].values.tolist()
overall_results["ROUGE 2 Fmeasure Pearson"] = results_p.iloc[2,:6].values.tolist()

results_p["Average"] = results_p.T.mean()
results_p

[1mPearson Correlation between z-score and ROUGE measures (ROUGE 2) 



Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish,Average
ROUGE precision,0.326187,0.301237,0.412951,0.308727,0.088139,0.46835,0.317599
ROUGE recall,0.301894,0.288012,0.380923,0.285887,0.092338,0.444354,0.298901
ROUGE fmeasure,0.320052,0.301592,0.405594,0.304461,0.095062,0.461915,0.314779


In [14]:
print("\033[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE 2) \n")

overall_results["ROUGE 2 Precision Kendall"] = results_k.iloc[0,:6].values.tolist()
overall_results["ROUGE 2 Recall Kendall"] = results_k.iloc[1,:6].values.tolist()
overall_results["ROUGE 2 Fmeasure Kendall"] = results_k.iloc[2,:6].values.tolist()

results_k["Average"] = results_k.T.mean()
results_k

[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE 2) 



Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish,Average
ROUGE precision,0.229159,0.213632,0.286637,0.208257,0.067758,0.320441,0.220981
ROUGE recall,0.213027,0.20272,0.262969,0.192289,0.06884,0.304527,0.207395
ROUGE fmeasure,0.224302,0.211171,0.278814,0.203703,0.069508,0.314192,0.216948


### ROUGE L

In [15]:
from rouge_score import rouge_scorer

results_p = pd.DataFrame()
results_k = pd.DataFrame()

correlations_p = []
correlations_k = []
j = 0
co = 0
for element in [ru_en, de_en, cs_en, zh_en, en_zh, en_fi]:
    
    precisions = []
    recalls = []
    fmeasures = []

    #calculating the rouge scores for the translations in comparison to their respective reference
    for i in range(element.shape[0]): #element.shape[0]
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(element.loc[i,"reference"], element.loc[i,"translation"])
        precisions.append(scores["rougeL"].precision)
        recalls.append(scores["rougeL"].recall)
        fmeasures.append(scores["rougeL"].fmeasure)

    #add the rouge scores to the dataframe
    development_df = element.copy() #element.shape[0]
    development_df["ROUGE precision"] = precisions
    development_df["ROUGE recall"] = recalls
    development_df["ROUGE fmeasure"] = fmeasures
    correlations_p.append(development_df.corr(method="pearson").iloc[-1:,0].values[0])
    correlations_k.append(development_df.corr(method="kendall").iloc[-1:,0].values[0])
    col_name = descriptions[j]
    j += 1
    results_p[col_name] = pd.Series(development_df.corr(method="pearson").iloc[0,3:])
    results_k[col_name] = pd.Series(development_df.corr(method="kendall").iloc[0,3:])
    
    if co == 0:
        ru_en_["ROUGEL_precision"] = precisions
        ru_en_["ROUGEL_recall"] = recalls
        ru_en_["ROUGEL_fmeasure"] = fmeasures
    elif co == 1:
        de_en_["ROUGEL_precision"] = precisions
        de_en_["ROUGEL_recall"] = recalls
        de_en_["ROUGEL_fmeasure"] = fmeasures
    elif co == 2:
        cs_en_["ROUGEL_precision"] = precisions
        cs_en_["ROUGEL_recall"] = recalls
        cs_en_["ROUGEL_fmeasure"] = fmeasures
    elif co == 3:
        zh_en_["ROUGEL_precision"] = precisions
        zh_en_["ROUGEL_recall"] = recalls
        zh_en_["ROUGEL_fmeasure"] = fmeasures
    elif co == 4:
        en_zh_["ROUGEL_precision"] = precisions
        en_zh_["ROUGEL_recall"] = recalls
        en_zh_["ROUGEL_fmeasure"] = fmeasures
    elif co == 5:
        en_fi_["ROUGEL_precision"] = precisions
        en_fi_["ROUGEL_recall"] = recalls
        en_fi_["ROUGEL_fmeasure"] = fmeasures
    co += 1
    


print("\033[1mPearson Correlation between z-score and ROUGE measures (ROUGE L) \n")

overall_results["ROUGE L Precision Pearson"] = results_p.iloc[0,:6].values.tolist()
overall_results["ROUGE L Recall Pearson"] = results_p.iloc[1,:6].values.tolist()
overall_results["ROUGE L Fmeasure Pearson"] = results_p.iloc[2,:6].values.tolist()

results_p["Average"] = results_p.T.mean()
results_p

[1mPearson Correlation between z-score and ROUGE measures (ROUGE L) 



Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish,Average
ROUGE precision,0.355554,0.321503,0.470087,0.347083,0.076326,0.540008,0.35176
ROUGE recall,0.315,0.300898,0.411121,0.305689,0.077659,0.505371,0.31929
ROUGE fmeasure,0.352986,0.327532,0.461447,0.344989,0.082974,0.535137,0.350844


In [16]:
print("\033[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE L) \n")

overall_results["ROUGE L Precision Kendall"] = results_k.iloc[0,:6].values.tolist()
overall_results["ROUGE L Recall Kendall"] = results_k.iloc[1,:6].values.tolist()
overall_results["ROUGE L Fmeasure Kendall"] = results_k.iloc[2,:6].values.tolist()

results_k["Average"] = results_k.T.mean()
results_k

[1mKendall Tau Correlation between z-score and ROUGE measures (ROUGE L) 



Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish,Average
ROUGE precision,0.247499,0.228066,0.318498,0.235286,0.050533,0.3557,0.239264
ROUGE recall,0.221562,0.212349,0.278319,0.20733,0.051499,0.326671,0.216288
ROUGE fmeasure,0.246683,0.22988,0.311631,0.231499,0.054923,0.347729,0.237058


## RESULTS TABLE

In [17]:
pearsons = []
kendalls = []

for element in list(overall_results.keys()):
    if element.endswith("Pearson"):
        pearsons.append(element)
    elif element.endswith("Kendall"):
        kendalls.append(element)
        
dict_pearson = { your_key: overall_results[your_key] for your_key in pearsons }
dict_kendall = { your_key: overall_results[your_key] for your_key in kendalls }

pearson_df = pd.DataFrame(list(dict_pearson.values()), index=list(dict_pearson.keys()), columns=descriptions)
pearson_df

Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish
BLEU Star Pearson,0.333658,0.298661,0.425182,0.313196,0.02559,0.508383
BLEU Sentence Pearson,0.254081,0.241926,0.292354,0.245847,0.014064,0.262296
BLEU Corpus Pearson,0.053877,0.006482,0.09234,0.054715,0.424316,0.235517
ROUGE 1 Precision Pearson,0.344109,0.316624,0.456718,0.331985,0.074533,0.549302
ROUGE 1 Recall Pearson,0.294337,0.294263,0.390546,0.277619,0.075061,0.51322
ROUGE 1 Fmeasure Pearson,0.341309,0.326557,0.450511,0.328195,0.080697,0.54454
ROUGE 2 Precision Pearson,0.326187,0.301237,0.412951,0.308727,0.088139,0.46835
ROUGE 2 Recall Pearson,0.301894,0.288012,0.380923,0.285887,0.092338,0.444354
ROUGE 2 Fmeasure Pearson,0.320052,0.301592,0.405594,0.304461,0.095062,0.461915
ROUGE L Precision Pearson,0.355554,0.321503,0.470087,0.347083,0.076326,0.540008


In [18]:
pearson_evaluation = pd.DataFrame(pearson_df.idxmax(), columns=["Metric with highest correlation"])
pearson_evaluation["Value"] = pearson_df.max()
pearson_evaluation

Unnamed: 0,Metric with highest correlation,Value
Russian into English,ROUGE L Precision Pearson,0.355554
German into English,ROUGE L Fmeasure Pearson,0.327532
Czech into English,ROUGE L Precision Pearson,0.470087
Chinese into English,ROUGE L Precision Pearson,0.347083
English into Chinese,BLEU Corpus Pearson,0.424316
English into Finish,ROUGE 1 Precision Pearson,0.549302


In [19]:
kendall_df = pd.DataFrame(list(dict_kendall.values()), index=list(dict_kendall.keys()), columns=descriptions)
kendall_df

Unnamed: 0,Russian into English,German into English,Czech into English,Chinese into English,English into Chinese,English into Finish
BLEU Star Kendall,0.228402,0.210405,0.28862,0.21182,0.004606,0.338281
BLEU Sentence Kendall,0.189865,0.176866,0.225613,0.1791,0.004496,0.289645
BLEU Corpus Kendall,0.062795,0.039423,0.098247,0.048396,0.299679,0.172336
ROUGE 1 Precision Kendall,0.233533,0.219705,0.307255,0.220957,0.050288,0.363198
ROUGE 1 Recall Kendall,0.200764,0.203551,0.260756,0.181468,0.050626,0.332239
ROUGE 1 Fmeasure Kendall,0.233055,0.225082,0.302951,0.216623,0.054556,0.354801
ROUGE 2 Precision Kendall,0.229159,0.213632,0.286637,0.208257,0.067758,0.320441
ROUGE 2 Recall Kendall,0.213027,0.20272,0.262969,0.192289,0.06884,0.304527
ROUGE 2 Fmeasure Kendall,0.224302,0.211171,0.278814,0.203703,0.069508,0.314192
ROUGE L Precision Kendall,0.247499,0.228066,0.318498,0.235286,0.050533,0.3557


In [20]:
kendall_evaluation = pd.DataFrame(kendall_df.idxmax(), columns=["Metric with highest correlation"])
kendall_evaluation["Value"] = kendall_df.max()
kendall_evaluation

Unnamed: 0,Metric with highest correlation,Value
Russian into English,ROUGE L Precision Kendall,0.247499
German into English,ROUGE L Fmeasure Kendall,0.22988
Czech into English,ROUGE L Precision Kendall,0.318498
Chinese into English,ROUGE L Precision Kendall,0.235286
English into Chinese,BLEU Corpus Kendall,0.299679
English into Finish,ROUGE 1 Precision Kendall,0.363198


## COMBINATION - Predicting the scores for the testset with the best respective metric

In [21]:
#importing the data

path_corpus_test = "/Users/franz/Downloads/testset/"

ru_en_test = pd.read_csv(path_corpus_test + "ru-en/scores.csv")
de_en_test = pd.read_csv(path_corpus_test + "de-en/scores.csv")
cs_en_test = pd.read_csv(path_corpus_test + "cs-en/scores.csv")
zh_en_test = pd.read_csv(path_corpus_test + "zh-en/scores.csv")
en_zh_test = pd.read_csv(path_corpus_test + "en-zh/scores.csv")
en_fi_test = pd.read_csv(path_corpus_test + "en-fi/scores.csv")

Throughout all the test set, there is only one field containing a nan, which is ru_en_test.iloc[9191,1]. As the metric score cannot be computed without a reference, this row has to be deleted!

In [22]:
i = 0 
j = 0
for element in ru_en_test.isna()["reference"].tolist():
    if element == True:
        j = i
    i +=1
    
pd.DataFrame(ru_en_test.iloc[j,:]).T


Unnamed: 0,source,reference,translation
9191,Кот-тяжеловес по кличке Мистер Красавчик нашел...,,A heavyweight cat that goes by the nickname of...


In [23]:
ru_en_test = ru_en_test.dropna().reset_index(drop=True)

In [24]:
from langdetect import detect

for element in [ru_en_test, de_en_test, cs_en_test, zh_en_test, en_zh_test, en_fi_test]:
    
    # detect the language pair contained in the dataframe
    pair = detect(element.iloc[0,0]) + "_" + detect(element.iloc[0,1])
    
    predicted_scores = []
    
    if pair in ["ru_en", "cs_en", "de_en", "zh-cn_en", "en_fi"]:
        for i in range(element.shape[0]):
            scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
            scores = scorer.score(element.loc[i,"reference"], element.loc[i,"translation"])
            if pair in ["ru_en", "cs_en"]:
                predicted_scores.append(scores["rougeL"].precision)
            elif pair == "de_en":
                predicted_scores.append(scores["rougeL"].fmeasure)
            elif pair in ["zh-cn_en", "en_fi"]:
                predicted_scores.append(scores["rouge1"].precision)
                
    elif pair == "en_zh-cn":
        for i in range(element.shape[0]):
            reference = [element.loc[i,"reference"].split()]
            translation = element.loc[i,"translation"].split()
            while len(reference) < len(translation):
                reference.append(" ")
            while len(reference) > len(translation):
                translation.append(" ")
            predicted_scores.append(corpus_bleu(reference, translation))
        
            
    element["predicted_score"] = predicted_scores

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [25]:
for element in [ru_en_test, de_en_test, cs_en_test, zh_en_test, en_zh_test, en_fi_test]:
    display(element.head())

Unnamed: 0,source,reference,translation,predicted_score
0,Через полчаса обуглившийся клубень достают и п...,"After half an hour, the charred tuber is taken...","After half-an-hour, the charred tuber is retri...",0.8
1,"Здесь никто не думает отменять смертную казнь,...","Here, no one thinks to abolish the death penal...","Here, no one is concerned with abolishing the ...",0.8
2,"Собеседники ""Известий"" в ОНФ отмечают, что док...","The interlocutors of"" Izvestiya ""in the onf no...",Izvestia’s sources in the ONF note that the re...,0.625
3,На древней Венере могли существовать океаны.,On the ancient Venus could exist in the oceans.,Oceans could have existed on ancient Venus.,0.428571
4,До этого момента убийства оставались лишь исто...,"Up to this point, the murders were just a stor...","Up until this point, the murders have remained...",0.5


Unnamed: 0,source,reference,translation,predicted_score
0,Das Publikum ist fast gleichmäßig zwischen Sch...,The audience is almost evenly split between bl...,The audience is almost evenly split between bl...,1.0
1,Du kannst ihre Energie durch den Bildschirm sp...,"You can feel their energy through the screen. """"","You can feel her energy through the screen.""",0.875
2,"Da die Adresse unbekannt ist, wird die Mithilf...","As the address is unknown, the help of the pop...","As the address is unknown, the assistance of t...",0.903226
3,"Arsenal-Manager Arsene Wenger, dessen Verein i...","Arsenal manager Arsene Wenger, whose club is o...","Arsenal manager Arsene Wenger, whose club is o...",1.0
4,Landwirtschaftsminister im Interview - Wie sch...,Agriculture Minister in the interview - How do...,Minister of Agriculture in interview – How do ...,0.733333


Unnamed: 0,source,reference,translation,predicted_score
0,"Památník, důstojné pietní místo, stojí vůlí dě...","The monument, a dignified piecemeal place, sta...","The memorial, a solemn place of commemoration,...",0.522727
1,Pracovník centra Čang Č-čung sdělil agentuře N...,Centre worker Zhang Zu-chung told the New Chin...,Centre worker Chang Chi-Chung told New China t...,0.625
2,Veterináři nicméně odeberou namátkové vzorky v...,"However, veterinarians take random samples of ...","However, veterinarians are taking samples of e...",0.59375
3,Uživatel @TheePharoah jí neustále retweetoval ...,User @ TheePharoah constantly retweeted her po...,A user with the handle @TheePharoah was being ...,0.384615
4,Lucii bylo tehdy pouhých 19 let a rozhodně net...,Lucia was only 19 at the time and certainly ha...,"At that time, Lucie was only 19 years old, and...",0.285714


Unnamed: 0,source,reference,translation,predicted_score
0,已经批准筹建的，暂停批准开业,"Where the preparation has been approved, the a...",Approval of opening on these establishments wi...,0.555556
1,王丰源在首发式发言中说，来美国前想找本书看看别人的经验，但他翻遍新华书店没找到关于留学美国中...,"In his opening speech, Mr. Wang said he wanted...",Wang Fengyuan spoke at the launch of his new b...,0.509804
2,“如果你不致力于创造透明文化，你会失去人才，”维特拉诺说道。,"""if you're not committed to creating a culture...","""If you're not committed to creating a culture...",0.789474
3,不过前提是多国联军先停止对也门的袭击。,"The premise, however, is that the coalition fo...","However, the premise is that the multinational...",0.8
4,“在此之前，我和前男友住在骑士桥的一个更大的房子里，”乔安妮说道。,"""before that, my ex and I lived in a bigger ho...","""Before this, I was living with my ex in Knigh...",0.647059


Unnamed: 0,source,reference,translation,predicted_score
0,The future and the destinies of the citizens o...,世界上每个国家公民的未来和命运日益联系在一起。,世界各国人民前途命运越来越紧密地联系在一起。,0.268354
1,"After all that hard work, the finished result ...",经过那么多的努力，最终的结果现在已经可以揭晓了。,经过这么艰辛的工作，最终的结果现在才得以公布。,0.33951
2,Author: researcher of Suning Institute of Fina...,作者：苏宁金融研究所研究员，财经专栏作家，财经评论员。,作者：苏宁金融研究院特约研究员，财经专栏作家，财经评论员。,0.833079
3,“The Great Wall” tells the story of a Chinese ...,《长城》讲述了古代一支中国精锐部队在世界著名的中国长城上与怪物桃蒂英勇作战的故事。,《长城》讲述了在古代，一支中国精英部队为保卫人类，在举世闻名的长城上与怪兽饕餮进行生死决战的故事。,0.365893
4,Our comrades from the Political Bureau should ...,政治局同志要学习历史，讲道理，不能混淆公、私利益，叫白黑，模糊义与利的界限，处理基于裙带关系...,中央政治局的同志都应该明史知理，不能颠倒了公私、混淆了是非、模糊了义利、放纵了亲情，要带头树...,0.150515


Unnamed: 0,source,reference,translation,predicted_score
0,One local resident who did not wish to be name...,"Eräs paikallinen asukas, joka ei halunnut nime...",Toisen nimettömänä pysyttelevän asukkaan mukaa...,0.25
1,"Still, she clings to a chant she's committed t...",Silti hän takertuu chant hän on sitoutunut mui...,"Silti hän luottaa edelleen iskulauseeseen, jon...",0.625
2,"I don't want to be asked, 'What were you doing...","En halua, että minulta kysytään: ""Mitä te teit...","En halua, että kenenkään tarvitsee kysyä minul...",0.363636
3,"""I wouldn't say it was a lie – that's a pretty...","""En sanoisi, että se oli valhe - se on aika ro...","En sanoisi, että se oli valhe, se on aika kova...",0.916667
4,Kari Kola took part in the opening ceremony of...,Kari Kola osallistui valon vuoden avajaisiin v...,Kari Kola oli mukana Valon teemavuoden avajais...,0.5


## LINEAR REGRESSION ON RESPECTIVE TOP METRICS

In [26]:
from sklearn import linear_model

In [27]:
path_corpus = "/Users/franz/Desktop/TM Project/corpus/"

ru_en = pd.read_csv(path_corpus + "ru-en/scores.csv")
de_en = pd.read_csv(path_corpus + "de-en/scores.csv")
cs_en = pd.read_csv(path_corpus + "cs-en/scores.csv")
zh_en = pd.read_csv(path_corpus + "zh-en/scores.csv")
en_zh = pd.read_csv(path_corpus + "en-zh/scores.csv")
en_fi = pd.read_csv(path_corpus + "en-fi/scores.csv")

In [28]:
#top 3 metrics by language pair
for i in range(6):
    display(pd.DataFrame(pearson_df.iloc[:,i].sort_values(ascending = False).head(5)))

Unnamed: 0,Russian into English
ROUGE L Precision Pearson,0.355554
ROUGE L Fmeasure Pearson,0.352986
ROUGE 1 Precision Pearson,0.344109
ROUGE 1 Fmeasure Pearson,0.341309
BLEU Star Pearson,0.333658


Unnamed: 0,German into English
ROUGE L Fmeasure Pearson,0.327532
ROUGE 1 Fmeasure Pearson,0.326557
ROUGE L Precision Pearson,0.321503
ROUGE 1 Precision Pearson,0.316624
ROUGE 2 Fmeasure Pearson,0.301592


Unnamed: 0,Czech into English
ROUGE L Precision Pearson,0.470087
ROUGE L Fmeasure Pearson,0.461447
ROUGE 1 Precision Pearson,0.456718
ROUGE 1 Fmeasure Pearson,0.450511
BLEU Star Pearson,0.425182


Unnamed: 0,Chinese into English
ROUGE L Precision Pearson,0.347083
ROUGE L Fmeasure Pearson,0.344989
ROUGE 1 Precision Pearson,0.331985
ROUGE 1 Fmeasure Pearson,0.328195
BLEU Star Pearson,0.313196


Unnamed: 0,English into Chinese
BLEU Corpus Pearson,0.424316
ROUGE 2 Fmeasure Pearson,0.095062
ROUGE 2 Recall Pearson,0.092338
ROUGE 2 Precision Pearson,0.088139
ROUGE L Fmeasure Pearson,0.082974


Unnamed: 0,English into Finish
ROUGE 1 Precision Pearson,0.549302
ROUGE 1 Fmeasure Pearson,0.54454
ROUGE L Precision Pearson,0.540008
ROUGE L Fmeasure Pearson,0.535137
ROUGE 1 Recall Pearson,0.51322


### Russian into English

In [29]:
l_precision = []
l_fmeasure = []
precision_1 = []
fmeasure_1 = []
bleu_star = []

for i in range(ru_en.shape[0]):
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
    scores = scorer.score(ru_en.loc[i,"reference"], ru_en.loc[i,"translation"])
    l_precision.append(scores["rougeL"].precision)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    precision_1.append(scores["rouge1"].precision)
    fmeasure_1.append(scores["rouge1"].fmeasure)

    reference = ru_en.loc[i,"reference"]
    translation = ru_en.loc[i,"translation"]
    bleu_star.append(BLEU_star_compact(reference, translation))
    
ru_en["l_precision"] = l_precision
ru_en["l_fmeasure"] = l_fmeasure
ru_en["precision_1"] = precision_1
ru_en["fmeasure_1"] = fmeasure_1
ru_en["bleu_star"] = bleu_star


In [30]:
ru_en_scores = ru_en[["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star", "z-score"]]

In [31]:
from scipy.stats import zscore

for element in ["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star"]:
    ru_en_scores[element] = zscore(ru_en_scores[element])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ru_en_scores[element] = zscore(ru_en_scores[element])


In [32]:
ru_en_train = ru_en_scores.iloc[:int(ru_en_scores.shape[0]*0.7),:]
ru_en_test = ru_en_scores.iloc[int(ru_en_scores.shape[0]*0.7):,:]

In [33]:
import numpy as np
from sklearn.linear_model import LinearRegression

X = ru_en_train.iloc[:,:-1]
Y = ru_en_train.iloc[:,-1]
X_test = ru_en_test.iloc[:,:-1]
Y_test = ru_en_test.iloc[:,-1]

model_ru_en = LinearRegression()
model_ru_en.fit(X,Y)

result = pd.DataFrame(Y_test)
result["regression_values"] = model_ru_en.predict(X_test)

In [34]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.362087
regression_values,0.362087,1.0


### German into English

In [35]:
l_fmeasure = []
fmeasure_1 = []
l_precision = []
l_recall = []
precision_1 = []


for i in range(de_en.shape[0]):
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
    scores = scorer.score(de_en.loc[i,"reference"], de_en.loc[i,"translation"])
    l_fmeasure.append(scores["rougeL"].fmeasure)
    fmeasure_1.append(scores["rouge1"].fmeasure)
    l_precision.append(scores["rougeL"].precision)
    l_recall.append(scores["rougeL"].recall)
    precision_1.append(scores["rouge1"].precision)
    
de_en["l_fmeasure"] = l_fmeasure
de_en["fmeasure_1"] = fmeasure_1
de_en["l_precision"] = l_precision
de_en["l_recall"] = l_recall
de_en["precision_1"] = precision_1

de_en_scores = de_en[["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1", "z-score"]]

for element in ["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1"]:
    de_en_scores[element] = zscore(de_en_scores[element])
    
de_en_train = de_en_scores.iloc[:int(de_en_scores.shape[0]*0.7),:]
de_en_test = de_en_scores.iloc[int(de_en_scores.shape[0]*0.7):,:]

X = de_en_train.iloc[:,:-1]
Y = de_en_train.iloc[:,-1]
X_test = de_en_test.iloc[:,:-1]
Y_test = de_en_test.iloc[:,-1]

model_de_en = LinearRegression()
model_de_en.fit(X,Y)

result = pd.DataFrame(Y_test)
result["regression_values"] = model_de_en.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  de_en_scores[element] = zscore(de_en_scores[element])


In [36]:
l_fmeasure = []
fmeasure_1 = []
l_precision = []
l_recall = []
precision_1 = []


for i in range(de_en.shape[0]):
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
    scores = scorer.score(de_en.loc[i,"reference"], de_en.loc[i,"translation"])
    l_fmeasure.append(scores["rougeL"].fmeasure)
    fmeasure_1.append(scores["rouge1"].fmeasure)
    l_precision.append(scores["rougeL"].precision)
    l_recall.append(scores["rougeL"].recall)
    precision_1.append(scores["rouge1"].precision)
    
de_en["l_fmeasure"] = l_fmeasure
de_en["fmeasure_1"] = fmeasure_1
de_en["l_precision"] = l_precision
de_en["l_recall"] = l_recall
de_en["precision_1"] = precision_1

de_en_scores = de_en[["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1", "z-score"]]

for element in ["l_fmeasure", "fmeasure_1", "l_precision", "l_recall", "precision_1"]:
    de_en_scores[element] = zscore(de_en_scores[element])
    
de_en_train = de_en_scores.iloc[:int(de_en_scores.shape[0]*0.7),:]
de_en_test = de_en_scores.iloc[int(de_en_scores.shape[0]*0.7):,:]

X = de_en_train.iloc[:,:-1]
Y = de_en_train.iloc[:,-1]
X_test = de_en_test.iloc[:,:-1]
Y_test = de_en_test.iloc[:,-1]

model_de_en = LinearRegression()
model_de_en.fit(X,Y)

result = pd.DataFrame(Y_test)
result["regression_values"] = model_de_en.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  de_en_scores[element] = zscore(de_en_scores[element])


In [37]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.324474
regression_values,0.324474,1.0


### Czech into English

In [38]:
l_precision = []
l_fmeasure = []
precision_1 = []
fmeasure_1 = []
bleu_star = []

for i in range(cs_en.shape[0]):
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
    scores = scorer.score(cs_en.loc[i,"reference"], cs_en.loc[i,"translation"])
    l_precision.append(scores["rougeL"].precision)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    precision_1.append(scores["rouge1"].precision)
    fmeasure_1.append(scores["rouge1"].fmeasure)

    reference = cs_en.loc[i,"reference"]
    translation = cs_en.loc[i,"translation"]
    bleu_star.append(BLEU_star_compact(reference, translation))
    
cs_en["l_precision"] = l_precision
cs_en["l_fmeasure"] = l_fmeasure
cs_en["precision_1"] = precision_1
cs_en["fmeasure_1"] = fmeasure_1
cs_en["bleu_star"] = bleu_star

cs_en_scores = cs_en[["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star", "z-score"]]

for element in ["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star"]:
    cs_en_scores[element] = zscore(cs_en_scores[element])
    
cs_en_train = cs_en_scores.iloc[:int(cs_en_scores.shape[0]*0.7),:]
cs_en_test = cs_en_scores.iloc[int(cs_en_scores.shape[0]*0.7):,:]

X = cs_en_train.iloc[:,:-1]
Y = cs_en_train.iloc[:,-1]
X_test = cs_en_test.iloc[:,:-1]
Y_test = cs_en_test.iloc[:,-1]
X_full = cs_en_scores.iloc[:,:-1]
Y_full = cs_en_scores.iloc[:,-1]

model_cs_en = LinearRegression()
model_cs_en.fit(X,Y)

result = pd.DataFrame(Y_test)
result["regression_values"] = model_cs_en.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs_en_scores[element] = zscore(cs_en_scores[element])


In [39]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.48009
regression_values,0.48009,1.0


### Chinese into English

In [40]:
l_precision = []
l_fmeasure = []
precision_1 = []
fmeasure_1 = []
bleu_star = []

for i in range(zh_en.shape[0]):
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
    scores = scorer.score(zh_en.loc[i,"reference"], zh_en.loc[i,"translation"])
    l_precision.append(scores["rougeL"].precision)
    l_fmeasure.append(scores["rougeL"].fmeasure)
    precision_1.append(scores["rouge1"].precision)
    fmeasure_1.append(scores["rouge1"].fmeasure)

    reference = zh_en.loc[i,"reference"]
    translation = zh_en.loc[i,"translation"]
    bleu_star.append(BLEU_star_compact(reference, translation))
    
zh_en["l_precision"] = l_precision
zh_en["l_fmeasure"] = l_fmeasure
zh_en["precision_1"] = precision_1
zh_en["fmeasure_1"] = fmeasure_1
zh_en["bleu_star"] = bleu_star

zh_en_scores = zh_en[["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star", "z-score"]]

for element in ["l_precision", "l_fmeasure", "precision_1", "fmeasure_1", "bleu_star"]:
    zh_en_scores[element] = zscore(zh_en_scores[element])
    
zh_en_train = zh_en_scores.iloc[:int(zh_en_scores.shape[0]*0.7),:]
zh_en_test = zh_en_scores.iloc[int(zh_en_scores.shape[0]*0.7):,:]

X = zh_en_train.iloc[:,:-1]
Y = zh_en_train.iloc[:,-1]
X_test = zh_en_test.iloc[:,:-1]
Y_test = zh_en_test.iloc[:,-1]

model_zh_en = LinearRegression()
model_zh_en.fit(X,Y)

result = pd.DataFrame(Y_test)
result["regression_values"] = model_zh_en.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zh_en_scores[element] = zscore(zh_en_scores[element])


In [41]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.353457
regression_values,0.353457,1.0


### English into Chinese 

As only the BLEU Corpus yields acceptable results, there's no need for a Regression for this language pair!

### English into Finish 

In [42]:

l_fmeasure = []
fmeasure_1 = []
l_precision = []
recall_1 = []
precision_1 = []


for i in range(en_fi.shape[0]):
    scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1'], use_stemmer=True)
    scores = scorer.score(en_fi.loc[i,"reference"], en_fi.loc[i,"translation"])
    l_fmeasure.append(scores["rougeL"].fmeasure)
    fmeasure_1.append(scores["rouge1"].fmeasure)
    l_precision.append(scores["rougeL"].precision)
    recall_1.append(scores["rouge1"].recall)
    precision_1.append(scores["rouge1"].precision)
    
en_fi["l_fmeasure"] = l_fmeasure
en_fi["fmeasure_1"] = fmeasure_1
en_fi["l_precision"] = l_precision
en_fi["recall_1"] = recall_1
en_fi["precision_1"] = precision_1

en_fi_scores = en_fi[["l_fmeasure", "fmeasure_1", "l_precision", "recall_1", "precision_1", "z-score"]]

for element in ["l_fmeasure", "fmeasure_1", "l_precision", "recall_1", "precision_1"]:
    en_fi_scores[element] = zscore(en_fi_scores[element])
    
en_fi_train = en_fi_scores.iloc[:int(en_fi_scores.shape[0]*0.7),:]
en_fi_test = en_fi_scores.iloc[int(en_fi_scores.shape[0]*0.7):,:]

X = en_fi_train.iloc[:,:-1]
Y = en_fi_train.iloc[:,-1]
X_test = en_fi_test.iloc[:,:-1]
Y_test = en_fi_test.iloc[:,-1]

model_en_fi = LinearRegression()
model_en_fi.fit(X,Y)

result = pd.DataFrame(Y_test)
result["regression_values"] = model_en_fi.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_fi_scores[element] = zscore(en_fi_scores[element])


In [43]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.458622
regression_values,0.458622,1.0


# REGRESSION ON TOP OF ALL LEXICAL METRICS

### RU EN


In [44]:
ru_en_ = ru_en_.drop(columns=["source", "reference", "translation", "avg-score", "annotators"])

In [45]:
for element in ['BLEU', 'BLEU_s', 'BLEU_c', 'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure', 'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure', 'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure']:
    ru_en_[element] = zscore(ru_en_[element])
    
ru_en__train = ru_en_.iloc[:int(ru_en_.shape[0]*0.7),:]
ru_en__test = ru_en_.iloc[int(ru_en_.shape[0]*0.7):,:]

X = ru_en__train.iloc[:,1:]
Y = ru_en__train.iloc[:,0]
X_test = ru_en__test.iloc[:,1:]
Y_test = ru_en__test.iloc[:,0] 
X_full = ru_en_.iloc[:,1:]
Y_full = ru_en_.iloc[:,0] 

model_ru_en_ = LinearRegression()
model_ru_en_.fit(X,Y)

final_model_ru_en = LinearRegression()
final_model_ru_en.fit(X_full, Y_full)
    
result = pd.DataFrame(Y_test)
result["regression_values"] = model_ru_en_.predict(X_test)

In [46]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.368027
regression_values,0.368027,1.0


### DE EN

In [47]:
de_en_ = de_en_.drop(columns=["source", "reference", "translation", "avg-score", "annotators"])

for element in ['BLEU', 'BLEU_s', 'BLEU_c', 'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure', 'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure', 'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure']:
    de_en_[element] = zscore(de_en_[element])
    
de_en__train = de_en_.iloc[:int(de_en_.shape[0]*0.7),:]
de_en__test = de_en_.iloc[int(de_en_.shape[0]*0.7):,:]

X = de_en__train.iloc[:,1:]
Y = de_en__train.iloc[:,0]
X_test = de_en__test.iloc[:,1:]
Y_test = de_en__test.iloc[:,0] 

model_de_en_ = LinearRegression()
model_de_en_.fit(X,Y)
    
result = pd.DataFrame(Y_test)
result["regression_values"] = model_de_en_.predict(X_test)

In [48]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.327479
regression_values,0.327479,1.0


### CS EN

In [49]:
cs_en_ = cs_en_.drop(columns=["source", "reference", "translation", "avg-score", "annotators"])

for element in ['BLEU', 'BLEU_s', 'BLEU_c', 'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure', 'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure', 'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure']:
    cs_en_[element] = zscore(cs_en_[element])
    
cs_en__train = cs_en_.iloc[:int(cs_en_.shape[0]*0.7),:]
cs_en__test = cs_en_.iloc[int(cs_en_.shape[0]*0.7):,:]

X = cs_en__train.iloc[:,1:]
Y = cs_en__train.iloc[:,0]
X_test = cs_en__test.iloc[:,1:]
Y_test = cs_en__test.iloc[:,0] 
X_full = cs_en_.iloc[:,1:]
Y_full = cs_en_.iloc[:,0] 

model_cs_en_ = LinearRegression()
model_cs_en_.fit(X,Y)

final_model_cs_en = LinearRegression()
final_model_cs_en.fit(X_full, Y_full)
    
result = pd.DataFrame(Y_test)
result["regression_values"] = model_cs_en_.predict(X_test)

In [50]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.482638
regression_values,0.482638,1.0


### ZH EN

In [51]:
zh_en_ = zh_en_.drop(columns=["source", "reference", "translation", "avg-score", "annotators"])

for element in ['BLEU', 'BLEU_s', 'BLEU_c', 'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure', 'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure', 'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure']:
    zh_en_[element] = zscore(zh_en_[element])
    
zh_en__train = zh_en_.iloc[:int(zh_en_.shape[0]*0.7),:]
zh_en__test = zh_en_.iloc[int(zh_en_.shape[0]*0.7):,:]

X = zh_en__train.iloc[:,1:]
Y = zh_en__train.iloc[:,0]
X_test = zh_en__test.iloc[:,1:]
Y_test = zh_en__test.iloc[:,0] 
X_full = zh_en_.iloc[:,1:]
Y_full = zh_en_.iloc[:,0] 

model_zh_en_ = LinearRegression()
model_zh_en_.fit(X,Y)

final_model_zh_en = LinearRegression()
final_model_zh_en.fit(X_full, Y_full)
    
result = pd.DataFrame(Y_test)
result["regression_values"] = model_zh_en_.predict(X_test)

In [52]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.360435
regression_values,0.360435,1.0


### EN ZH

In [53]:
en_zh_ = en_zh_.drop(columns=["source", "reference", "translation", "avg-score", "annotators"])

for element in ['BLEU', 'BLEU_s', 'BLEU_c', 'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure', 'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure', 'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure']:
    en_zh_[element] = zscore(en_zh_[element])
    
en_zh__train = en_zh_.iloc[:int(en_zh_.shape[0]*0.7),:]
en_zh__test = en_zh_.iloc[int(en_zh_.shape[0]*0.7):,:]

X = en_zh__train.iloc[:,1:]
Y = en_zh__train.iloc[:,0]
X_test = en_zh__test.iloc[:,1:]
Y_test = en_zh__test.iloc[:,0] 

model_en_zh_ = LinearRegression()
model_en_zh_.fit(X,Y)
    
result = pd.DataFrame(Y_test)
result["regression_values"] = model_en_zh_.predict(X_test)

In [54]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.41002
regression_values,0.41002,1.0


### EN FI

In [55]:
en_fi_ = en_fi_.drop(columns=["source", "reference", "translation", "avg-score", "annotators"])

for element in ['BLEU', 'BLEU_s', 'BLEU_c', 'ROUGE1_precision', 'ROUGE1_recall', 'ROUGE1_fmeasure', 'ROUGE2_precision', 'ROUGE2_recall', 'ROUGE2_fmeasure', 'ROUGEL_precision', 'ROUGEL_recall', 'ROUGEL_fmeasure']:
    en_fi_[element] = zscore(en_fi_[element])
    
en_fi__train = en_fi_.iloc[:int(en_fi_.shape[0]*0.7),:]
en_fi__test = en_fi_.iloc[int(en_fi_.shape[0]*0.7):,:]

X = en_fi__train.iloc[:,1:]
Y = en_fi__train.iloc[:,0]
X_test = en_fi__test.iloc[:,1:]
Y_test = en_fi__test.iloc[:,0] 

model_en_fi_ = LinearRegression()
model_en_fi_.fit(X,Y)

final_model_en_fi = LinearRegression()
final_model_en_fi.fit(X_full, Y_full)
    
result = pd.DataFrame(Y_test)
result["regression_values"] = model_en_fi_.predict(X_test)

In [56]:
result.corr()

Unnamed: 0,z-score,regression_values
z-score,1.0,0.474168
regression_values,0.474168,1.0


## THE OVERALL BEST CORRELATIONS PER LANGUAGE PAIR 

##### RU EN
* Regression with all 12 metrics included (0.368)

##### DE EN
* ROUGE L Fmeasure Pearson (0.328)

##### CS EN
* Regression with all 12 metrics included (0.483)

##### ZH EN
* Regression with all 12 metrics included (0.360)

##### EN ZH
* BLEU Corpus Pearson (0.424)

##### EN FI
* ROUGE 1 Precision Pearson (0.549)

# Final Metric Function

In [57]:
from langdetect import detect

#importing the data

path_corpus_test = "/Users/franz/Downloads/testset/"

ru_en_test = pd.read_csv(path_corpus_test + "ru-en/scores.csv")
de_en_test = pd.read_csv(path_corpus_test + "de-en/scores.csv")
cs_en_test = pd.read_csv(path_corpus_test + "cs-en/scores.csv")
zh_en_test = pd.read_csv(path_corpus_test + "zh-en/scores.csv")
en_zh_test = pd.read_csv(path_corpus_test + "en-zh/scores.csv")
en_fi_test = pd.read_csv(path_corpus_test + "en-fi/scores.csv")

In [58]:
#fill the nan
ru_en_test.iloc[9191,1] = "This is a dummy text, as the nan has to be filled!"

In [59]:
def metric(element):
    # detect the language pair contained in the dataframe
    pair = detect(element.iloc[0,0]) + "_" + detect(element.iloc[0,1])
    
    predicted_scores = []
    
    #check if pair in list, if so, all 12 metrics need to be computed to feed to the respective model
    if pair in ["ru_en", "cs_en", "zh-cn_en"]:
        
        all_12_df = pd.DataFrame()
        
        bleu_star_scores = []
        bleu_sentence_scores = []
        bleu_corpus_scores = []
        r_1_p = []
        r_1_r = []
        r_1_f = []
        r_2_p = []
        r_2_r = []
        r_2_f = []
        r_l_p = []
        r_l_r = []
        r_l_f = []

        
        for i in range(element.shape[0]):
            #calculating the bleu star scores for the translations in comparison to their respective reference
            reference = element.loc[i,"reference"]
            translation = element.loc[i,"translation"]
            bleu_star_scores.append(BLEU_star_compact(reference, translation))
            
            #calculating the bleu sentence scores for the translations in comparison to their respective reference
            reference = [element.loc[i,"reference"].split()]
            translation = element.loc[i,"translation"].split()
            bleu_sentence_scores.append(sentence_bleu(reference, translation,weights=(0.25, 0.25, 0.25, 0.25)))
            
            #calculating the bleu corpus scores for the translations in comparison to their respective reference
            reference = [element.loc[i,"reference"].split()]
            translation = element.loc[i,"translation"].split()
            while len(reference) < len(translation):
                reference.append(" ")
            while len(reference) > len(translation):
                translation.append(" ")
            bleu_corpus_scores.append(corpus_bleu(reference, translation))
        
            #calculating all the ROUGE scores for the translations in comparison to their respective reference
            scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1', 'rouge2'], use_stemmer=True)
            scores = scorer.score(element.loc[i,"reference"], element.loc[i,"translation"])
            
            r_1_p.append(scores["rouge1"].precision)
            r_1_r.append(scores["rouge1"].recall)
            r_1_f.append(scores["rouge1"].fmeasure)
            
            r_2_p.append(scores["rouge2"].precision)
            r_2_r.append(scores["rouge2"].recall)
            r_2_f.append(scores["rouge2"].fmeasure)
            
            r_l_p.append(scores["rougeL"].precision)
            r_l_r.append(scores["rougeL"].recall)
            r_l_f.append(scores["rougeL"].fmeasure)
            
        all_12_df["BLEU"] = bleu_star_scores
        all_12_df["BLEU_s"] = bleu_sentence_scores
        all_12_df["BLEU_c"] = bleu_corpus_scores
        all_12_df["ROUGE1_precision"] = r_1_p
        all_12_df["ROUGE1_recall"] = r_1_r
        all_12_df["ROUGE1_fmeasure"] = r_1_f
        all_12_df["ROUGE2_precision"] = r_2_p
        all_12_df["ROUGE2_recall"] = r_2_r
        all_12_df["ROUGE2_fmeasure"] = r_2_f
        all_12_df["ROUGEL_precision"] = r_l_p
        all_12_df["ROUGEL_recall"] = r_l_r
        all_12_df["ROUGEL_fmeasure"] = r_l_f
            
        if pair == "ru_en":
            return final_model_ru_en.predict(all_12_df)
        elif pair == "cs_en":
            return final_model_cs_en.predict(all_12_df)
        elif pair == "zh-cn_en":
            return final_model_zh_en.predict(all_12_df)
                
    elif pair in ["de_en", "en_fi"]:
        for i in range(element.shape[0]):
            scorer = rouge_scorer.RougeScorer(['rougeL', 'rouge1', 'rouge2'], use_stemmer=True)
            scores = scorer.score(element.loc[i,"reference"], element.loc[i,"translation"])
            if pair == "de_en":
                predicted_scores.append(scores["rougeL"].fmeasure)
            elif pair == "en_fi":
                predicted_scores.append(scores["rouge1"].precision)
                
    elif pair == "en_zh-cn":
        for i in range(element.shape[0]):
            reference = [element.loc[i,"reference"].split()]
            translation = element.loc[i,"translation"].split()
            while len(reference) < len(translation):
                reference.append(" ")
            while len(reference) > len(translation):
                translation.append(" ")
            predicted_scores.append(corpus_bleu(reference, translation))
        
    return predicted_scores

In [62]:
ru_en_test["scores"] = metric(ru_en_test)
de_en_test["scores"] = metric(de_en_test)
cs_en_test["scores"] = metric(cs_en_test)
zh_en_test["scores"] = metric(zh_en_test)
en_zh_test["scores"] = metric(en_zh_test)
en_fi_test["scores"] = metric(en_fi_test)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [63]:
ru_en_test.to_csv(path_corpus_test + "ru-en/scores_added.csv")
de_en_test.to_csv(path_corpus_test + "de-en/scores_added.csv")
cs_en_test.to_csv(path_corpus_test + "cs-en/scores_added.csv")
zh_en_test.to_csv(path_corpus_test + "zh-en/scores_added.csv")
en_zh_test.to_csv(path_corpus_test + "en-zh/scores_added.csv")
en_fi_test.to_csv(path_corpus_test + "en-fi/scores_added.csv")