# Text Mining Group Project
## DE-EN Corpus

##### TOC for Implemented Metrics.

---

In [1]:
# Necessary Installs.
# !pip install rouge

In [2]:
# Imports
import pandas as pd
from collections import Counter
from rouge import Rouge
import string
from nltk.translate import chrf_score
from nltk.tokenize import word_tokenize

Load Dataset

In [3]:
df1 = pd.read_csv("scores.csv")

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21704 entries, 0 to 21703
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       21704 non-null  object 
 1   reference    21704 non-null  object 
 2   translation  21704 non-null  object 
 3   z-score      21704 non-null  float64
 4   avg-score    21704 non-null  float64
 5   annotators   21704 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 1017.5+ KB


In [5]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2


---
### PreProcessing

In [6]:
# Create two dataframes. One is unprocessed, the other is preprocessed to remove punctuation and be lowercased.
# (Future note: maybe also remove stopwords?)
df_u = df1.copy()
df_p = df1.copy()

for x in ["source","reference","translation"]:
    # lowercase.
    df_p[x] = df1[x].str.lower()
    # Remove punct.
    df_p[x] = df1[x].map(lambda s: s.translate(str.maketrans("","",string.punctuation))
                           .lower()
                          )
    
df_dict = {"df_u":df_u,"df_p":df_p}

In [7]:
list(df_dict.values())[1]

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,ihr zeitlupentempo maßen sie als sie vor spitz...,her timeless pace measures them when they equi...,their slow speed was measured by researchers o...,-0.345024,76.0,1
1,er sagte dass die bereiche ruhige treffpunkte ...,he said the areas offer quiet meeting points b...,he said the spaces provided calm meeting point...,0.903800,97.5,2
2,für die geschäftsleute an der b 27 ist es nur ...,for businessmen at the b 27 its only a small c...,this is only a small consolation for businesse...,0.700503,94.0,1
3,diese fähigkeit sei möglicherweise angeboren o...,this ability may be born or developed with gen...,this ability may be innate or may develop as t...,-1.256572,51.5,2
4,weil sie wassertemperaturen um die sechs grad ...,because they prefer water temperatures around ...,they generally only come to the surface in win...,0.293909,87.0,2
...,...,...,...,...,...,...
21699,lt cmdr patrick evans ein pressesprecher des p...,lt cmdr patrick evans a press officer at the p...,lt cmdr patrick evans a pentagon spokesman sai...,1.246459,100.0,1
21700,um ein beispiel zu geben wenn ich ihn etwas fr...,to give an example if i ask him something that...,to give an example if i ask him what happened ...,0.792878,98.0,1
21701,ein grund dafür dass nicht alle nachbarn das a...,one reason that not all neighbours view this a...,one reason for not all neighbours seeing this ...,0.597068,76.0,1
21702,der gewinn vor zinsen und steuern erhöhte sich...,profit before interest and tax increased from ...,profits before interest and taxes increased fr...,-0.305719,61.0,1


---
### Metrics

--- 
Bleu Metric

In [8]:
def my_first_BLEU(reference,translation):
    """
    Expects lists of strings for both reference and translation.
    Returns the score 
    """
    
    # Let word be every unique word in the translation.
    # Can be done by setting up a Counter object.
    t_c = Counter(word_tokenize(translation))
    words = sorted(t_c)
    
    refs_c = Counter(word_tokenize(reference))
    
    # Let Covered be the minimum amt of times a word appears in the reference, compared to R(w).
    # Let D(word) be how many times a unique word appears in the candidate translation.
    # Let R(word) be the largest numer of times the word appears in any one reference.
    
    covered = 0
    
    for word in words:
        covered += min(t_c[word],refs_c[word])

    

    # Let total be the number of words in translation.
    total = sum(t_c.values())
    
    BLEU_score = covered / total
    
    return BLEU_score


In [9]:
bleu_scores_list = ["basic bleu"]

for df in list(df_dict.values()):
    for key in bleu_scores_list:
        df[(key+" score")] = df.apply(lambda row: my_first_BLEU(row["reference"],row["translation"]),axis=1)

---
Rouge metric as described in

https://towardsdatascience.com/the-ultimate-performance-metric-in-nlp-111df6c64460

And
https://pypi.org/project/rouge-metric/

In [10]:
rouge = Rouge()

In [11]:
# Example Cell
model_out = df1["translation"][0]
reference = df1["reference"][0]
# The get scores method returns three metrics, F1 score, p precision and recall r.
# For each unigram,bigram and Longest sequence.
rouge.get_scores(model_out,reference)

[{'rouge-1': {'f': 0.2580645111342353, 'p': 0.25, 'r': 0.26666666666666666},
  'rouge-2': {'f': 0.20689654673008337, 'p': 0.2, 'r': 0.21428571428571427},
  'rouge-l': {'f': 0.2580645111342353, 'p': 0.25, 'r': 0.26666666666666666}}]

In [12]:
# Get scores.
# For every df considered;
for df in list(df_dict.values()):
    # For the entire model, model_out and reference need to be lists of strings.
    model_out = df["translation"].to_list()
    reference = df["reference"].to_list()
    rouge_scores = rouge.get_scores(model_out,reference)
    # For each of the scores calculated, output a new column in the df with the f1 scores.
    for key in rouge_scores[0].keys():
        df[(key+" score")] = pd.Series([score[key]["f"] for score in rouge_scores])

---
#### chrF metric

Check the paper here: https://www.aclweb.org/anthology/W15-3049.pdf

The general formula for the CHRF score is:

`CHRFBeta = (1 + Beta**2) * ((chrP * chrR) / (Beta**2*chrP + chrR))`

where:
* chrP is the percentage of n-grams in the hypothesis which have a counterpart in the reference.
* chrR is the percentage of character n-grams in the reference which are also present in the hypothesis.
* Beta is a parameter which assigns beta times more importance to recall than to precision (if beta == 1, they have the same importance).

In [13]:
# I was surprised, but this works exactly like it's intended. Makes a new column with the chrF score for each row of the df.
# The default n-gram values are min == 1, max == 6. 
# The default beta is 3.

# Moreover, it is worthwhile to mention chrf uses its own tokenization with whitespaces.
# All parameters to test chrf scores with. feel free to play around with this and test out different combinations.
# Note: this takes a few minutes to run.
min_len = [1,2]
max_len = [6,10]
beta = [1,3]

for df in list(df_dict.values()):
    chrf_scores=[]
    for min_l in min_len:
        for max_l in max_len:
            for b in beta:
                append_str = "chrf_b" + str(b) + "_n" + str(min_l) + str(max_l)
                chrf_scores.append(append_str)
                df[append_str] = df.apply(lambda row: chrf_score.sentence_chrf(row["reference"],row["translation"],min_len=min_l,max_len=max_l,beta=b),axis=1)

df_p.loc[:,chrf_scores]

Unnamed: 0,chrf_b1_n16,chrf_b3_n16,chrf_b1_n110,chrf_b3_n110,chrf_b1_n26,chrf_b3_n26,chrf_b1_n210,chrf_b3_n210
0,0.354196,0.344048,0.264071,0.256397,0.265999,0.258291,0.205058,0.199015
1,0.722251,0.736029,0.670436,0.683623,0.692265,0.705593,0.648020,0.660891
2,0.624240,0.642156,0.523872,0.539153,0.571527,0.588040,0.483435,0.497643
3,0.468525,0.534034,0.356754,0.407908,0.416143,0.475160,0.315234,0.361187
4,0.672770,0.670044,0.569104,0.566760,0.622249,0.619712,0.529518,0.527322
...,...,...,...,...,...,...,...,...
21699,0.737914,0.707710,0.665307,0.637799,0.700702,0.671923,0.636566,0.610150
21700,0.646419,0.646419,0.515826,0.515826,0.596655,0.596655,0.473670,0.473670
21701,0.637505,0.636003,0.513471,0.512251,0.581916,0.580539,0.468806,0.467688
21702,0.737063,0.788352,0.618948,0.662790,0.702465,0.751672,0.586603,0.628461


---
### Comparison of Applied Metrics
Because the numeric system used for all of these can be different, the best way to compare them is by checking the correlation with the annotator's scores.

In [14]:
# Initialize a dict to be transformed to a df later, for score comparison.
corr_list = ["pearson","kendall","spearman"]
scores_dict = {}

for df_name in df_dict.keys():
    for corr in corr_list:
        scores_dict[corr+"_"+df_name] = []
    
scores_index = []

In [15]:
# Thankfully, Pandas has a corr method.

# for each declared corr method, compute the corr between each computed metric and the avg-score column for each considered df.
for df_name in df_dict.keys():
    for corr in corr_list:
        for key in rouge_scores[0].keys():
            scores_dict[corr+"_"+df_name].append(df_dict[df_name].loc[:,(key+ " score")].corr(df_dict[df_name].loc[:,"avg-score"],method=corr))
        for chrf_score in set(chrf_scores):
            scores_dict[corr+"_"+df_name].append(df_dict[df_name].loc[:,chrf_score].corr(df_dict[df_name].loc[:,"avg-score"],method=corr))
        for bleu_score in bleu_scores_list:
            scores_dict[corr+"_"+df_name].append(df_dict[df_name].loc[:,(key+ " score")].corr(df_dict[df_name].loc[:,"avg-score"],method=corr))


# Build also a list that will be used to create the index for the scores dataframe.
scores_index.extend(list(rouge_scores[0].keys()))
scores_index.extend(chrf_scores)
scores_index.extend(bleu_scores_list)

In [16]:
scores_df = pd.DataFrame(scores_dict,index=scores_index)
scores_df

Unnamed: 0,pearson_df_u,kendall_df_u,spearman_df_u,pearson_df_p,kendall_df_p,spearman_df_p
rouge-1,0.269043,0.191316,0.279845,0.279074,0.198456,0.289822
rouge-2,0.254397,0.181336,0.264759,0.26538,0.18952,0.276403
rouge-l,0.276579,0.199476,0.291566,0.286976,0.206043,0.300573
chrf_b1_n16,0.289846,0.204569,0.298631,0.285805,0.202187,0.295058
chrf_b3_n16,0.302785,0.213614,0.311414,0.299859,0.211883,0.30878
chrf_b1_n110,0.280064,0.197065,0.287937,0.278051,0.195869,0.285981
chrf_b3_n110,0.286733,0.202177,0.295201,0.282249,0.199416,0.29109
chrf_b1_n26,0.289979,0.203781,0.297473,0.288823,0.203123,0.296289
chrf_b3_n26,0.291699,0.204655,0.298707,0.290286,0.203818,0.297261
chrf_b1_n210,0.279565,0.196433,0.287036,0.277226,0.194981,0.284703
