# Text Mining Group Project
## EN-FI Corpus

##### Notebook for the production of the test CSV.

---

In [1]:
# Imports
import pandas as pd
import numpy as np
from collections import Counter
from rouge import Rouge
import string
from nltk.translate import chrf_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler

In [2]:
# Choose the best metric here.
# This is the name of a score column produced by the metric cells below;
# the export cell appends "_zscore" to it to pick the column that is written out.
best_metric = "chrf_b1_n110"

### Load Dataset

In [3]:
# Read the input CSV into df1, the frame every later cell starts from.
df1 = pd.read_csv("scores.csv")

In [4]:
# Column names, non-null counts and dtypes of the loaded frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8097 entries, 0 to 8096
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source       8097 non-null   object
 1   reference    8097 non-null   object
 2   translation  8097 non-null   object
dtypes: object(3)
memory usage: 189.9+ KB


In [5]:
# Preview the first rows of the loaded frame.
df1.head()

Unnamed: 0,source,reference,translation
0,One local resident who did not wish to be name...,"Eräs paikallinen asukas, joka ei halunnut nime...",Toisen nimettömänä pysyttelevän asukkaan mukaa...
1,"Still, she clings to a chant she's committed t...",Silti hän takertuu chant hän on sitoutunut mui...,"Silti hän luottaa edelleen iskulauseeseen, jon..."
2,"I don't want to be asked, 'What were you doing...","En halua, että minulta kysytään: ""Mitä te teit...","En halua, että kenenkään tarvitsee kysyä minul..."
3,"""I wouldn't say it was a lie – that's a pretty...","""En sanoisi, että se oli valhe - se on aika ro...","En sanoisi, että se oli valhe, se on aika kova..."
4,Kari Kola took part in the opening ceremony of...,Kari Kola osallistui valon vuoden avajaisiin v...,Kari Kola oli mukana Valon teemavuoden avajais...


---
### PreProcessing

In [6]:
# Report rows whose source / reference / translation is missing or at most
# two characters long. Rows are only reported here; nothing is dropped.
for column in ["source","reference","translation"]:
    print(column)
    lengths = df1[column].str.len()
    # A missing value has a NaN length and NaN <= 2 is False, so it needs its own check.
    is_bad = lengths.isna() | (lengths <= 2)
    # Positional row numbers (for .iloc), converted to plain Python ints for clean printing.
    bad_idx = np.flatnonzero(is_bad.to_numpy()).tolist()
    if bad_idx:
        print(df1.iloc[bad_idx])
    print(f"Bad idx: {bad_idx}")

source
Bad idx: []
reference
Bad idx: []
translation
Bad idx: []


In [7]:
# Create two dataframes: df_u keeps the text as loaded, df_p holds the text
# with punctuation removed and everything lowercased.
df_u = df1.copy()
df_p = df1.copy()

# Build the translation table once instead of once per row.
# string.punctuation only contains ASCII punctuation, so typographic
# characters such as "–" are left in the text.
punct_table = str.maketrans("","",string.punctuation)

for x in ["source","reference","translation"]:
    # Strip punctuation and lowercase in a single pass over the original column.
    df_p[x] = df1[x].map(lambda s: s.translate(punct_table).lower())

# Both variants, keyed by name, so the metric cells can loop over them.
df_dict = {"df_u":df_u,"df_p":df_p}

In [8]:
## Initialize a scaler for later.
# The metric cells call fit_transform on this one instance, re-fitting it for every score column.
scaler = StandardScaler()

---
### Metrics

--- 
#### BLEU metric

In [9]:
def my_first_BLEU(reference,translation):
    """
    Clipped unigram precision of a translation against a single reference.

    Both arguments are plain strings; each one is split into tokens with
    word_tokenize. Every distinct token of the translation is credited at
    most as many times as it occurs in the reference, and the credited
    total is divided by the number of tokens in the translation.

    Parameters
    ----------
    reference : str
        The reference text.
    translation : str
        The candidate translation being scored.

    Returns
    -------
    float
        A score between 0 and 1. Returns 0.0 when the translation contains
        no tokens, instead of dividing by zero.
    """
    # Token -> number of occurrences, for the candidate and for the reference.
    t_c = Counter(word_tokenize(translation))
    refs_c = Counter(word_tokenize(reference))

    # Total number of tokens in the translation.
    total = sum(t_c.values())
    if total == 0:
        # Nothing to score: an empty translation covers nothing.
        return 0.0

    # Clip each token's count by its count in the reference
    # (a Counter yields 0 for tokens the reference does not contain).
    covered = sum(min(count,refs_c[word]) for word,count in t_c.items())

    return covered / total


In [10]:
bleu_scores_list = ["basic bleu"]

for df in df_dict.values():
    for key in bleu_scores_list:
        score_col = key+" score"
        zscore_col = key+ "_zscore"
        # Score each row's translation against its reference.
        df[score_col] = df.apply(lambda row: my_first_BLEU(row["reference"],row["translation"]),axis=1)
        # Standardised version of the same column, fitted on this dataframe only.
        raw_scores = df[score_col].to_numpy().reshape(-1,1)
        df[zscore_col] = scaler.fit_transform(raw_scores).flatten()

---
Rouge metric as described in

https://towardsdatascience.com/the-ultimate-performance-metric-in-nlp-111df6c64460

And
https://pypi.org/project/rouge-metric/

In [11]:
# ROUGE scorer built with its default arguments; reused by the cells below.
rouge = Rouge()

In [12]:
# Example Cell: ROUGE for the first translation / reference pair only.
model_out = df1["translation"][0]
reference = df1["reference"][0]
# get_scores returns three values, the F1 score "f", precision "p" and recall "r",
# for each of rouge-1 (unigrams), rouge-2 (bigrams) and rouge-l (longest sequence).
rouge.get_scores(model_out,reference)

[{'rouge-1': {'f': 0.09999999505000023,
   'p': 0.1111111111111111,
   'r': 0.09090909090909091},
  'rouge-2': {'f': 0.05263157400277054,
   'p': 0.058823529411764705,
   'r': 0.047619047619047616},
  'rouge-l': {'f': 0.1025640975936886,
   'p': 0.1111111111111111,
   'r': 0.09523809523809523}}]

In [13]:
# Get scores.
# For every df considered;
for df in df_dict.values():
    # For the entire model, model_out and reference need to be lists of strings.
    model_out = df["translation"].to_list()
    reference = df["reference"].to_list()
    rouge_scores = rouge.get_scores(model_out,reference)
    # For each of the scores calculated, output a new column in the df with the f1 scores.
    for key in rouge_scores[0].keys():
        # Give the Series the frame's own index: column assignment aligns by label,
        # so a bare 0..n-1 Series would become NaN on any row with a different label.
        df[(key+" score")] = pd.Series([score[key]["f"] for score in rouge_scores],index=df.index)
        # Also add a z score column.
        df[(key+ "_zscore")] = scaler.fit_transform(df[(key+" score")].to_numpy().reshape(-1,1)).flatten()

---
#### chrF metric

Check the paper here: https://www.aclweb.org/anthology/W15-3049.pdf

The general formula for the CHRF score is:

`CHRFBeta = (1 + Beta**2) * ((chrP * chrR) / (Beta**2*chrP + chrR))`

where:
* chrP is the percentage of n-grams in the hypothesis which have a counterpart in the reference.
* chrR is the percentage of character n-grams in the reference which are also present in the hypothesis.
* Beta is a parameter which assigns beta times more importance to recall than to precision (if beta == 1, they have the same importance).

In [14]:
# sentence_chrf defaults: n-gram lengths from 1 to 6, beta of 3.
# chrf uses its own tokenization with whitespaces, so the raw strings are passed in.
# Note: this takes a few minutes to run.
min_len = [1,2]
max_len = [6,10]
beta = [1,3]

# Every (min_len, max_len, beta) combination, with beta varying fastest.
chrf_grid = [(lo,hi,b) for lo in min_len for hi in max_len for b in beta]

for df in df_dict.values():
    # Names of the raw chrf columns added to this dataframe.
    chrf_scores=[]
    for lo,hi,b in chrf_grid:
        append_str = f"chrf_b{b}_n{lo}{hi}"
        chrf_scores.append(append_str)
        # Character n-gram F-score of each row's translation against its reference.
        df[append_str] = df.apply(lambda row: chrf_score.sentence_chrf(row["reference"],row["translation"],min_len=lo,max_len=hi,beta=b),axis=1)
        # Standardised version of the same column.
        zscore_input = df[append_str].to_numpy().reshape(-1,1)
        df[(append_str+ "_zscore")] = scaler.fit_transform(zscore_input).flatten()

# Show the raw chrf columns of the preprocessed dataframe.
df_p.loc[:,chrf_scores]

Unnamed: 0,chrf_b1_n16,chrf_b3_n16,chrf_b1_n110,chrf_b3_n110,chrf_b1_n26,chrf_b3_n26,chrf_b1_n210,chrf_b3_n210
0,0.317356,0.294147,0.207913,0.192650,0.221429,0.205138,0.142460,0.131924
1,0.506854,0.481985,0.403381,0.383304,0.444014,0.422069,0.356973,0.339053
2,0.424447,0.455123,0.278711,0.298956,0.341532,0.366419,0.216454,0.232324
3,0.841195,0.824519,0.779825,0.763583,0.818958,0.802507,0.760652,0.744584
4,0.596405,0.685343,0.464651,0.535068,0.547381,0.629671,0.422775,0.487442
...,...,...,...,...,...,...,...,...
8092,0.474875,0.477859,0.371363,0.373721,0.409850,0.412441,0.323736,0.325806
8093,0.349700,0.352721,0.224179,0.226119,0.247299,0.249445,0.153343,0.154677
8094,0.730910,0.728619,0.635082,0.633066,0.693309,0.691125,0.603545,0.601619
8095,0.381664,0.399414,0.264957,0.277371,0.290908,0.304540,0.201570,0.211102


---
### Producing the testset CSV
Given the produced scores, choose the ones to apply to the CSV.

In [15]:
# We can use either the zscore or the non-standardized scores.
# Also note either df_u or df_p choice.
# As written: the z-score column of best_metric, taken from the unprocessed frame df_u.
df1["metric"]=df_u[best_metric+"_zscore"]

In [16]:
# Preview the frame with the new "metric" column.
df1.head()

Unnamed: 0,source,reference,translation,metric
0,One local resident who did not wish to be name...,"Eräs paikallinen asukas, joka ei halunnut nime...",Toisen nimettömänä pysyttelevän asukkaan mukaa...,-1.111693
1,"Still, she clings to a chant she's committed t...",Silti hän takertuu chant hän on sitoutunut mui...,"Silti hän luottaa edelleen iskulauseeseen, jon...",0.159216
2,"I don't want to be asked, 'What were you doing...","En halua, että minulta kysytään: ""Mitä te teit...","En halua, että kenenkään tarvitsee kysyä minul...",-0.623668
3,"""I wouldn't say it was a lie – that's a pretty...","""En sanoisi, että se oli valhe - se on aika ro...","En sanoisi, että se oli valhe, se on aika kova...",1.8078
4,Kari Kola took part in the opening ceremony of...,Kari Kola osallistui valon vuoden avajaisiin v...,Kari Kola oli mukana Valon teemavuoden avajais...,0.599592


In [17]:
# Write df1, including the "metric" column, to CSV without the index.
# NOTE(review): "scores.csv" is also the file this notebook reads its input from,
# so the input is overwritten by this cell — confirm that is intended.
df1.to_csv("scores.csv",index=False)