# Text Mining Group Project
## RU-EN Testset

##### Notebook for the production of the test CSV.

---

In [1]:
# Imports
import pandas as pd
import numpy as np
from collections import Counter
from rouge import Rouge
import string
from nltk.translate import chrf_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler

In [2]:
# Choose the best metric here.
best_metric = "chrf_b1_n16"

Load Dataset

In [3]:
df1 = pd.read_csv("scores.csv")

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13157 entries, 0 to 13156
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source       13157 non-null  object
 1   reference    13156 non-null  object
 2   translation  13157 non-null  object
dtypes: object(3)
memory usage: 308.5+ KB


In [6]:
df1.head()

Unnamed: 0,source,reference,translation
0,Через полчаса обуглившийся клубень достают и п...,"After half an hour, the charred tuber is taken...","After half-an-hour, the charred tuber is retri..."
1,"Здесь никто не думает отменять смертную казнь,...","Here, no one thinks to abolish the death penal...","Here, no one is concerned with abolishing the ..."
2,"Собеседники ""Известий"" в ОНФ отмечают, что док...","The interlocutors of"" Izvestiya ""in the onf no...",Izvestia’s sources in the ONF note that the re...
3,На древней Венере могли существовать океаны.,On the ancient Venus could exist in the oceans.,Oceans could have existed on ancient Venus.
4,До этого момента убийства оставались лишь исто...,"Up to this point, the murders were just a stor...","Up until this point, the murders have remained..."


---
### PreProcessing

In [7]:
# Check for empty or sparse reference / translation, and drop them.
for column in ["source","reference","translation"]:
    print(column)
    bad_idx = [idx for idx in np.where(df1[column].str.len()<=2)[0]]
    if bad_idx != []:
        print(df1.iloc[bad_idx])
    print(f"Bad idx: {bad_idx}")
#    df1 = df1.drop(index=bad_idx)

source
Bad idx: []
reference
     source reference translation
7193   Мда.         .        Yep.
Bad idx: [7193]
translation
Bad idx: []


In [8]:
# For this one we use only unprocessed.
df_u = df1.copy()

    
df_dict = {"df_u":df_u}

In [9]:
## Initialize a scaler for later.
scaler = StandardScaler()

---
### Metrics

--- 
Bleu Metric

In [9]:
def my_first_BLEU(reference,translation):
    """
    Expects lists of strings for both reference and translation.
    Returns the score 
    """
    
    # Let word be every unique word in the translation.
    # Can be done by setting up a Counter object.
    t_c = Counter(word_tokenize(translation))
    words = sorted(t_c)
    
    refs_c = Counter(word_tokenize(reference))
    
    # Let Covered be the minimum amt of times a word appears in the reference, compared to R(w).
    # Let D(word) be how many times a unique word appears in the candidate translation.
    # Let R(word) be the largest numer of times the word appears in any one reference.
    
    covered = 0
    
    for word in words:
        covered += min(t_c[word],refs_c[word])

    

    # Let total be the number of words in translation.
    total = sum(t_c.values())
    
    BLEU_score = covered / total
    
    return BLEU_score


In [10]:
bleu_scores_list = ["basic bleu"]

for df in list(df_dict.values()):
    for key in bleu_scores_list:
        # Apply the function to get a column of the scores.
        df[(key+" score")] = df.apply(lambda row: my_first_BLEU(row["reference"],row["translation"]),axis=1)
        # Also add a z score column.
        df[(key+ "_zscore")] = scaler.fit_transform(df[(key+" score")].to_numpy().reshape(-1,1)).flatten()

TypeError: expected string or bytes-like object

---
Rouge metric as described in

https://towardsdatascience.com/the-ultimate-performance-metric-in-nlp-111df6c64460

And
https://pypi.org/project/rouge-metric/

In [None]:
rouge = Rouge()

In [None]:
# Example Cell
model_out = df1["translation"][0]
reference = df1["reference"][0]
# The get scores method returns three metrics, F1 score, p precision and recall r.
# For each unigram,bigram and Longest sequence.
rouge.get_scores(model_out,reference)

In [None]:
# Get scores.
# For every df considered;
for df in list(df_dict.values()):
    # For the entire model, model_out and reference need to be lists of strings.
    model_out = df["translation"].to_list()
    reference = df["reference"].to_list()
    rouge_scores = rouge.get_scores(model_out,reference)
    # For each of the scores calculated, output a new column in the df with the f1 scores.
    for key in rouge_scores[0].keys():
        df[(key+" score")] = pd.Series([score[key]["f"] for score in rouge_scores])
        # Also add a z score column.
        df[(key+ "_zscore")] = scaler.fit_transform(df[(key+" score")].to_numpy().reshape(-1,1)).flatten()

---
#### chrF metric

Check the paper here: https://www.aclweb.org/anthology/W15-3049.pdf

The general formula for the CHRF score is:

`CHRFBeta = (1 + Beta**2) * ((chrP * chrR) / (Beta**2*chrP + chrR))`

where:
* chrP is the percentage of n-grams in the hypothesis which have a counterpart in the reference.
* chrR is the percentage of character n-grams in the reference which are also present in the hypothesis.
* Beta is a parameter which assigns beta times more importance to recall than to precision (if beta == 1, they have the same importance).

In [12]:
# The default n-gram values are min == 1, max == 6. 
# The default beta is 3.

# Moreover, it is worthwhile to mention chrf uses its own tokenization with whitespaces.
# Note: this takes a few minutes to run.
min_len = [1]
max_len = [6]
beta = [1]

for df in list(df_dict.values()):
    chrf_scores=[]
    for min_l in min_len:
        for max_l in max_len:
            for b in beta:
                append_str = "chrf_b" + str(b) + "_n" + str(min_l) + str(max_l)
                chrf_scores.append(append_str)
                df[append_str] = df.apply(lambda row: chrf_score.sentence_chrf(row["reference"],row["translation"],min_len=min_l,max_len=max_l,beta=b),axis=1)
                # Also add a z score column.
                df[(append_str+ "_zscore")] = scaler.fit_transform(df[append_str].to_numpy().reshape(-1,1)).flatten()

TypeError: can only join an iterable

---
### Producing the testset CSV
Given the produced scores, choose the ones to apply to the CSV.

In [None]:
# We can use either the zscore or the non-standardized scores. 
# Also note either df_u or df_p choice.
df1["metric"]=df_u[best_metric+"_zscore"]

In [None]:
df1.head()

In [None]:
df1.to_csv("scores.csv",index=False)