# Text Mining Group Project
## EN-ZH Corpus

##### TOC for Implemented Metrics.

---

In [1]:
# Necessary Installs.
#!pip install rouge

In [2]:
# Imports
import pandas as pd
import numpy as np
from collections import Counter
from rouge import Rouge
import string
from nltk.translate import chrf_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler

Load Dataset

In [3]:
df1 = pd.read_csv("scores.csv")

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10221 entries, 0 to 10220
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source       10221 non-null  object 
 1   reference    10221 non-null  object 
 2   translation  10221 non-null  object 
 3   z-score      10221 non-null  float64
 4   avg-score    10221 non-null  float64
 5   annotators   10221 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 479.2+ KB


In [5]:
df1.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"""In the GISS model's simulation, Venus' slow s...",GSIS的科学家AnthonyDelGenio在新闻稿中解释说：“在GISS模型的模拟模型中...,戈达德太空研究所科学家安东尼·德尔·杰尼奥在新闻发布会上解释说：“在戈达德太空研究所的模型模...,-1.171867,50.0,1
1,Ai Yanhan of China in the Women's 4 x 200m Fre...,中国在英国女性4x200mFreestreyWTE中的最后被称为：“中国14岁的孩子从球下降...,参加女子4x200米自由泳接力赛决赛的中国小将艾衍含被这样描述：“那名14岁的中国小姑娘犯了...,-2.255403,26.5,2
2,"Then came 2012, when nothing much went right f...",然后来到2012年，当她和她的队友们没有什么好处。,2012年，她和她的队友都不被看好。,-2.508996,21.0,1
3,"Since last year, Guodian Group has exported a ...",自去年以来，GoudianGroup从南非通过南非港口出口了163套风力发电项目。,自去年以来，国电集团共计有163套风电项目陆续从连云港港出口南非。,-2.41678,23.0,1
4,"Some alleged that the Kempinski hotel simply ""...","一些人指称，Kempinski旅馆只是""被捕""，以满足阿拉伯客户的要求。",有人认为凯宾斯基酒店简直是为了满足阿拉伯客户的要求而“卑躬屈膝”。,-1.489676,45.0,7


---
### PreProcessing

In [7]:
# Check for empty or sparse reference / translation, and drop them.
# In this case, strings may be very short, so let's just drop those that are empty.
for column in ["source","reference","translation"]:
    print(column)
    bad_idx = [idx for idx in np.where(df1[column].str.len()==0)[0]]
    print(f"Bad idx: {bad_idx}")
    df1 = df1.drop(index=bad_idx)

source
Bad idx: []
reference
Bad idx: []
translation
Bad idx: []


In [8]:
# Create two dataframes. One is unprocessed, the other is preprocessed to remove punctuation and be lowercased.
# (Future note: maybe also remove stopwords?)
df_u = df1.copy()
df_p = df1.copy()

for x in ["source","reference","translation"]:
    # lowercase.
    df_p[x] = df1[x].str.lower()
    # Remove punct.
    df_p[x] = df1[x].map(lambda s: s.translate(str.maketrans("","",string.punctuation))
                           .lower()
                          )
    
df_dict = {"df_u":df_u,"df_p":df_p}

In [9]:
## Initialize a scaler for later.
scaler = StandardScaler()

---
### Metrics

## FOR THE CHINESE CHARACTERS WE NEED A SPECIFIC ROUGE ZH, AS WELL AS A SPECIFIC TOKENIZER FOR BLEU.
## HOWEVER THIS IS UNLIKELY TO BE BETTER THAN CHRF FOR THIS DATASET DUE TO THE CHARACTER-BASED NATURE OF THE LANGUAGE.
### SO I SKIPPED IT FOR NOW.
#### HERES A LINK https://github.com/JialeGuo/py_rouge_zh

--- 
Bleu Metric

In [10]:
def my_first_BLEU(reference,translation):
    """
    Expects lists of strings for both reference and translation.
    Returns the score 
    """
    
    # Let word be every unique word in the translation.
    # Can be done by setting up a Counter object.
    t_c = Counter(word_tokenize(translation))
    words = sorted(t_c)
    
    refs_c = Counter(word_tokenize(reference))
    
    # Let Covered be the minimum amt of times a word appears in the reference, compared to R(w).
    # Let D(word) be how many times a unique word appears in the candidate translation.
    # Let R(word) be the largest numer of times the word appears in any one reference.
    
    covered = 0
    
    for word in words:
        covered += min(t_c[word],refs_c[word])

    

    # Let total be the number of words in translation.
    total = sum(t_c.values())
    
    BLEU_score = covered / total
    
    return BLEU_score


In [11]:
bleu_scores_list = ["basic bleu"]

for df in list(df_dict.values()):
    for key in bleu_scores_list:
        # Apply the function to get a column of the scores.
        df[(key+" score")] = df.apply(lambda row: my_first_BLEU(row["reference"],row["translation"]),axis=1)
        # Also add a z score column.
        df[(key+ "_zscore")] = scaler.fit_transform(df[(key+" score")].to_numpy().reshape(-1,1)).flatten()

---
Rouge metric as described in

https://towardsdatascience.com/the-ultimate-performance-metric-in-nlp-111df6c64460

And
https://pypi.org/project/rouge-metric/

In [12]:
rouge = Rouge()

In [13]:
# Example Cell
model_out = df1["translation"][0]
reference = df1["reference"][0]
# The get scores method returns three metrics, F1 score, p precision and recall r.
# For each unigram,bigram and Longest sequence.
rouge.get_scores(model_out,reference)

[{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
  'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
  'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}]

In [14]:
# Get scores.
# For every df considered;
for df in list(df_dict.values()):
    # For the entire model, model_out and reference need to be lists of strings.
    model_out = df["translation"].to_list()
    reference = df["reference"].to_list()
    rouge_scores = rouge.get_scores(model_out,reference)
    # For each of the scores calculated, output a new column in the df with the f1 scores.
    for key in rouge_scores[0].keys():
        df[(key+" score")] = pd.Series([score[key]["f"] for score in rouge_scores])
        # Also add a z score column.
        df[(key+ "_zscore")] = scaler.fit_transform(df[(key+" score")].to_numpy().reshape(-1,1)).flatten()

---
#### chrF metric

Check the paper here: https://www.aclweb.org/anthology/W15-3049.pdf

The general formula for the CHRF score is:

`CHRFBeta = (1 + Beta**2) * ((chrP * chrR) / (Beta**2*chrP + chrR))`

where:
* chrP is the percentage of n-grams in the hypothesis which have a counterpart in the reference.
* chrR is the percentage of character n-grams in the reference which are also present in the hypothesis.
* Beta is a parameter which assigns beta times more importance to recall than to precision (if beta == 1, they have the same importance).

In [15]:
# I was surprised, but this works exactly like it's intended. Makes a new column with the chrF score for each row of the df.
# The default n-gram values are min == 1, max == 6. 
# The default beta is 3.

# Moreover, it is worthwhile to mention chrf uses its own tokenization with whitespaces.
# All parameters to test chrf scores with. feel free to play around with this and test out different combinations.
# Note: this takes a few minutes to run.
min_len = [1,2]
max_len = [6,10]
beta = [1,3]

for df in list(df_dict.values()):
    chrf_scores=[]
    for min_l in min_len:
        for max_l in max_len:
            for b in beta:
                append_str = "chrf_b" + str(b) + "_n" + str(min_l) + str(max_l)
                chrf_scores.append(append_str)
                df[append_str] = df.apply(lambda row: chrf_score.sentence_chrf(row["reference"],row["translation"],min_len=min_l,max_len=max_l,beta=b),axis=1)
                # Also add a z score column.
                df[(append_str+ "_zscore")] = scaler.fit_transform(df[append_str].to_numpy().reshape(-1,1)).flatten()

df_p.loc[:,chrf_scores]

Unnamed: 0,chrf_b1_n16,chrf_b3_n16,chrf_b1_n110,chrf_b3_n110,chrf_b1_n26,chrf_b3_n26,chrf_b1_n210,chrf_b3_n210
0,0.169171,0.179917,0.101503,0.107950,1.118287e-01,1.190255e-01,6.212704e-02,6.612526e-02
1,0.134180,0.136271,0.080508,0.081762,7.799717e-02,7.923357e-02,4.333176e-02,4.401865e-02
2,0.370555,0.324404,0.222333,0.194642,3.144338e-01,2.740581e-01,1.746855e-01,1.522545e-01
3,0.237745,0.218189,0.142647,0.130914,1.717801e-01,1.573495e-01,9.543338e-02,8.741637e-02
4,0.254548,0.251340,0.190520,0.188031,2.278456e-01,2.249113e-01,1.685709e-01,1.663145e-01
...,...,...,...,...,...,...,...,...
10216,0.211474,0.246075,0.126885,0.147645,1.229998e-01,1.435039e-01,6.833322e-02,7.972438e-02
10217,0.427104,0.433504,0.295183,0.299671,3.660964e-01,3.716535e-01,2.466318e-01,2.504399e-01
10218,0.322200,0.389644,0.200981,0.243292,2.591676e-01,3.145388e-01,1.524939e-01,1.853058e-01
10219,0.015686,0.017873,0.009412,0.010724,1.000000e-16,1.000000e-16,1.000000e-16,1.000000e-16


---
### Comparison of Applied Metrics
Because the numeric system used for all of these can be different, the best way to compare them is by checking the correlation with the annotator's scores.

In [16]:
# Initialize a dict to be transformed to a df later, for score comparison.
corr_list = ["pearson","kendall"]
scores_dict = {}

for df_name in df_dict.keys():
    for corr in corr_list:
        scores_dict[corr+"_"+df_name] = []
    
scores_index = []

In [17]:
# Thankfully, Pandas has a corr method. Use it on standardized scores obtained previously.

# for each declared corr method, compute the corr between each computed metric and the avg-score column for each considered df.
for df_name in df_dict.keys():
    for corr in corr_list:
        for key in rouge_scores[0].keys():
            scores_dict[corr+"_"+df_name].append(df_dict[df_name].loc[:,(key+ "_zscore")].corr(df_dict[df_name].loc[:,"z-score"],method=corr))
        for chrf_score in set(chrf_scores):
            scores_dict[corr+"_"+df_name].append(df_dict[df_name].loc[:,(chrf_score+"_zscore")].corr(df_dict[df_name].loc[:,"z-score"],method=corr))
        for bleu_score in bleu_scores_list:
            scores_dict[corr+"_"+df_name].append(df_dict[df_name].loc[:,(key+ "_zscore")].corr(df_dict[df_name].loc[:,"z-score"],method=corr))


# Build also a list that will be used to create the index for the scores dataframe.
scores_index.extend(list(rouge_scores[0].keys()))
scores_index.extend(chrf_scores)
scores_index.extend(bleu_scores_list)

In [18]:
scores_df = pd.DataFrame(scores_dict,index=scores_index)
scores_df

Unnamed: 0,pearson_df_u,kendall_df_u,pearson_df_p,kendall_df_p
rouge-1,0.030981,0.017586,0.022735,-0.00053
rouge-2,0.020555,0.011268,0.014217,0.006591
rouge-l,0.031029,0.017586,0.022402,-0.000531
chrf_b1_n16,0.402421,0.296409,0.401513,0.296313
chrf_b3_n16,0.39319,0.303297,0.391742,0.303248
chrf_b1_n110,0.392289,0.289472,0.391224,0.28948
chrf_b3_n110,0.3817,0.293693,0.380012,0.293819
chrf_b1_n26,0.417394,0.299659,0.41671,0.299855
chrf_b3_n26,0.362991,0.28982,0.361257,0.289655
chrf_b1_n210,0.432033,0.310521,0.431516,0.310629


In [19]:
for column in scores_df.columns:
    print(f"Max in {column}:")
    print("{}, with {}".format(scores_df[column].argmax(),scores_df[column].max()))


Max in pearson_df_u:
9, with 0.4320332033424246
Max in kendall_df_u:
9, with 0.310521144780342
Max in pearson_df_p:
9, with 0.4315164961782167
Max in kendall_df_p:
9, with 0.31062865883067975
