# Libraries

In [None]:
import pandas as pd
import spacy
import numpy as np
import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from sklearn.decomposition import PCA

import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet_ic
from nltk import ngrams
from nltk.metrics import jaccard_distance
from nltk.wsd import lesk
from nltk.corpus import sentiwordnet as swn
nltk.download('wordnet')
nltk.download('wordnet_ic')
nltk.download('sentiwordnet')
ic_brown = wordnet_ic.ic('ic-brown.dat')

nlp = spacy.load("en_core_web_sm")

# Features

## Basic Features

In [None]:
def ratio_of_elements(tokenized_sentence1, tokenized_sentence2) -> float:
    """
    Return the length ratio of two sequences, shorter over longer.

    Parameters:
    - tokenized_sentence1: The first sequence (anything that supports len()).
    - tokenized_sentence2: The second sequence (anything that supports len()).

    Returns:
    - A float in [0, 1] equal to min(len) / max(len). Two empty sequences have the same length, so 1.0 is returned instead of dividing by zero.
    """
    l1 = len(tokenized_sentence1)
    l2 = len(tokenized_sentence2)
    longest = max(l1, l2)
    if longest == 0:
        # Both inputs are empty: equal lengths, and no division by zero.
        return 1.0
    return min(l1, l2) / longest

In [None]:
def get_levenstein_ratio(sentence1: str, sentence2: str) -> float:
    """
    Score how alike two sentences are at the character level.

    The score is whatever `Levenshtein.ratio` returns for the two strings: a normalized similarity (not a raw edit distance). See the library documentation for the exact formula.

    Parameters:
    - sentence1: The first sentence, represented as a string.
    - sentence2: The second sentence, represented as a string.

    Returns:
    - A float with the Levenshtein ratio of the two sentences.
    """
    similarity = Levenshtein.ratio(sentence1, sentence2)
    return similarity

In [None]:
# Tokens
def get_jaccard_similarity(sentence1,sentence2) -> float:
    """
    Return the Jaccard similarity between two sentences.

    Parameters:
    - sentence1: The first sentence, as an iterable of hashable items (tokens or lemmas).
    - sentence2: The second sentence, as an iterable of hashable items (tokens or lemmas).

    Returns:
    - A float in [0, 1]: 1 minus the Jaccard distance of the two item sets. Two empty inputs are treated as identical and give 1.0.
    """
    items1 = set(sentence1)
    items2 = set(sentence2)
    if not items1 and not items2:
        # jaccard_distance divides by the size of the union, which is zero here.
        return 1.0
    return 1 - jaccard_distance(items1, items2)

## Semantic Features

In [None]:
# Maps the name of a similarity measure to a two-argument callable that
# applies the matching synset method to a pair of synsets.
similarity_methods = {
    "wu-palmer": lambda synset_a, synset_b: synset_a.wup_similarity(synset_b),
    "path": lambda synset_a, synset_b: synset_a.path_similarity(synset_b),
    "leacock": lambda synset_a, synset_b: synset_a.lch_similarity(synset_b),
    # Lin similarity additionally needs the information-content corpus.
    "lin": lambda synset_a, synset_b: synset_a.lin_similarity(synset_b, ic_brown),
}

In [None]:
def get_wordnet_pos(category):
    """
    Map a part-of-speech tag string to the single-letter WordNet POS.

    The decision is made on the first letter of the tag: 'J' -> 'a' (adjective), 'V' -> 'v' (verb), 'N' -> 'n' (noun) and 'R' -> 'r' (adverb). The tag 'RP' (particle) is excluded from the adverbs. Any other tag has no WordNet counterpart and yields None.
    """
    prefix_to_wordnet = (
        ('J', 'a'),  # Adjective
        ('V', 'v'),  # Verb
        ('N', 'n'),  # Noun
        ('R', 'r'),  # Adverb
    )
    for prefix, wordnet_tag in prefix_to_wordnet:
        if category.startswith(prefix):
            # 'RP' starts with 'R' but marks a particle, not an adverb.
            return None if category == 'RP' else wordnet_tag
    return None

In [None]:
# Module-level memo of already computed word-pair scores. It is never
# emptied by this code, so it grows with every new (pair, pos, measure).
cache = {}
def get_best_synset_pair(word1, word2, pos, similarity_type) -> float:
    """
    Get the best synset-pair similarity for two words.

    Every pair of synsets (one per word, restricted to the given part of speech) is scored with the requested measure and the highest score is kept. Results are memoized in the module-level `cache`.

    Parameters:
    - word1: The first word, an object exposing the word as `.text` (a Spacy token).
    - word2: The second word, an object exposing the word as `.text` (a Spacy token).
    - pos: A string indicating the WordNet part of speech of the words.
    - similarity_type: A string naming the similarity measure, a key of `similarity_methods`.

    Returns:
    - A float with the highest similarity found, or 0 when no pair yields a positive score.
    """
    # The score is treated as symmetric, so the two words are sorted in the key.
    # The part of speech is part of the key because the same word pair is
    # scored against different synsets under a different POS.
    first_text, second_text = sorted((word1.text, word2.text))
    cache_key = (first_text, second_text, pos, similarity_type)
    if cache_key in cache:
        return cache[cache_key]

    measure = similarity_methods[similarity_type]

    # We skip the satellite adjectives (gives problems for Lin and Leacock)
    synsets_word1 = [s for s in wordnet.synsets(word1.text, pos=pos) if s.pos() != 's']
    synsets_word2 = [s for s in wordnet.synsets(word2.text, pos=pos) if s.pos() != 's']

    max_sim = 0.0
    for synset1 in synsets_word1:
        for synset2 in synsets_word2:
            sim = measure(synset1, synset2)
            # A measure may return None when it cannot relate the two synsets.
            if sim and sim > max_sim:
                max_sim = sim

    cache[cache_key] = max_sim
    # If there is no similarity, we return 0
    return max_sim

In [None]:
def _tokens_with_wordnet_pos(sentence):
    """Pair each token that maps to a WordNet POS with that POS; other tokens are dropped."""
    tagged = []
    for token in sentence:
        pos = get_wordnet_pos(token.tag_)
        if pos:
            tagged.append((token, pos))
    return tagged


def get_sentence_similarities(sentence1, sentence2, similarity_type) -> float:
    """
    Calculate the similarity between two sentences using a specified similarity measure.

    For every word of a sentence that has a valid WordNet POS, the best similarity against the words of the other sentence with the same POS is taken (0 when there is none). These best values are averaged per sentence, normalizing by the number of tokens with a valid WordNet POS rather than the total number of words, and the result is the mean of the two sentence averages.

    Parameters:
    - sentence1: The first sentence, represented as a list of tokens.
    - sentence2: The second sentence, represented as a list of tokens.
    - similarity_type: A string indicating the type of similarity measure to use.
        Options include "wu-palmer", "path", "leacock", and "lin".

    Returns:
    - A float representing the average similarity score between the two sentences. It is 0.0 when either sentence has no token with a valid WordNet POS.
    """
    tagged1 = _tokens_with_wordnet_pos(sentence1)
    tagged2 = _tokens_with_wordnet_pos(sentence2)
    if not tagged1 or not tagged2:
        # No comparable pair exists, so both directional averages are 0.
        # This also avoids dividing by a zero token count.
        return 0.0

    # best1[i] / best2[j] hold the highest score seen so far for each token.
    best1 = [0] * len(tagged1)
    best2 = [0] * len(tagged2)
    for i, (token1, pos1) in enumerate(tagged1):
        for j, (token2, pos2) in enumerate(tagged2):
            if pos1 != pos2:
                continue
            sim = get_best_synset_pair(token1, token2, pos1, similarity_type)
            if sim > best1[i]:
                best1[i] = sim
            if sim > best2[j]:
                best2[j] = sim

    # We average the similarity (even if they don't get a similarity)
    similarity1 = sum(best1) / len(best1)
    similarity2 = sum(best2) / len(best2)

    return np.mean(np.array([similarity1, similarity2]))

In [None]:
def lesk_jaccard_similarity(sentence1,sentence2) -> float:
    """
    Returns the Lesk similarity between sentences

    Each token is disambiguated with the Lesk algorithm, using the words of its own sentence as context, and the two resulting sets of synsets are compared.

    Parameters:
    - sentence1: The first sentence, represented as a list of tokens.
    - sentence2: The second sentence, represented as a list of tokens.

    Returns:
    - A float representing the Jaccard similarity between the best synsets of the two sentences, according to the Lesk algorithm. It is 0 when either sentence yields no synset.
    """
    def disambiguate(sentence):
        # Lesk overlaps the context with the words of each gloss, so the
        # context has to be the token texts rather than the token objects.
        context = [token.text for token in sentence]
        senses = set()
        for token in sentence:
            synset = lesk(context, token.text, get_wordnet_pos(token.tag_))
            # Words without any synset give None; it is not a sense the two
            # sentences can share, so it is left out.
            if synset is not None:
                senses.add(synset)
        return senses

    synsets1 = disambiguate(sentence1)
    synsets2 = disambiguate(sentence2)
    if not synsets1 or not synsets2:
        return 0.0
    return 1 - jaccard_distance(synsets1, synsets2)

In [None]:
def sentiwordnet_difference(sentence1,sentence2,method='pos'):
    """
    Compare the SentiWordNet scores of two sentences.

    Each token is disambiguated with the Lesk algorithm, the selected score of every resulting synset is summed per sentence, and the absolute difference of the two sums is divided by the larger number of scored synsets.

    Parameters:
    - sentence1: The first sentence, represented as a list of tokens.
    - sentence2: The second sentence, represented as a list of tokens.
    - method: A string indicating the type of sentiment score to use.
        Options include "pos", "neg", and "obj".

    Returns:
    - A float with the normalized difference described above, or 0 when either sentence has no scored synset.

    Raises:
    - ValueError: if `method` is not one of the supported options.
    """
    score_getters = {'pos': 'pos_score', 'neg': 'neg_score', 'obj': 'obj_score'}
    if method not in score_getters:
        raise ValueError("Error: this method is not supported")

    def senti_synsets(sentence):
        # Lesk overlaps the context with the words of each gloss, so the
        # context has to be the token texts rather than the token objects.
        context = [token.text for token in sentence]
        found = []
        for token in sentence:
            synset = lesk(context, token.text, get_wordnet_pos(token.tag_))
            if synset is not None:
                found.append(swn.senti_synset(synset.name()))
        return found

    sentisynsets1 = senti_synsets(sentence1)
    sentisynsets2 = senti_synsets(sentence2)

    l1 = len(sentisynsets1)
    l2 = len(sentisynsets2)
    if l1 == 0 or l2 == 0:
        return 0

    getter = score_getters[method]
    sum1 = sum(getattr(senti_synset, getter)() for senti_synset in sentisynsets1)
    sum2 = sum(getattr(senti_synset, getter)() for senti_synset in sentisynsets2)
    return abs(sum1 - sum2) / max(l1, l2)

## N-Grams

### Word N-Grams Features

In [None]:
def get_ngram_jaccard_similarity(sentence1, sentence2, n):
    """
    Compare two sentences through the sets of their n-grams.

    Every n-gram is turned into its string form, the n-grams of each sentence are collected into a set, and the Jaccard similarity of the two sets is returned.

    Parameters:
    - sentence1: The first sentence, represented as a list of tokens.
    - sentence2: The second sentence, represented as a list of tokens.
    - n: The size of the n-grams to use.

    Returns:
    - The Jaccard similarity between the two n-gram sets, or 1 when neither sentence produces any n-gram.
    """
    first_grams = {str(gram) for gram in ngrams(sentence1, n)}
    second_grams = {str(gram) for gram in ngrams(sentence2, n)}
    if not first_grams and not second_grams:
        # Nothing to compare on either side.
        return 1
    distance = jaccard_distance(first_grams, second_grams)
    return 1 - distance

In [None]:
def get_ngram_vector_similarity(sentence1, sentence2, n):
    """
    Calculate the similarity between two sentences using n-gram counts.

    For each sentence, we count the number of times each n-gram appears. For every n-gram present in both sentences the smaller of the two counts is taken; the sum of these, multiplied by 2, is divided by the total number of n-grams in the two sentences.

    Parameters:
    - sentence1: The first sentence, represented as a list of tokens/lemmas
    - sentence2: The second sentence, represented as a list of tokens/lemmas
    - n: The size of the n-grams to use.

    Returns:
    - A float representing the similarity of the two sentences using n-grams, or 0 when either sentence is shorter than n.
    """
    if len(sentence1) < n or len(sentence2) < n:
        return 0

    c1 = Counter(str(gram) for gram in ngrams(sentence1, n))
    c2 = Counter(str(gram) for gram in ngrams(sentence2, n))

    # Counter intersection keeps, for each shared n-gram, the smaller count.
    coincidences = sum((c1 & c2).values())
    return 2*coincidences/(sum(c1.values())+sum(c2.values()))
    


### Character N-Gram Features

In [None]:
def get_character_ngram_jaccard_similarity(sentence1, sentence2, n):
    """
    Calculate the similarity between two sentences using character n-grams.

    The similarity is the Jaccard similarity between the sets of character n-grams of the two sentences.

    Parameters:
    - sentence1: The first sentence, represented as a string
    - sentence2: The second sentence, represented as a string.
    - n: The size of the n-grams to use.

    Returns:
    - A float representing the Jaccard similarity between the two sentences. When both sentences are shorter than n, neither has any n-gram and 1 is returned, the same convention as get_ngram_jaccard_similarity.
    """
    if len(sentence1) < n and len(sentence2) < n:
        # Both n-gram sets would be empty and the Jaccard distance undefined.
        return 1

    ngrams1 = {str(gram) for gram in ngrams(sentence1, n)}
    ngrams2 = {str(gram) for gram in ngrams(sentence2, n)}

    return 1-jaccard_distance(ngrams1,ngrams2)

In [None]:
def get_char_ngram_vector_cosine_similarity(sentence1: str, sentence2: str, n: int) -> float:
    """
    Cosine similarity of two sentences over their character n-gram counts.

    A `CountVectorizer` with the character analyzer is fitted on the two sentences, using n-grams of exactly size n, and the cosine similarity of the two resulting count vectors is returned.

    Parameters:
    - sentence1: The first sentence, represented as a string
    - sentence2: The second sentence, represented as a string.
    - n: The size of the character n-grams to use.

    Returns:
    - A float representing the cosine similarity between the two sentences.
    """
    # Lower and upper bound are both n, so only n-grams of that size are counted.
    char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
    count_matrix = char_vectorizer.fit_transform([sentence1, sentence2])

    # cosine_similarity returns a matrix; for two single rows it has one cell.
    pairwise = cosine_similarity(count_matrix[0], count_matrix[1])
    return pairwise[0][0]

# Function to get all of the features

In [None]:
def get_features(df):
    """
    Build the feature table for a frame of preprocessed sentence pairs.

    Parameters:
    - df: A DataFrame with, for each side "0" and "1" of a pair, the columns "<side>_lower", "<side>_nlp", "<side>_nlp_no_stop", "<side>_lemma" and "<side>_lemma_no_stop".

    Returns:
    - A DataFrame with one row per row of df (same index labels) and one column per feature.
    """
    features = pd.DataFrame()
    for rowindex, row in df.iterrows():
        # Length ratios. Both use ratio_of_elements, the length-ratio helper of this file.
        features.at[rowindex, 'token_ratio'] = ratio_of_elements(row['0_nlp'], row['1_nlp'])
        features.at[rowindex, 'char_ratio'] = ratio_of_elements(row["0_lower"], row["1_lower"])
        features.at[rowindex, "levenstein_ratio"] = get_levenstein_ratio(row["0_lower"], row["1_lower"])

        # Jaccard similarity over tokens and lemmas, with and without stop words.
        features.at[rowindex, 'jaccard_similarity_tokens'] = get_jaccard_similarity([token.text for token in row["0_nlp"]], [token.text for token in row["1_nlp"]])
        features.at[rowindex, 'jaccard_similarity_tokens_no_stops'] = get_jaccard_similarity([token.text for token in row["0_nlp_no_stop"]], [token.text for token in row["1_nlp_no_stop"]])
        features.at[rowindex, 'jaccard_similarity_lemmas'] = get_jaccard_similarity(row["0_lemma"], row["1_lemma"])
        features.at[rowindex, 'jaccard_similarity_lemmas_no_stops'] = get_jaccard_similarity(row["0_lemma_no_stop"], row["1_lemma_no_stop"])

        # WordNet-based similarities. The Lin similarity feature is currently disabled.
        features.at[rowindex, 'wu-palmer_similarity'] = get_sentence_similarities(row["0_nlp"], row["1_nlp"], "wu-palmer")
        features.at[rowindex, 'path_similarity'] = get_sentence_similarities(row["0_nlp"], row["1_nlp"], "path")
        features.at[rowindex, 'leacock_similarity'] = get_sentence_similarities(row["0_nlp"], row["1_nlp"], "leacock")
        features.at[rowindex, 'lesk_jaccard_similarity'] = lesk_jaccard_similarity(row["0_nlp"], row["1_nlp"])

        # N-gram count overlap. The inputs are the lowercased strings, so
        # these n-grams are made of characters.
        features.at[rowindex, 'unigram_vector_cosine_similarity'] = get_ngram_vector_similarity(row["0_lower"], row["1_lower"], 1)
        features.at[rowindex, 'bigram_vector_cosine_similarity'] = get_ngram_vector_similarity(row["0_lower"], row["1_lower"], 2)
        features.at[rowindex, 'trigram_vector_cosine_similarity'] = get_ngram_vector_similarity(row["0_lower"], row["1_lower"], 3)
        features.at[rowindex, 'quadgram_vector_cosine_similarity'] = get_ngram_vector_similarity(row["0_lower"], row["1_lower"], 4)

        # Jaccard similarity over token n-grams.
        features.at[rowindex, 'token_bigram_jaccard_similarity'] = get_ngram_jaccard_similarity(row["0_nlp"], row["1_nlp"], 2)
        features.at[rowindex, 'token_trigram_jaccard_similarity'] = get_ngram_jaccard_similarity(row["0_nlp"], row["1_nlp"], 3)
        features.at[rowindex, 'token_quadgram_jaccard_similarity'] = get_ngram_jaccard_similarity(row["0_nlp"], row["1_nlp"], 4)

        # Jaccard similarity over lemma n-grams.
        features.at[rowindex, 'lemma_bigram_jaccard_similarity'] = get_ngram_jaccard_similarity(row["0_lemma"], row["1_lemma"], 2)
        features.at[rowindex, 'lemma_trigram_jaccard_similarity'] = get_ngram_jaccard_similarity(row["0_lemma"], row["1_lemma"], 3)
        features.at[rowindex, 'lemma_quadgram_jaccard_similarity'] = get_ngram_jaccard_similarity(row["0_lemma"], row["1_lemma"], 4)

        # Jaccard similarity over character n-grams.
        features.at[rowindex, 'char_bigram_jaccard_similarity'] = get_character_ngram_jaccard_similarity(row["0_lower"], row["1_lower"], 2)
        features.at[rowindex, 'char_trigram_jaccard_similarity'] = get_character_ngram_jaccard_similarity(row["0_lower"], row["1_lower"], 3)
        features.at[rowindex, 'char_quadgram_jaccard_simmilarity'] = get_character_ngram_jaccard_similarity(row["0_lower"], row["1_lower"], 4)

        # Cosine similarity over character n-gram count vectors.
        features.at[rowindex, 'char_bigram_vector_cosine_similarity'] = get_char_ngram_vector_cosine_similarity(row["0_lower"], row["1_lower"], 2)
        features.at[rowindex, 'char_trigram_vector_cosine_similarity'] = get_char_ngram_vector_cosine_similarity(row["0_lower"], row["1_lower"], 3)
        features.at[rowindex, 'char_quadgram_vector_cosine_similarity'] = get_char_ngram_vector_cosine_similarity(row["0_lower"], row["1_lower"], 4)

        # SentiWordNet score differences. The 'obj' variant is not added
        # because it is directly dependent on the other two.
        features.at[rowindex, 'sentiwordnet_pos_difference'] = sentiwordnet_difference(row["0_nlp"], row["1_nlp"], 'pos')
        features.at[rowindex, 'sentiwordnet_neg_difference'] = sentiwordnet_difference(row["0_nlp"], row["1_nlp"], 'neg')
    return features

# Preprocessing

In [None]:
def preprocessing(sentences):
    """
    Add the derived columns that the feature functions read.

    The frame is modified in place and also returned. For each side of the pair (columns 0 and 1) the following columns are added, stage by stage: "<side>_lower", "<side>_nlp", "<side>_nlp_no_stop", "<side>_lemma" and "<side>_lemma_no_stop".
    """
    sides = (0, 1)

    def lowercase(text):
        # Lowercase character by character, leaving digits untouched.
        return ''.join([char if char.isdigit() else char.lower() for char in text])

    def without_punctuation(tokens):
        return [token for token in tokens if not token.is_punct]

    def without_stop_words(tokens):
        return [token for token in tokens if not token.is_stop]

    def lemmas(tokens):
        # NOTE(review): `token.lemma` is presumably spaCy's integer ID of the
        # lemma (`lemma_` being its text form) - confirm this is intended.
        return [token.lemma for token in tokens]

    # Lowercase
    for side in sides:
        sentences[f"{side}_lower"] = sentences[side].apply(lowercase)

    # Run the spaCy pipeline on the lowercased text
    for side in sides:
        sentences[f"{side}_nlp"] = sentences[f"{side}_lower"].apply(lambda text: nlp(text))
    # Keep only the tokens that are not punctuation
    for side in sides:
        sentences[f"{side}_nlp"] = sentences[f"{side}_nlp"].apply(without_punctuation)

    # Tokens without stop words
    for side in sides:
        sentences[f"{side}_nlp_no_stop"] = sentences[f"{side}_nlp"].apply(without_stop_words)

    # Lemmas of all tokens
    for side in sides:
        sentences[f"{side}_lemma"] = sentences[f"{side}_nlp"].apply(lemmas)
    # Lemmas of the tokens that are not stop words
    for side in sides:
        sentences[f"{side}_lemma_no_stop"] = sentences[f"{side}_nlp_no_stop"].apply(lemmas)
    return sentences

In [None]:
# Function to read and clean data, tracking bad line indices
def read_and_clean_data(file_path, encoding="utf-8"):
    """
    Read a tab-separated file of sentence pairs, skipping malformed lines.

    Parameters:
    - file_path: Path of the file to read.
    - encoding: Text encoding used to decode the file. It is passed explicitly so the result does not depend on the platform default.

    Returns:
    - A tuple (frame, bad_line_indices): `frame` is a DataFrame with columns 0 and 1 holding the two fields of every well-formed line, and `bad_line_indices` lists the zero-based line numbers that did not split into exactly two fields.
    """
    cleaned_lines = []
    bad_line_indices = []
    with open(file_path, 'r', encoding=encoding) as file:
        # Iterate the file directly instead of loading every line first.
        for i, line in enumerate(file):
            stripped = line.strip()
            fields = stripped.split('\t')
            if len(fields) == 2:
                cleaned_lines.append(fields)
            else:
                print(f"Skipping bad line: {stripped}")
                bad_line_indices.append(i)

    return pd.DataFrame(cleaned_lines, columns=[0, 1]), bad_line_indices

## Training

In [None]:
# Load the train data
sentences_file_paths = [
    "../Data/Train/train/STS.input.SMTeuroparl.txt",
    "../Data/Train/train/STS.input.MSRvid.txt",
    "../Data/Train/train/STS.input.MSRpar.txt",
    ]
gold_standard_file_paths = [
    "../Data/Train/train/STS.gs.SMTeuroparl.txt",
    "../Data/Train/train/STS.gs.MSRvid.txt",
    "../Data/Train/train/STS.gs.MSRpar.txt",
    ]

# Read and clean the data files into DataFrames, tracking bad line indices
sentences_list = []
bad_line_indices_list = []
for path in sentences_file_paths:
    df, bad_line_indices = read_and_clean_data(path)
    sentences_list.append(df)
    bad_line_indices_list.append(bad_line_indices)

sentences_training: pd.DataFrame = pd.concat(sentences_list, ignore_index=True)

gs_list = [pd.read_csv(path, header=None) for path in gold_standard_file_paths]
# Remove the rows that correspond to bad lines. The result is stored back
# into gs_list: DataFrame.drop returns a new frame, so only rebinding the
# loop variable would leave the gold standard misaligned with the sentences.
for i, gs in enumerate(gs_list):
    gs_list[i] = gs.drop(bad_line_indices_list[i])
gs_training: pd.DataFrame = pd.concat(gs_list, ignore_index=True)

# Preprocess the sentences
sentences_training = preprocessing(sentences_training)

In [None]:
# Compute the feature table for the training sentence pairs
# (takes about 4 minutes according to the original note).
features_training = get_features(sentences_training)

## Testing

In [None]:
# NOTE: despite its name, `grid_search` is a single RandomForestRegressor with
# fixed hyperparameters; no hyperparameter search is performed here.
grid_search = RandomForestRegressor(max_depth=14, max_features='log2', min_samples_leaf=1, min_samples_split=2, n_estimators=600,random_state=13)
# Flatten the gold standard into a 1-D array before using it as the target.
gs_training = np.ravel(gs_training)
grid_search.fit(features_training, gs_training)

In [None]:
# Dataset names: the test file paths without the directory/prefix and the extension.
# NOTE(review): sentences_file_paths_testing, features_list_testing and
# gs_list_testing are not defined in the visible part of the notebook;
# presumably they come from a cell that is not shown - confirm.
dataset_names = [path.replace('../Data/Test/test-gold/STS.input.','').replace('.txt','') for path in sentences_file_paths_testing]

pearson_correlations = []
# Testing the model on each dataset
for i, features in enumerate(features_list_testing):
    predictions = grid_search.predict(features)
    # pearsonr returns (correlation, p-value); only the correlation is kept.
    pearson_correlation = pearsonr(np.ravel(gs_list_testing[i]), predictions)[0]
    print(f"Testing pearson correlation on {dataset_names[i]}: {pearson_correlation}")
    pearson_correlations.append(pearson_correlation)

# Testing the model on all datasets: rows of every dataset are pooled
# before computing a single correlation.
predictions = grid_search.predict(pd.concat(features_list_testing, ignore_index=True))
pearson_correlation = pearsonr(np.ravel(pd.concat(gs_list_testing, ignore_index=True)), predictions)[0]
print(f"Testing pearson correlation on all datasets: {pearson_correlation}")

Testing pearson correlation on SMTeuroparl: 0.5734560411396111
Testing pearson correlation on MSRvid: 0.832564485571077
Testing pearson correlation on MSRpar: 0.6124410320005863
Testing pearson correlation on surprise.OnWN: 0.724718580699725
Testing pearson correlation on surprise.SMTnews: 0.5574499423843311
Testing pearson correlation on all datasets: 0.7263711560250121


# Experiment PCA + Random Forest

In [None]:
# Perform PCA on the training features
pca = PCA(n_components=0.99)  # Keep enough components to explain 99% of the variance
features_training_pca = pca.fit_transform(features_training)

# Train a Random Forest Regressor on the PCA-transformed data
# (same hyperparameters as the model trained on the raw features)
random_forest = RandomForestRegressor(n_estimators=600, max_depth=14, max_features='log2', min_samples_leaf=1, min_samples_split=2, random_state=13)
random_forest.fit(features_training_pca, gs_training)
# Print the explained variance ratio to understand how much variance is retained
print(f"Explained variance ratio by PCA components: {pca.explained_variance_ratio_}")
print(f"Number of components selected: {pca.n_components_}")


Explained variance ratio by PCA components: [0.51138427 0.15756158 0.12970455 0.06382349 0.04985604 0.02740873
 0.01613636 0.00887821 0.00783592 0.00496913 0.00422658 0.00416992
 0.00279223 0.00243762]
Number of components selected: 14


## Test

In [None]:
# Dataset names: the test file paths without the directory/prefix and the extension
dataset_names = [path.replace('../Data/Test/test-gold/STS.input.','').replace('.txt','') for path in sentences_file_paths_testing]

pearson_correlations = []
# Testing the model on each dataset
for i, features in enumerate(features_list_testing):
    # Project the test features with the PCA fitted on the training features
    features_pca = pca.transform(features)
    predictions = random_forest.predict(features_pca)
    # pearsonr returns (correlation, p-value); only the correlation is kept
    pearson_correlation = pearsonr(np.ravel(gs_list_testing[i]), predictions)[0]
    print(f"Testing pearson correlation on {dataset_names[i]}: {pearson_correlation}")
    pearson_correlations.append(pearson_correlation)

# Testing the model on all datasets: rows of every dataset are pooled
# before computing a single correlation
predictions = random_forest.predict(pca.transform(pd.concat(features_list_testing, ignore_index=True)))
pearson_correlation = pearsonr(np.ravel(pd.concat(gs_list_testing,ignore_index=True)), predictions)[0]
print(f"Testing pearson correlation on all datasets: {pearson_correlation}")

Testing pearson correlation on SMTeuroparl: 0.5224957854709418
Testing pearson correlation on MSRvid: 0.8260860292089873
Testing pearson correlation on MSRpar: 0.5832890181963831
Testing pearson correlation on surprise.OnWN: 0.6936375786320211
Testing pearson correlation on surprise.SMTnews: 0.5827772453424451
Testing pearson correlation on all datasets: 0.7331939172964225
