In [70]:
import json
import re
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import pandas as pd
from nltk.corpus import wordnet
from nltk.corpus import stopwords




In [104]:
# Load the train/test split of each corpus. QQP and STS are read with every
# column as str; the MSR files are tab-separated despite the .csv extension.
qqp_train, qqp_test = (
    pd.read_csv(path, dtype=str) for path in ('qqp-train.csv', 'qqp-test.csv')
)

sts_train, sts_test = (
    pd.read_csv(path, dtype=str) for path in ('sts-train.csv', 'sts-test.csv')
)

msr_train, msr_test = (
    pd.read_csv(path, sep='\t') for path in ('msr-train.csv', 'msr-test.csv')
)


In [105]:
# Cast the label and both sentence columns of each QQP split to str so the
# string methods used during tokenisation can be applied to every row.
for _frame in (qqp_train, qqp_test):
    for _col in ('similar', 'Sentence1', 'Sentence2'):
        _frame[_col] = _frame[_col].astype(str)

# Stack train and test into a single frame for corpus-level statistics.
qqp = pd.concat([qqp_train, qqp_test])

#########################################################

# Same treatment as QQP: cast both sentence columns and the label of each STS
# split to str.
for _frame in (sts_train, sts_test):
    for _col in ('Sentence1', 'Sentence2', 'similar'):
        _frame[_col] = _frame[_col].astype(str)

# Stack train and test into a single frame for corpus-level statistics.
sts = pd.concat([sts_train, sts_test])

#########################################################

def _clean_msr_split(frame):
    """Normalise one MSR split to the Sentence1/Sentence2/similar schema.

    Steps, applied in order:
      1. Rename '#1 String', '#2 String' and 'Quality' to the column names
         used by the other corpora.
      2. Drop the 'Unnamed: 5' column when it is present. ``errors='ignore'``
         keeps this cell re-runnable after the column is already gone.
      3. Drop rows that contain any missing value.
      4. Drop rows where EITHER sentence still contains a tab character.
         NOTE(review): a tab inside a cell presumably means several
         tab-separated fields were merged during parsing - confirm against
         the raw files. The same check is applied to both columns of both
         splits so such a row cannot survive in one split only.

    Returns a new DataFrame; the input frame is not modified.
    """
    frame = frame.rename(columns={'#1 String': 'Sentence1', '#2 String': 'Sentence2', 'Quality': 'similar'})
    frame = frame.drop(columns='Unnamed: 5', errors='ignore')
    frame = frame.dropna()
    has_tab = (
        frame['Sentence1'].str.contains('\t', regex=False)
        | frame['Sentence2'].str.contains('\t', regex=False)
    )
    return frame[~has_tab]


msr_train = _clean_msr_split(msr_train)
msr_test = _clean_msr_split(msr_test)

# Stack train and test into a single frame for corpus-level statistics.
msr = pd.concat([msr_train, msr_test])

# Hand-picked stop words (articles, prepositions, conjunctions) that are
# removed from both sentences when beautiful_case is enabled.
stop_words = {
    "the", "a", "an",
    "of", "to", "in", "for", "with", "on", "at", "from", "by",
    "also", "as", "so", "and", "but",
}

In [92]:
def preprocessing(df, beautiful_case = True):
    """Measure how many words the two sentences of each row have in common.

    Every sentence is lower-cased and split on whitespace. When
    ``beautiful_case`` is true, every character that is neither a word
    character nor whitespace is removed first, and tokens found in the
    module-level ``stop_words`` set are discarded.

    Tokens are compared as multisets (repeated words count). For one pair:
      * common     = size of the multiset intersection
      * non-common = size of the multiset symmetric difference
      * ratio      = common / (common + non-common)

    A pair in which both sentences end up with no tokens has no defined
    ratio. It is printed for inspection and left out of the average instead
    of raising ZeroDivisionError.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide string columns 'Sentence1' and 'Sentence2'.
    beautiful_case : bool, default True
        Strip punctuation and stop words before comparing.

    Returns
    -------
    tuple
        ``(total_common_count, total_noncommon_count, mean_ratio)``.
        ``mean_ratio`` is 0.0 when no pair contributed a ratio (for example
        an empty frame).
    """
    total_common_count = 0
    total_noncommon_count = 0
    ratio_sum = 0.0
    n_pairs = 0

    for _, row in df.iterrows():
        S1 = row['Sentence1'].lower()
        S2 = row['Sentence2'].lower()

        if beautiful_case:
            S1 = re.sub(r'[^\w\s]', '', S1)
            S2 = re.sub(r'[^\w\s]', '', S2)
            S1_tokens = [word for word in S1.split() if word not in stop_words]
            S2_tokens = [word for word in S2.split() if word not in stop_words]
        else:
            S1_tokens = S1.split()
            S2_tokens = S2.split()

        counter1 = Counter(S1_tokens)
        counter2 = Counter(S2_tokens)

        # `&` keeps the minimum count per word, `-` keeps only positive
        # remainders, so these sums are the multiset intersection size and
        # the multiset symmetric-difference size.
        total_common = sum((counter1 & counter2).values())
        non_common = sum((counter1 - counter2).values()) + sum((counter2 - counter1).values())

        if total_common + non_common == 0:
            # Both sentences are empty after tokenisation: report and skip.
            print(S1, S2)
            continue

        total_common_count += total_common
        total_noncommon_count += non_common
        ratio_sum += total_common / (total_common + non_common)
        n_pairs += 1

    mean_ratio = ratio_sum / n_pairs if n_pairs else 0.0
    return total_common_count, total_noncommon_count, mean_ratio

In [93]:
def synonym_extractor(word):
    """Return the lemma names of every WordNet synset found for ``word``.

    Names are listed synset by synset in the order WordNet yields them, and
    a name that occurs in several synsets appears once per occurrence. The
    list is empty when WordNet has no synset for ``word``.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]

def synonym_counter(words1, words2):
    """Count synonym matches between two token lists, in both directions.

    For every word on one side, its WordNet lemma names are fetched with
    ``synonym_extractor`` and every token on the other side found among them
    adds one to the count. The scan is then repeated with the roles swapped,
    because "b is listed for a" does not imply "a is listed for b". A token
    pair that matches in both directions is therefore counted twice.

    Parameters
    ----------
    words1, words2 : sequence of str
        Token lists of the two sentences.

    Returns
    -------
    int
        Total number of directed matches; 0 when either side is empty.
    """
    # Nothing can match an empty side, so skip the WordNet lookups entirely.
    if len(words1) == 0 or len(words2) == 0:
        return 0

    count = 0
    for sources, targets in ((words1, words2), (words2, words1)):
        for word in sources:
            # A set gives constant-time membership tests; duplicate lemma
            # names do not affect whether a target token is present.
            synonyms = set(synonym_extractor(word))
            count += sum(1 for other in targets if other in synonyms)

    return count

def preprocessing_synonym_counter(df, beautiful_case = True):
    """Measure synonym overlap between the two sentences of each row.

    Every sentence is lower-cased and split on whitespace. When
    ``beautiful_case`` is true, every character that is neither a word
    character nor whitespace is removed first, and tokens found in the
    module-level ``stop_words`` set are discarded.

    For one pair, ``similar`` is the value returned by ``synonym_counter``
    and the pair's ratio is ``similar / (len(tokens1) + len(tokens2))``.
    ``synonym_counter`` counts matches in both directions, so ``similar``
    can exceed the token total; the ratio is not clamped.

    A pair in which both sentences end up with no tokens has no defined
    ratio and is skipped instead of raising ZeroDivisionError.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide string columns 'Sentence1' and 'Sentence2'.
    beautiful_case : bool, default True
        Strip punctuation and stop words before comparing.

    Returns
    -------
    tuple
        ``(total_similar_count, total_nonsimilar_count, mean_ratio)``.
        ``mean_ratio`` is 0.0 when no pair contributed a ratio (for example
        an empty frame).
    """
    total_similar_count = 0
    total_nonsimilar_count = 0
    ratio_sum = 0.0
    n_pairs = 0

    for _, row in df.iterrows():
        S1 = row['Sentence1'].lower()
        S2 = row['Sentence2'].lower()

        if beautiful_case:
            S1 = re.sub(r'[^\w\s]', '', S1)
            S2 = re.sub(r'[^\w\s]', '', S2)
            S1_tokens = [word for word in S1.split() if word not in stop_words]
            S2_tokens = [word for word in S2.split() if word not in stop_words]
        else:
            S1_tokens = S1.split()
            S2_tokens = S2.split()

        token_total = len(S1_tokens) + len(S2_tokens)
        if token_total == 0:
            # Nothing to compare and the ratio would be 0 / 0.
            continue

        # One call per pair: the WordNet lookups are the expensive part.
        similar_count = synonym_counter(S1_tokens, S2_tokens)

        total_similar_count += similar_count
        total_nonsimilar_count += token_total - similar_count
        ratio_sum += similar_count / token_total
        n_pairs += 1

    mean_ratio = ratio_sum / n_pairs if n_pairs else 0.0
    return total_similar_count, total_nonsimilar_count, mean_ratio

In [94]:
def get_sim_tfidf(s1, s2, threshold=0.5, model=None):
    """Binarised TF-IDF and SBERT similarity for index-aligned sentence pairs.

    ``s1[i]`` is compared with ``s2[i]``. Two representations are built over
    all sentences of both inputs together:
      * a TF-IDF matrix from a freshly fitted ``TfidfVectorizer``;
      * sentence embeddings from ``model.encode``.
    For each pair the cosine similarity is computed in both representations
    and mapped to 1 when it is at least ``threshold``, otherwise 0.

    Parameters
    ----------
    s1, s2 : iterable of str
        Sentences of equal length. They are copied into lists first, so
        pandas Series or arrays are concatenated rather than combined
        element-wise by ``+``.
    threshold : float, default 0.5
        Minimum cosine similarity for a pair to be labelled 1.
    model : object with an ``encode(list_of_str)`` method, optional
        Embedding model to reuse across calls. When omitted,
        ``SentenceTransformer('all-MiniLM-L6-v2')`` is loaded on each call.

    Returns
    -------
    tuple of (list of int, list of int)
        ``(similarities_tf, similarities_sbert)``, one 0/1 label per pair.
        Both lists are empty when the inputs are empty.

    Raises
    ------
    ValueError
        If ``s1`` and ``s2`` do not contain the same number of sentences.
    """
    s1 = list(s1)
    s2 = list(s2)
    n_pairs = len(s1)

    if n_pairs != len(s2):
        raise ValueError(
            f"s1 and s2 must have the same length, got {n_pairs} and {len(s2)}"
        )
    if n_pairs == 0:
        # The vectorizer cannot be fitted on an empty corpus.
        return [], []

    # Rows 0..n_pairs-1 hold s1, rows n_pairs..2*n_pairs-1 hold s2.
    all_sentences = s1 + s2

    tfidf_matrix = TfidfVectorizer().fit_transform(all_sentences)

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(all_sentences)

    similarities_tf = []
    similarities_sbert = []
    for i in range(n_pairs):
        j = n_pairs + i  # row of the sentence paired with row i

        sim_score_tf = cosine_similarity(tfidf_matrix[i], tfidf_matrix[j])[0][0]

        # cosine_similarity expects 2-D inputs, hence the reshape.
        sim_score_sbert = cosine_similarity(
            embeddings[i].reshape(1, -1),
            embeddings[j].reshape(1, -1),
        )[0][0]

        similarities_tf.append(1 if sim_score_tf >= threshold else 0)
        similarities_sbert.append(1 if sim_score_sbert >= threshold else 0)

    return similarities_tf, similarities_sbert

### QQP Section

In [95]:
# Mean per-pair ratios for QQP with the default cleaning
# (punctuation and stop words removed).
qqp_synonym_ratio = preprocessing_synonym_counter(qqp)[2]
print(f'Synonym Percentage for QQP: {qqp_synonym_ratio}')
qqp_common_ratio = preprocessing(qqp)[2]
print(f'Common Words Percentage for QQP: {qqp_common_ratio}')

Synonym Percentage for QQP: 0.35777842667844834
Common Words Percentage for QQP: 0.38048851593150695


### STS Section

In [96]:
# Mean per-pair ratios for STS with the default cleaning
# (punctuation and stop words removed).
sts_synonym_ratio = preprocessing_synonym_counter(sts)[2]
print(f'Synonym Percentage for STS: {sts_synonym_ratio}')
sts_common_ratio = preprocessing(sts)[2]
print(f'Common Words Percentage for STS: {sts_common_ratio}')

Synonym Percentage for STS: 0.3515383600414239
Common Words Percentage for STS: 0.3683107783596161


### MSR Section

In [108]:
# Mean per-pair ratios for MSR with the default cleaning
# (punctuation and stop words removed).
msr_synonym_ratio = preprocessing_synonym_counter(msr)[2]
print(f'Synonym Percentage for MSR: {msr_synonym_ratio}')
msr_common_ratio = preprocessing(msr)[2]
print(f'Common Words Percentage for MSR: {msr_common_ratio}')

Synonym Percentage for MSR: 0.42952693053548047
Common Words Percentage for MSR: 0.4839155134597848


## Without removing punctuation or stop words

### QQP Section

In [97]:
# Mean per-pair ratios for QQP on lower-cased, whitespace-split text only
# (beautiful_case=False keeps punctuation and stop words).
qqp_raw_synonym_ratio = preprocessing_synonym_counter(qqp, beautiful_case=False)[2]
print(f'Synonym Percentage for QQP: {qqp_raw_synonym_ratio}')
qqp_raw_common_ratio = preprocessing(qqp, beautiful_case=False)[2]
print(f'Common Words Percentage for QQP: {qqp_raw_common_ratio}')

Synonym Percentage for QQP: 0.26190876537563695
Common Words Percentage for QQP: 0.34165718184541033


### STS Section

In [98]:
# Mean per-pair ratios for STS on lower-cased, whitespace-split text only
# (beautiful_case=False keeps punctuation and stop words).
sts_raw_synonym_ratio = preprocessing_synonym_counter(sts, beautiful_case=False)[2]
print(f'Synonym Percentage for STS: {sts_raw_synonym_ratio}')
sts_raw_common_ratio = preprocessing(sts, beautiful_case=False)[2]
print(f'Common Words Percentage for STS: {sts_raw_common_ratio}')

Synonym Percentage for STS: 0.3590200797328076
Common Words Percentage for STS: 0.36522429674991985


### MSR Section

In [109]:
# Mean per-pair ratios for MSR on lower-cased, whitespace-split text only
# (beautiful_case=False keeps punctuation and stop words).
msr_raw_synonym_ratio = preprocessing_synonym_counter(msr, beautiful_case=False)[2]
print(f'Synonym Percentage for MSR: {msr_raw_synonym_ratio}')
msr_raw_common_ratio = preprocessing(msr, beautiful_case=False)[2]
print(f'Common Words Percentage for MSR: {msr_raw_common_ratio}')

Synonym Percentage for MSR: 0.32384588813020865
Common Words Percentage for MSR: 0.4418652557801848
