In [1]:
import json
import re
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import pandas as pd
from nltk.corpus import wordnet


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
qqp_train = pd.read_csv('qqp-train.tsv', sep='\t', dtype=str)
qqp_test = pd.read_csv('qqp-test.tsv', sep='\t', dtype=str)



In [31]:
qqp_train = qqp_train.rename(columns={'question1': 'Sentence1', 'question2': 'Sentence2', 'is_duplicate': 'similar'})
qqp_test = qqp_test.rename(columns={'question1': 'Sentence1', 'question2': 'Sentence2', 'is_duplicate': 'similar'})

In [32]:
qqp_train = qqp_train.groupby('similar', group_keys=False).apply(lambda x: x.sample(2500))
qqp_test = qqp_test.groupby('similar', group_keys=False).apply(lambda x: x.sample(500))

  qqp_train = qqp_train.groupby('similar', group_keys=False).apply(lambda x: x.sample(2500))
  qqp_test = qqp_test.groupby('similar', group_keys=False).apply(lambda x: x.sample(500))


In [33]:
qqp_train = qqp_train.sample(frac=1)
qqp_test = qqp_test.sample(frac=1)

In [34]:
qqp_train = qqp_train.drop_duplicates()
qqp_test = qqp_test.drop_duplicates()

In [35]:
qqp_train_test_merge = qqp_train.merge(qqp_test, how='inner', indicator=True)
qqp_test = qqp_test.drop(qqp_train_test_merge.index)

In [36]:
qqp_train['similar'] = qqp_train['similar'].astype(str)
qqp_test['similar'] = qqp_test['similar'].astype(str)

In [42]:
def preprocessing(df):
    total_common_count=0
    total_noncommon_count=0
    sentence1_array=[]
    sentence2_array=[]
    target_output=[]
    average = 0.0
    n = 0.0
    for i, row in df.iterrows():
        S1, S2 = row['Sentence1'], row['Sentence2']
        
        sentence1_array.append(S1)
        sentence2_array.append(S2)
        target_output.append(int(row['similar']))
        
        #punctuations = [char for char in S1 if char in string.punctuation]
        # print(punctuations)
        #punct.extend(punctuations)
        #S1 = re.sub()

        S1.lower()
        S2.lower()
        
        S1_tokens = S1.split()
        S2_tokens = S2.split()

        counter1 = Counter(S1_tokens)
        counter2 = Counter(S2_tokens)

        # Find common elements and their counts
        common_elements_count = (counter1 & counter2).items()
        unique_to_list1 = counter1 - counter2
        unique_to_list2 = counter2 - counter1

        # Combine the results to mimic the symmetric difference
        unique_elements = unique_to_list1 + unique_to_list2

        non_common = sum(unique_elements.values())
        total_noncommon_count += non_common
        # Calculate the total number of common elements
        total_common = sum(min(counter1[element], counter2[element]) for element in (counter1 & counter2))
        total_common_count += total_common

        average += total_common / (total_common + non_common)

        n+= 1

    return total_common_count, total_noncommon_count, average/n

In [43]:
def synonym_extractor(word):
    synonyms = []
    
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())

    return synonyms

def synonym_counter(sentence1, sentence2):
    words1 = sentence1.split()
    words2 = sentence2.split()

    count = 0

    for word1 in words1:
        synonyms = synonym_extractor(word1)
        for word2 in words2:
            if word2 in synonyms:
                count += 1
                
    # we do it again because the synonym relation is not symmetric
    for word2 in words2:
        synonyms = synonym_extractor(word2)
        for word1 in words1:
            if word1 in synonyms:
                count += 1

    return count

def preprocessing_synonym_counter(df):
    total_similar_count=0
    total_nonsimilar_count=0
    
    expected_porcentage = 0.0

    sum = 0.0

    n = 0
    for i, row in df.iterrows():
        S1, S2 = row['Sentence1'], row['Sentence2']

        similar_count = synonym_counter(S1, S2)
        total_nonsimilar_count += len(S1.split()) + len(S2.split()) - synonym_counter(S1, S2)

        expected_porcentage = similar_count / (len(S1.split()) + len(S2.split()))

        total_similar_count += similar_count
        
        sum += expected_porcentage
        
        n += 1

    return total_similar_count, total_nonsimilar_count, sum / n

In [44]:
def get_sim_tfidf(s1,s2):
    # Combine the sentences into one list for vectorization
    all_sentences = s1 + s2

    # Initialize a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the sentences
    tfidf_matrix = vectorizer.fit_transform(all_sentences)

    # Calculate cosine similarity
    #cos_sim = cosine_similarity([vec_sentence1], [vec_sentence2])[0][0]

    # Define a threshold
    threshold = 0.5
    # Generate embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(all_sentences)

    # Now, each pair of sentences at the same index can be compared
    similarities_tf = []
    similarities_sbert = []
    for i in range(len(s1)):
        # Compute cosine similarity between corresponding sentence pairs for TF-IDF
        sim_score_tf = cosine_similarity(tfidf_matrix[i], tfidf_matrix[len(s1) + i])[0][0]
        
        # Compute cosine similarity between corresponding sentence pairs for SBERT
        sim_score_sbert = cosine_similarity(
            embeddings[i].reshape(1, -1),  # Reshape embeddings to 2D
            embeddings[len(s1) + i].reshape(1, -1)  # Reshape embeddings to 2D
        )[0][0]

        # Append the binarized similarity scores
        similarities_tf.append(1 if sim_score_tf >= threshold else 0)
        similarities_sbert.append(1 if sim_score_sbert >= threshold else 0)

    return similarities_tf, similarities_sbert

In [45]:
preprocessing_synonym_counter(qqp_train)

(27231, 80478, 0.2493202622831785)

In [46]:
preprocessing(qqp_train)

(22814, 62081, 0.3238331113106615)