In [1]:
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
# Load the raw tweet table; later cells read its 'user' and 'text' columns.
df = pd.read_excel('train.xlsx')


In [4]:
# Keep a reproducible 10% random sample (fixed random_state).
# NOTE: this rebinds `df`, so re-running this cell without re-running the
# load cell samples 10% of the already-reduced frame.
df = df.sample(frac=0.1, random_state=42)

In [5]:
# Show (rows, columns) of the subsampled frame.
df.shape

(2600, 3)

In [6]:
def pairs(dataset, random_state=None):
    """Build labelled tweet pairs for same-author / different-author matching.

    For each author in turn the function first samples pairs of that
    author's own tweets (label 1) and then pairs of one of the author's
    tweets with a tweet of every other author (label 0). A tweet is used in
    at most one pair overall, and a tweet is never paired with itself.
    Sampling is with replacement, and candidates that would reuse a tweet are
    discarded, so the number of pairs returned is at most
    ``len(dataset) // 2`` and usually smaller.

    Args:
        dataset: DataFrame with a 'user' column (author) and a 'text' column
            (tweet). Rows whose 'user' is missing are ignored.
        random_state: Optional integer seed. When given, sampling is
            reproducible and independent of NumPy's global RNG. When None
            (default), NumPy's global RNG is used, as before.

    Returns:
        DataFrame with columns ['text1', 'text2', 'u1', 'u2', 'isSimilar'],
        where 'isSimilar' is 1 for same-author pairs and 0 otherwise.
    """
    columns = ['text1', 'text2', 'u1', 'u2', 'isSimilar']
    # `np.random` (module) and `RandomState` expose the same choice/randint API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Collect every author's tweets once instead of re-filtering the frame in
    # the nested loop. sort=False keeps authors in order of first appearance.
    tweets_by_author = {
        author: group['text'].tolist()
        for author, group in dataset.groupby('user', sort=False)
    }
    authors = list(tweets_by_author)

    # Per author: positions (within that author's tweet list) already used.
    used = {author: set() for author in authors}

    tweet_pairs = []
    total_pairs = len(dataset) // 2  # remaining pair budget

    for author in authors:
        if total_pairs <= 0:
            break

        tweets_user = tweets_by_author[author]
        n_user = len(tweets_user)

        # --- same-author pairs (label 1) ---------------------------------
        # At most half of the remaining budget, so different-author pairs
        # still get a share.
        same_author_pairs = min(n_user * (n_user - 1) // 2, total_pairs // 2)
        if same_author_pairs > 0:
            candidates = rng.choice(n_user, size=(same_author_pairs, 2)).tolist()
            for i1, i2 in candidates:
                # Reject self-pairs and tweets that already belong to a pair.
                if i1 == i2 or i1 in used[author] or i2 in used[author]:
                    continue
                tweet_pairs.append((tweets_user[i1], tweets_user[i2], author, author, 1))
                used[author].update((i1, i2))
                total_pairs -= 1

        # --- different-author pairs (label 0) ----------------------------
        other_authors = [other for other in authors if other != author]
        if not other_authors:
            # Single-author dataset: no negatives possible (and nothing to
            # divide the budget by).
            continue

        diff_author_pairs = total_pairs // len(other_authors)
        for other_author in other_authors:
            tweets_by_other_author = tweets_by_author[other_author]
            n_other = len(tweets_by_other_author)

            num_pairs_with_other_author = min(n_user * n_other, diff_author_pairs)
            if num_pairs_with_other_author <= 0:
                continue

            # i1 indexes the current author's tweets and i2 the other
            # author's, so each used-set tracks positions in its own list.
            idx_user = rng.randint(n_user, size=num_pairs_with_other_author).tolist()
            idx_other = rng.randint(n_other, size=num_pairs_with_other_author).tolist()
            for i1, i2 in zip(idx_user, idx_other):
                if i1 in used[author] or i2 in used[other_author]:
                    continue
                tweet_pairs.append(
                    (tweets_user[i1], tweets_by_other_author[i2], author, other_author, 0)
                )
                used[author].add(i1)
                used[other_author].add(i2)
                total_pairs -= 1

    return pd.DataFrame(tweet_pairs, columns=columns)

# Build the labelled pair table (text1, text2, u1, u2, isSimilar) from the sampled tweets.
NewDf =  pairs(df)

In [7]:
# Preview the first few generated pairs.
NewDf.head()

Unnamed: 0,text1,text2,u1,u2,isSimilar
0,Got my Mercurial Vapor IX for today's match. H...,I train fast to play fast. Thanks @niketrainin...,Cristiano,Cristiano,1
1,What a night! A great result leads us one step...,pic.twitter.com/pXXUzVe1rQ,Cristiano,Cristiano,1
2,"You asked: Hi, i'm your fan #1 in Paris, can y...",We are excited to win the Copa del Rey today! ...,Cristiano,Cristiano,1
3,Here's a sneak peek of the new @SacoorBrosME C...,Looking forward to meeting Almería on the pitc...,Cristiano,Cristiano,1
4,Today I partner with @PestanaHotels for the co...,Hi everyone. Yesterday’s important win leads u...,Cristiano,Cristiano,1


In [23]:
import torch
from transformers import BertModel, BertTokenizer

class TweetSimilarityModel(torch.nn.Module):
    """Scores whether two tweets are similar using a shared BERT encoder.

    Both inputs are encoded by the same BERT model. The two [CLS] vectors are
    concatenated and fed to a single linear layer followed by a sigmoid,
    giving a score in (0, 1). The L1 (Manhattan) distance between the two
    [CLS] vectors is returned alongside the score but is not an input to the
    classifier.

    Args:
        device: torch device the encoder and the classifier are moved to.
        model_name: Hugging Face checkpoint name or path used for both the
            encoder and the tokenizer. Defaults to 'bert-base-uncased'.

    Note:
        The linear classifier is freshly initialised in ``__init__``; its
        scores are only meaningful after it has been trained.
    """

    def __init__(self, device, model_name='bert-base-uncased'):
        super().__init__()
        # Encoder and tokenizer must come from the same checkpoint.
        self.bert = BertModel.from_pretrained(model_name).to(device)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.manhattan_distance = torch.nn.PairwiseDistance(p=1)  # Manhattan distance
        # Input is the concatenation of two hidden-size vectors.
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size * 2, 1).to(device)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, text1, text2):
        """Encode both inputs and return ``(similarity_score, manhattan_dist)``.

        Args:
            text1: Mapping of keyword arguments passed to the encoder for the
                first tweet(s), e.g. the output of ``self.tokenizer(...)``.
            text2: Same as ``text1`` for the second tweet(s).

        Returns:
            similarity_score: sigmoid output of the classifier, one row per
                input pair with a trailing dimension of size 1.
            manhattan_dist: L1 distance between the two [CLS] vectors, one
                value per input pair.
        """
        # Use the [CLS] token (position 0) as the representation of each tweet.
        tweet_rep1 = self.bert(**text1).last_hidden_state[:, 0, :]
        tweet_rep2 = self.bert(**text2).last_hidden_state[:, 0, :]

        # Concatenate both representations along the feature dimension.
        tweet_rep_concat = torch.cat((tweet_rep1, tweet_rep2), dim=1)

        # Distance between the two representations (returned, not classified on).
        manhattan_dist = self.manhattan_distance(tweet_rep1, tweet_rep2)

        # Dense layer + sigmoid on the concatenated representations.
        similarity_score = self.sigmoid(self.classifier(tweet_rep_concat))

        return similarity_score, manhattan_dist

In [None]:
import torch
from transformers import BertModel, BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The classifier head is randomly initialised when the model is built; seed
# torch so the reported metrics are the same on every run.
torch.manual_seed(42)

model = TweetSimilarityModel(device).to(device)
model.eval()
# NOTE: no training step is visible before this cell, so the metrics below
# describe the model with an untrained classifier head.

# The encoder cannot embed more positions than its config allows; truncating
# to a larger max_length would fail on long inputs.
MAX_LENGTH = model.bert.config.max_position_embeddings
BATCH_SIZE = 32

texts1 = NewDf['text1'].astype(str).tolist()
texts2 = NewDf['text2'].astype(str).tolist()
true_labels = NewDf['isSimilar'].to_numpy()

if not texts1:
    raise ValueError("NewDf is empty; there are no pairs to evaluate")

predicted_batches = []

with torch.no_grad():
    for start in range(0, len(texts1), BATCH_SIZE):
        stop = start + BATCH_SIZE

        # Tokenize a batch of tweets; padding aligns lengths within the batch.
        batch1 = model.tokenizer(texts1[start:stop], return_tensors='pt', padding=True,
                                 truncation=True, max_length=MAX_LENGTH).to(device)
        batch2 = model.tokenizer(texts2[start:stop], return_tensors='pt', padding=True,
                                 truncation=True, max_length=MAX_LENGTH).to(device)

        # Get the similarity score; the Manhattan distance is not used here.
        similarity_score, _ = model(batch1, batch2)

        # Round the score to a 0/1 label, one per pair in the batch.
        batch_labels = torch.round(similarity_score.squeeze(-1)).cpu().numpy().astype(int)
        predicted_batches.append(batch_labels)

predicted_labels = np.concatenate(predicted_batches)

# Calculate Precision, Recall, F1 Score and Accuracy
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}, Accuracy : {accuracy}")
