In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import TFBertModel, BertTokenizer

In [2]:
tweet_dtr=pd.read_csv("train.xlsx - Sheet1.csv")

In [3]:
tweet_dtr=tweet_dtr.drop('Unnamed: 0', axis=1)

In [4]:
tweet_dtr.head()

Unnamed: 0,user,text
0,ladygaga,Make your reservation now. #GagaAHSHotelhttps:...
1,ladygaga,@DrunkyViviana we shot for 3 days but planned ...
2,ladygaga,me I'm back in the NY GROOVEpic.twitter.com/c...
3,ladygaga,GLEE WAS SO AMAZING! AH!!!!
4,ladygaga,LIVE with @JoJoWright in 5 minutes on @1027KIISFM


In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
import random

# Function to sample pairs of tweets
def sample_tweet_pairs(tweet_dtr, num_pairs):
    same_user_pairs = []
    diff_user_pairs = []

    # Group tweets by user
    grouped = tweet_dtr.groupby('user')

    # Sampling pairs
    for _ in range(num_pairs):
        # Randomly select a user
        user = random.choice(tweet_dtr['user'].unique())
        
        # Select two tweets from the same user
        tweet_pair_same_user = grouped.get_group(user).sample(2, replace=False)['text'].tolist()
        same_user_pairs.append((tweet_pair_same_user[0], tweet_pair_same_user[1], 1))

        # Select two tweets from different users
        other_users = tweet_dtr[tweet_dtr['user'] != user]
        tweet_pair_diff_user = other_users.sample(2, replace=False)['text'].tolist()
        diff_user_pairs.append((tweet_pair_diff_user[0], tweet_pair_diff_user[1], 0))

    return same_user_pairs, diff_user_pairs

# Create tweet pairs
num_pairs = 13000  
same_user_pairs, diff_user_pairs = sample_tweet_pairs(tweet_dtr, num_pairs)

# Combine same-user and different-user pairs
tweet_pairs = same_user_pairs + diff_user_pairs
random.shuffle(tweet_pairs)

# Convert to DataFrame
pairs_df = pd.DataFrame(tweet_pairs, columns=['tweet1', 'tweet2', 'similarity_label'])

# Stratified sampling for balanced representation
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in splitter.split(pairs_df[['tweet1', 'tweet2']], pairs_df['similarity_label']):
    train_pairs = pairs_df.iloc[train_index]
    test_pairs = pairs_df.iloc[test_index]

# Check the balance of classes in the training data
print("Training data class distribution:")
print(train_pairs['similarity_label'].value_counts())

Training data class distribution:
1    10400
0    10400
Name: similarity_label, dtype: int64


In [6]:
train_pairs.head()

Unnamed: 0,tweet1,tweet2,similarity_label
10068,Back in the studio. #confident in almost 3 hou...,me and @jaxonbieber - unstoppable! haha,1
1575,Seeing a chrome delete is what convinced me to...,Jurgen Klopp has warned his @LFC side to be wa...,0
15117,Join the President for his backyard birthday p...,If you agree that higher education is an econo...,1
2451,I'm ready for all the new solar eclipse wallpa...,"Fight as one, rise together !\nThe new @Portug...",0
13873,I met this grl that looked like Apollonia and ...,"If we are not careful, we will find that knife...",0


In [7]:
train_pairs.shape

(20800, 3)

In [8]:
test_pairs.shape

(5200, 3)

## Data Preprocessing

TEXT CLEANING

In [None]:
# all text to lowercase
train_pairs["tweet1"] = [tweet.lower() for tweet in train_pairs["tweet1"]]
train_pairs["tweet2"] = [tweet.lower() for tweet in train_pairs["tweet2"]]

test_pairs["tweet1"] = [tweet.lower() for tweet in test_pairs["tweet1"]]
test_pairs["tweet2"] = [tweet.lower() for tweet in test_pairs["tweet2"]]

In [10]:
import re
import nltk
from nltk.tokenize import TweetTokenizer

nltk.download('punkt')

# removing special characters sauf hashtags and mentions
def remove_special_chars(text):
    # Initializing TweetTokenizer from NLTK
    tokenizer = TweetTokenizer()

    # Tokenizing the text
    tokens = tokenizer.tokenize(text)

    # empty list to store cleaned tokens
    cleaned_tokens = []

    # regex pattern to remove punctuation symbols and special characters
    pattern = r'[^a-zA-Z0-9#@]'

    for token in tokens:
        # add hashtags and mentions directly to cleaned_tokens
        if token.startswith('#') or token.startswith('@'):
            cleaned_tokens.append(token)
        else:
            # Remove special characters using regex
            cleaned_token = re.sub(pattern, '', token)
            # If the token is not empty after cleaning, add it to cleaned_tokens
            if cleaned_token:
                cleaned_tokens.append(cleaned_token)

    # Join the cleaned tokens back into a single string
    cleaned_text = ' '.join(cleaned_tokens)
    
    return cleaned_text

[nltk_data] Downloading package punkt to C:\Users\NEXUS
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
train_pairs["tweet1"] = [remove_special_chars(tweet) for tweet in train_pairs["tweet1"]]
train_pairs["tweet2"] = [remove_special_chars(tweet) for tweet in train_pairs["tweet2"]]

test_pairs["tweet1"] = [remove_special_chars(tweet) for tweet in test_pairs["tweet1"]]
test_pairs["tweet2"] = [remove_special_chars(tweet) for tweet in test_pairs["tweet2"]]

In [12]:
train_pairs.head()

Unnamed: 0,tweet1,tweet2,similarity_label
10068,back in the studio #confident in almost 3 hour...,me and @jaxonbieber unstoppable haha,1
1575,seeing a chrome delete is what convinced me to...,jurgen klopp has warned his @lfc side to be wa...,0
15117,join the president for his backyard birthday p...,if you agree that higher education is an econo...,1
2451,im ready for all the new solar eclipse wallpap...,fight as one rise together the new @portugal n...,0
13873,i met this grl that looked like apollonia and ...,if we are not careful we will find that knife ...,0


In [13]:
def preprocess_text(text):
    #text = text.lower()
    # Remove punctuation, symbols, etc.
    #text = text.replace(".", "").replace("!", "").replace(",", "").replace("?", "")
    # Tokenize
    tokens = text.split()
    return tokens

# Tokenization
def tokenize_text(tokenizer, text):
    return tokenizer.encode_plus(
        text,
        max_length=64,  # Assuming max tweet length of 64 tokens
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

In [None]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
transformer_model = TFBertModel.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize text
tokenized_X_train_tweet1 = [tokenize_text(tokenizer, tweet) for tweet in train_pairs['tweet1']]
tokenized_X_train_tweet2 = [tokenize_text(tokenizer, tweet) for tweet in train_pairs['tweet2']]

tokenized_X_test_tweet1 = [tokenize_text(tokenizer, tweet) for tweet in test_pairs['tweet1']]
tokenized_X_test_tweet2 = [tokenize_text(tokenizer, tweet) for tweet in test_pairs['tweet2']]

In [16]:
# MODEL FUNC
def build_model(transformer_model):
    
    # INPUT LAYER
    input_ids1 = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    input_ids2 = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    
    #EMBEDDING LAYER
    embedding1 = transformer_model(input_ids1)[0][:, 0, :]
    embedding2 = transformer_model(input_ids2)[0][:, 0, :]
    
    # COMPARISION USING MANHATTAN DISTANCE
    distance = tf.keras.layers.Lambda(lambda x: tf.abs(x[0] - x[1]))([embedding1, embedding2])
    
    # DENSE LAYER WITH SIGMOID AS ACTIVATION FUNCTION
    dense_layer = tf.keras.layers.Dense(1, activation='sigmoid')(distance)
    
    model = tf.keras.Model(inputs=[input_ids1, input_ids2], outputs=dense_layer)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [20]:
# Getting the input id tensors from the tokenized list of input ids, token ids and attention mask
inputs_1=[i['input_ids'] for i in tokenized_X_train_tweet1 ]
inputs_2=[i['input_ids'] for i in tokenized_X_train_tweet2 ]

In [23]:
# Converting the input id tensors to numpy arrays
inputs_1_array = np.array([tensor.numpy() for tensor in inputs_1])
inputs_2_array = np.array([tensor.numpy() for tensor in inputs_2])

# Ensuring the shapes of inputs_1_array and inputs_2_array are compatible
inputs_1_array = inputs_1_array.squeeze(axis=1)
inputs_2_array = inputs_2_array.squeeze(axis=1)

In [24]:
# Call build_model function which returns the model into model variable
model=build_model(transformer_model)

# Train model
model.fit([inputs_1_array, inputs_2_array], 
          train_pairs['similarity_label'], epochs=1, batch_size=32)



<tensorflow.python.keras.callbacks.History at 0x22e59940508>

In [38]:
# Getting the input id tensors from the tokenized list of input ids, token ids and attention mask
inputs_3=[i['input_ids'] for i in tokenized_X_test_tweet1 ]
inputs_4=[i['input_ids'] for i in tokenized_X_test_tweet2 ]

# Converting the input id tensors to numpy arrays
inputs_3_array = np.array([tensor.numpy() for tensor in inputs_3])
inputs_4_array = np.array([tensor.numpy() for tensor in inputs_4])

# Ensuring the shapes of inputs_3_array and inputs_4_array are compatible
inputs_3_array = inputs_3_array.squeeze(axis=1)
inputs_4_array = inputs_4_array.squeeze(axis=1)

In [40]:
test_loss, test_accuracy=model.evaluate([inputs_3_array,inputs_4_array],test_pairs['similarity_label'])



### TESTING

In [25]:
# Read test data csv
tweet_dte=pd.read_csv("test.xlsx - Sheet1.csv")

In [26]:
tweet_dte.shape

(1300, 3)

In [27]:
tweet_dte.describe()

Unnamed: 0.1,Unnamed: 0
count,1300.0
mean,4930.17
std,4619.120556
min,1.0
25%,1634.5
50%,3644.0
75%,7031.25
max,28213.0


In [28]:
tweet_dte.head()

Unnamed: 0.1,Unnamed: 0,user,text
0,1795,ladygaga,@BarackObama thanku for the support you are se...
1,5903,ladygaga,The first time Tom Ford and Nick Knight worked...
2,1516,ladygaga,I feel absolutely fabulous.pic.twitter.com/NZC...
3,5982,ladygaga,"#BraveCharlie bornthiswayfoundation, an opport..."
4,4148,ladygaga,Chipmunk Cheeks Wisdom Teeth out before tour...


In [29]:
tweet_dte=tweet_dte.drop('Unnamed: 0', axis=1)

In [30]:
tweet_dte.head()

Unnamed: 0,user,text
0,ladygaga,@BarackObama thanku for the support you are se...
1,ladygaga,The first time Tom Ford and Nick Knight worked...
2,ladygaga,I feel absolutely fabulous.pic.twitter.com/NZC...
3,ladygaga,"#BraveCharlie bornthiswayfoundation, an opport..."
4,ladygaga,Chipmunk Cheeks Wisdom Teeth out before tour...


In [35]:
# Create tweet pairs
num_pairs = 650  
same_user_pairs, diff_user_pairs = sample_tweet_pairs(tweet_dte, num_pairs)

# Combine same-user and different-user pairs
tweet_pairs_1 = same_user_pairs + diff_user_pairs
random.shuffle(tweet_pairs_1)

# Convert to DataFrame
test_df = pd.DataFrame(tweet_pairs_1, columns=['tweet1', 'tweet2', 'similarity_label'])


In [36]:
test_df.head()

Unnamed: 0,tweet1,tweet2,similarity_label
0,straight W's for the East leading @Raptors!\n...,It's morrissey night! Where are you?,0
1,Final pic.twitter.com/A0xnTPUPKH,House Intelligence Committee votes to release ...,0
2,You know what… you miss 100% of the shots you ...,I mean... I guess I have to give it a shot.,1
3,Dishes & swishes... buckets & handles... @Step...,My new babies pic.twitter.com/s4ejQD6V4l,0
4,Donovan Mitchell goes coast to coast to beat t...,Had a lot of fun creating The Private Edition ...,0


In [37]:
test_df.shape

(1300, 3)

In [41]:
# Test data to lowercasae
test_df["tweet1"] = [tweet.lower() for tweet in test_df["tweet1"]]
test_df["tweet2"] = [tweet.lower() for tweet in test_df["tweet2"]]

In [42]:
# Remove special characters from test data 
test_df["tweet1"] = [remove_special_chars(tweet) for tweet in test_df["tweet1"]]
test_df["tweet2"] = [remove_special_chars(tweet) for tweet in test_df["tweet2"]]

In [43]:
# Tokenize test data
tokenized_x_test_tweet1 = [tokenize_text(tokenizer, tweet) for tweet in test_df['tweet1']]
tokenized_x_test_tweet2 = [tokenize_text(tokenizer, tweet) for tweet in test_df['tweet2']]

In [45]:
# Getting the input id tensors from the tokenized list of input ids, token ids and attention mask
inputs_5=[i['input_ids'] for i in tokenized_x_test_tweet1 ]
inputs_6=[i['input_ids'] for i in tokenized_x_test_tweet2 ]

# Converting the input id tensors to numpy arrays
inputs_5_array = np.array([tensor.numpy() for tensor in inputs_5])
inputs_6_array = np.array([tensor.numpy() for tensor in inputs_6])

# Ensuring the shapes of inputs_5_array and inputs_6_array are compatible
inputs_5_array = inputs_5_array.squeeze(axis=1)
inputs_6_array = inputs_6_array.squeeze(axis=1)

In [46]:
# PREDICT
predictions=model.predict([inputs_5_array,inputs_6_array])

In [48]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Round off predictions
predicted_labels=np.round(predictions).astype(int)

# Get true labels from test data
true_labels=test_df["similarity_label"]

# Compute precision, recall and f1-score
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Precision: 0.50
Recall: 1.00
F1 Score: 0.67
