In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import TFBertModel, BertTokenizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')
# Load dataset
tweet_dtr = pd.read_csv("/content/drive/My Drive/Colab Notebooks/train.xlsx - Sheet1.csv")

Mounted at /content/drive


In [3]:
tweet_dtr=tweet_dtr.drop('Unnamed: 0', axis=1)

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
import random

# Function to sample pairs of tweets
def sample_tweet_pairs(tweet_dtr, num_pairs):
    same_user_pairs = []
    diff_user_pairs = []

    # Group tweets by user
    grouped = tweet_dtr.groupby('user')

    # Sampling pairs
    for _ in range(num_pairs):
        # Randomly select a user
        user = random.choice(tweet_dtr['user'].unique())

        # Select two tweets from the same user
        tweet_pair_same_user = grouped.get_group(user).sample(2, replace=False)['text'].tolist()
        same_user_pairs.append((tweet_pair_same_user[0], tweet_pair_same_user[1], 1))

        # Select two tweets from different users
        other_users = tweet_dtr[tweet_dtr['user'] != user]
        tweet_pair_diff_user = other_users.sample(2, replace=False)['text'].tolist()
        diff_user_pairs.append((tweet_pair_diff_user[0], tweet_pair_diff_user[1], 0))

    return same_user_pairs, diff_user_pairs

# Create tweet pairs
num_pairs = 13000
same_user_pairs, diff_user_pairs = sample_tweet_pairs(tweet_dtr, num_pairs)

# Combine same-user and different-user pairs
tweet_pairs = same_user_pairs + diff_user_pairs
random.shuffle(tweet_pairs)

# Convert to DataFrame
pairs_df = pd.DataFrame(tweet_pairs, columns=['tweet1', 'tweet2', 'similarity_label'])

# Stratified sampling for balanced representation
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in splitter.split(pairs_df[['tweet1', 'tweet2']], pairs_df['similarity_label']):
    train_pairs = pairs_df.iloc[train_index]
    test_pairs = pairs_df.iloc[test_index]

# Check the balance of classes in the training data
print("Training data class distribution:")
print(train_pairs['similarity_label'].value_counts())

Training data class distribution:
similarity_label
1    10400
0    10400
Name: count, dtype: int64


In [None]:
# all text to lowercase
train_pairs["tweet1"] = [tweet.lower() for tweet in train_pairs["tweet1"]]
train_pairs["tweet2"] = [tweet.lower() for tweet in train_pairs["tweet2"]]

test_pairs["tweet1"] = [tweet.lower() for tweet in test_pairs["tweet1"]]
test_pairs["tweet2"] = [tweet.lower() for tweet in test_pairs["tweet2"]]

In [6]:
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

nltk.download('punkt')

# removing special characters sauf hashtags and mentions
def remove_special_chars(text):
    # Initializing TweetTokenizer from NLTK
    tokenizer = TweetTokenizer()

    # Tokenizing the text
    tokens = tokenizer.tokenize(text)

    # Create a list of stop words
    stop_words = stopwords.words('english')

    # Create a Lemmatizer object
    lemmatizer = WordNetLemmatizer()

    # empty list to store cleaned tokens
    cleaned_tokens = []

    # regex pattern to remove punctuation symbols and special characters
    pattern = r'[^a-zA-Z0-9#@]'

    for token in tokens:
        # add hashtags and mentions directly to cleaned_tokens
        if token.startswith('#') or token.startswith('@'):
            cleaned_tokens.append(token)
        else:
            # Remove special characters using regex
            cleaned_token = re.sub(pattern, '', token)
            # If the token is not empty after cleaning, add it to cleaned_tokens
            if cleaned_token:
                cleaned_tokens.append(cleaned_token)

    # Remove stop words
    cleaned_token_s = [token for token in cleaned_tokens if token not in stop_words]

    # Lemmatize the words
    cleaned_token_l = [lemmatizer.lemmatize(token) for token in cleaned_token_s]

    # Join the cleaned tokens back into a single string
    cleaned_text = ' '.join(cleaned_token_l)

    return cleaned_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
train_pairs["tweet1"] = [remove_special_chars(tweet) for tweet in train_pairs["tweet1"]]
train_pairs["tweet2"] = [remove_special_chars(tweet) for tweet in train_pairs["tweet2"]]

test_pairs["tweet1"] = [remove_special_chars(tweet) for tweet in test_pairs["tweet1"]]
test_pairs["tweet2"] = [remove_special_chars(tweet) for tweet in test_pairs["tweet2"]]

In [8]:
train_pairs.head()

Unnamed: 0,tweet1,tweet2,similarity_label
10124,nice try @rwitherspoon totally tell photoshopp...,congrats jason sudeikis @oliviawilde welcome b...,1
1568,weve got whole lot infrastructure could build ...,live missouri ask @royblunt support bipartisan...,0
15225,new #aca contraception guideline save money wo...,need partisan agenda need commonsense agenda s...,1
2482,best access #nbafinals @cavs @warriors game 1 ...,welcome punderdome httpsgoogl0ii5as pictwitter...,0
13713,today think syrian kid like omar drew dream ba...,@imapurpleninjah intense umm guess im happy,0


In [9]:
def tokenize_text(tokenizer, text):
    return tokenizer.encode_plus(
        text,
        max_length=64,  # Assuming max tweet length of 64 tokens
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

In [10]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
transformer_model = TFBertModel.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [11]:
# Tokenize the tweets
tokenized_X_train_tweet1 = [tokenize_text(tokenizer, tweet) for tweet in train_pairs['tweet1']]
tokenized_X_train_tweet2 = [tokenize_text(tokenizer, tweet) for tweet in train_pairs['tweet2']]

tokenized_X_test_tweet1 = [tokenize_text(tokenizer, tweet) for tweet in test_pairs['tweet1']]
tokenized_X_test_tweet2 = [tokenize_text(tokenizer, tweet) for tweet in test_pairs['tweet2']]

#tokenized_X_train_tweet2



In [12]:
# MODEL FUNC
def build_model(transformer_model):

    # INPUT LAYER
    input_ids1 = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    input_ids2 = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)

    #EMBEDDING LAYER
    embedding1 = transformer_model(input_ids1)[0][:, 0, :]
    embedding2 = transformer_model(input_ids2)[0][:, 0, :]

    # COMPARISION USING MANHATTAN DISTANCE
    distance = tf.keras.layers.Lambda(lambda x: tf.abs(x[0] - x[1]))([embedding1, embedding2])

    # DENSE LAYER WITH SIGMOID AS ACTIVATION FUNCTION
    dense_layer = tf.keras.layers.Dense(1, activation='sigmoid')(distance)

    model = tf.keras.Model(inputs=[input_ids1, input_ids2], outputs=dense_layer)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [13]:
# Getting the input id tensors from the tokenized list of input ids, token ids and attention mask
inputs_1=[i['input_ids'] for i in tokenized_X_train_tweet1 ]
inputs_2=[i['input_ids'] for i in tokenized_X_train_tweet2 ]

# Converting the input id tensors to numpy arrays
inputs_1_array = np.array([tensor.numpy() for tensor in inputs_1])
inputs_2_array = np.array([tensor.numpy() for tensor in inputs_2])

# Ensuring the shapes of inputs_1_array and inputs_2_array are compatible
inputs_1_array = inputs_1_array.squeeze(axis=1)
inputs_2_array = inputs_2_array.squeeze(axis=1)



In [14]:
# Call build_model function which returns the model into model variable
model=build_model(transformer_model)

# Train model
model.fit([inputs_1_array, inputs_2_array],
          train_pairs['similarity_label'], epochs=10, batch_size=32)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x798440185990>

In [15]:
# Getting the input id tensors from the tokenized list of input ids, token ids and attention mask
inputs_3=[i['input_ids'] for i in tokenized_X_test_tweet1 ]
inputs_4=[i['input_ids'] for i in tokenized_X_test_tweet2 ]

# Converting the input id tensors to numpy arrays
inputs_3_array = np.array([tensor.numpy() for tensor in inputs_3])
inputs_4_array = np.array([tensor.numpy() for tensor in inputs_4])

# Ensuring the shapes of inputs_3_array and inputs_4_array are compatible
inputs_3_array = inputs_3_array.squeeze(axis=1)
inputs_4_array = inputs_4_array.squeeze(axis=1)

In [16]:
test_loss, test_accuracy=model.evaluate([inputs_3_array,inputs_4_array],test_pairs['similarity_label'])



In [19]:
# Read test data csv
tweet_dte=pd.read_csv("/content/drive/My Drive/Colab Notebooks/test.xlsx - Sheet1.csv")

In [20]:
tweet_dte.shape

(1300, 3)

In [21]:
tweet_dte.describe()

Unnamed: 0.1,Unnamed: 0
count,1300.0
mean,4930.17
std,4619.120556
min,1.0
25%,1634.5
50%,3644.0
75%,7031.25
max,28213.0


In [22]:
tweet_dte.head()

Unnamed: 0.1,Unnamed: 0,user,text
0,1795,ladygaga,@BarackObama thanku for the support you are se...
1,5903,ladygaga,The first time Tom Ford and Nick Knight worked...
2,1516,ladygaga,I feel absolutely fabulous.pic.twitter.com/NZC...
3,5982,ladygaga,"#BraveCharlie bornthiswayfoundation, an opport..."
4,4148,ladygaga,Chipmunk Cheeks Wisdom Teeth out before tour...


In [23]:
tweet_dte=tweet_dte.drop('Unnamed: 0', axis=1)

In [24]:
tweet_dte.head()

Unnamed: 0,user,text
0,ladygaga,@BarackObama thanku for the support you are se...
1,ladygaga,The first time Tom Ford and Nick Knight worked...
2,ladygaga,I feel absolutely fabulous.pic.twitter.com/NZC...
3,ladygaga,"#BraveCharlie bornthiswayfoundation, an opport..."
4,ladygaga,Chipmunk Cheeks Wisdom Teeth out before tour...


In [25]:
# Create tweet pairs
num_pairs = 650
same_user_pairs, diff_user_pairs = sample_tweet_pairs(tweet_dte, num_pairs)

# Combine same-user and different-user pairs
tweet_pairs_1 = same_user_pairs + diff_user_pairs
random.shuffle(tweet_pairs_1)

# Convert to DataFrame
test_df = pd.DataFrame(tweet_pairs_1, columns=['tweet1', 'tweet2', 'similarity_label'])

In [26]:
test_df.head()

Unnamed: 0,tweet1,tweet2,similarity_label
0,Last month continued the streak of record-brea...,Update: @OnePlus and @MadeByGoogle asked for t...,0
1,'Clinton Campaign Tried to Limit Damage From C...,"I love watching these poor, pathetic people (p...",1
2,.@USTradeRep just announced an agreement in pr...,*watches Dancing with the Stars once* https://...,0
3,PARAGUAY!! I'm coming down in a moment to say ...,"I love my fans so much, thank you for 2 killer...",1
4,I asked President @BillClinton about his wife-...,I'm pretty sure they'll replay this every year...,1


In [27]:
test_df.shape

(1300, 3)

In [28]:
# Test data to lowercasae
test_df["tweet1"] = [tweet.lower() for tweet in test_df["tweet1"]]
test_df["tweet2"] = [tweet.lower() for tweet in test_df["tweet2"]]

# Remove special characters from test data
test_df["tweet1"] = [remove_special_chars(tweet) for tweet in test_df["tweet1"]]
test_df["tweet2"] = [remove_special_chars(tweet) for tweet in test_df["tweet2"]]

# Tokenize test data
tokenized_x_test_tweet1 = [tokenize_text(tokenizer, tweet) for tweet in test_df['tweet1']]
tokenized_x_test_tweet2 = [tokenize_text(tokenizer, tweet) for tweet in test_df['tweet2']]

In [29]:
# Getting the input id tensors from the tokenized list of input ids, token ids and attention mask
inputs_5=[i['input_ids'] for i in tokenized_x_test_tweet1 ]
inputs_6=[i['input_ids'] for i in tokenized_x_test_tweet2 ]

# Converting the input id tensors to numpy arrays
inputs_5_array = np.array([tensor.numpy() for tensor in inputs_5])
inputs_6_array = np.array([tensor.numpy() for tensor in inputs_6])

# Ensuring the shapes of inputs_5_array and inputs_6_array are compatible
inputs_5_array = inputs_5_array.squeeze(axis=1)
inputs_6_array = inputs_6_array.squeeze(axis=1)

In [30]:
# PREDICT
predictions=model.predict([inputs_5_array,inputs_6_array])



In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Round off predictions
predicted_labels=np.round(predictions).astype(int)

# Get true labels from test data
true_labels=test_df["similarity_label"]

# Compute precision, recall and f1-score
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Precision: 0.50
Recall: 1.00
F1 Score: 0.67
