In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/Afag-Ramazanova/Document_Similarity_with_BERT/refs/heads/main/dataset/Synthetic/synthetic_data2.csv')

In [5]:
res_l = df.loc[:, ['doc1', 'doc2']].values.T

'It is a widely accepted fact that a wealthy single man is inevitably in search of a wife.'

## Cosine Similarity using BERT embedding

In [9]:

from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
def compute_similarity(sentences, model_name='bert-base-uncased', max_length=512):
    """
    Compute the cosine similarity between the first sentence and all others using a pre-trained BERT model.
    
    Args:
        sentences (list of list of str): A list of sentences where the first one is compared to the rest.
        model_name (str): The pre-trained BERT model name (default: 'bert-base-uncased').
        max_length (int): Maximum sequence length for tokenization (default: 128).
        
    Returns:
        numpy.ndarray: Cosine similarity values between the first sentence and the rest.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Tokenization and input preparation
    tokens = {'input_ids': [], 'attention_mask': []}
    for sentence in sentences:
        new_tokens = tokenizer.encode_plus(
            '\n'.join(sentence),
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    
    # Generate embeddings
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state
    attention = tokens['attention_mask']
    
    # Mask embeddings
    mask = attention.unsqueeze(-1).expand(embeddings.shape).float()
    masked_embeddings = embeddings * mask
    
    # Compute mean pooling
    summed = torch.sum(masked_embeddings, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    mean_pooled = summed / counts
    mean_pooled = mean_pooled.detach().numpy()
    
    # Compute cosine similarity
    similarity = cosine_similarity([mean_pooled[0]], mean_pooled[1:])
    return similarity

In [13]:
compute_similarity(res_l)[0]



array([0.9609797], dtype=float32)

## Document Summarization approach with BERT & Cosine Similarity

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Load pre-trained summarization model
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Input text
summary_res = []
for i in range(2):
    text = ';'.join(res_l[i])
    
    # Tokenize and summarize
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=512, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary_res.append(summary)

embedder_model_name = 'sentence-transformers/bert-base-nli-mean-tokens'
embedder_model = SentenceTransformer(embedder_model_name)

embeding_summary = embedder_model.encode(summary_res)
similarity_summary = cosine_similarity(
    [embeding_summary[0]],
    embeding_summary[1:]
)
print("Similarity Percentage = ",similarity_summary[0][0]*100)


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Similarity Percentage =  67.38548874855042


## Bert Semantic Classification Model

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import pandas as pd

### Pre-Processing Data

In [3]:
train_df = pd.read_csv("https://raw.githubusercontent.com/Afag-Ramazanova/Document_Similarity_with_BERT/refs/heads/main/dataset/STS/stsbenchmark_train.csv",)
valid_df = pd.read_csv("https://raw.githubusercontent.com/Afag-Ramazanova/Document_Similarity_with_BERT/refs/heads/main/dataset/STS/stsbenchmark_validation.csv")
test_df = pd.read_csv("https://raw.githubusercontent.com/Afag-Ramazanova/Document_Similarity_with_BERT/refs/heads/main/dataset/STS/stsbenchmark_test.csv")

# Shape of the data
print(f"Total train samples : {train_df.shape[0]}")
print(f"Total validation samples: {valid_df.shape[0]}")
print(f"Total test samples: {valid_df.shape[0]}")

Total train samples : 5749
Total validation samples: 1500
Total test samples: 1500


In [5]:
train_df['score_classification'] = train_df['score'].apply(lambda x : 'similar' if x>=3 else "not similar")
valid_df['score_classification'] = valid_df['score'].apply(lambda x : 'similar' if x>=3 else "not similar")
test_df['score_classification'] = test_df['score'].apply(lambda x : 'similar' if x>=3 else "not similar")

In [6]:
train_df["label"] = train_df["score_classification"].apply(
    lambda x: 0 if x == "not similar" else 1 
)
y_train = tf.keras.utils.to_categorical(train_df.label, num_classes=2)

valid_df["label"] = valid_df["score_classification"].apply(
    lambda x: 0 if x == "not similar" else 1 
)
y_val = tf.keras.utils.to_categorical(valid_df.label, num_classes=2)

test_df["label"] = test_df["score_classification"].apply(
    lambda x: 0 if x == "not similar" else 1 
)
y_test = tf.keras.utils.to_categorical(test_df.label, num_classes=2)

In [19]:
labels = ["not similar", "similar"]

In [7]:
train_df.head()

Unnamed: 0,split,genre,dataset,year,sid,score,sentence1,sentence2,score_classification,label
0,train,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.,similar,1
1,train,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.,similar,1
2,train,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,similar,1
3,train,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.,not similar,0
4,train,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.,similar,1


In [8]:
filtered_train = train_df[['sentence1', 'sentence2', 'label']]
filtered_valid = valid_df[['sentence1', 'sentence2', 'label']]
filtered_test = test_df[['sentence1', 'sentence2', 'label']]

In [9]:
filtered_train.head()

Unnamed: 0,sentence1,sentence2,label
0,A plane is taking off.,An air plane is taking off.,1
1,A man is playing a large flute.,A man is playing a flute.,1
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,1
3,Three men are playing chess.,Two men are playing chess.,0
4,A man is playing the cello.,A man seated is playing the cello.,1


In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [11]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        encoding = self.tokenizer(
            row["sentence1"],
            row["sentence2"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(row["label"], dtype=torch.long)
        return item
        

In [12]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = CustomDataset(filtered_train, tokenizer)
test_dataset = CustomDataset(filtered_test, tokenizer)


model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to=["none"],  # Disable wandb
)

# Define Trainer
trainer = Trainer(
    model=model,                        # The model to be trained
    args=training_args,                 # Training arguments
    train_dataset=train_dataset,        # Training dataset
    eval_dataset=test_dataset,          # Evaluation dataset
    tokenizer=tokenizer,                # Tokenizer for preprocessing
)

# Train the model
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.4604,0.47751
2,0.288,0.420245
3,0.0338,0.601781


TrainOutput(global_step=1080, training_loss=0.32483238599918507, metrics={'train_runtime': 259.2662, 'train_samples_per_second': 66.522, 'train_steps_per_second': 4.166, 'total_flos': 1134469092948480.0, 'train_loss': 0.32483238599918507, 'epoch': 3.0})

In [14]:
# Evaluate the model
evaluation_metrics = trainer.evaluate()
print("Evaluation Metrics:", evaluation_metrics)


Evaluation Metrics: {'eval_loss': 0.420244961977005, 'eval_runtime': 5.9624, 'eval_samples_per_second': 231.283, 'eval_steps_per_second': 14.591, 'epoch': 3.0}


### Saving Model

In [21]:
torch.save(model.state_dict(), 'finalized_model.pth')

### Loading Model

In [22]:
## Code for downloading model from presaved model in there

# Define the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model architecture
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Load the saved state_dict and map it to the appropriate device
model.load_state_dict(torch.load('finalized_model.pth', map_location=device))

# Move the model to the selected device
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('finalized_model.pth', map_location=device))


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Inference

In [16]:
import torch.nn.functional as F

def predict(sentence1, sentence2, model, tokenizer, device, max_length=128):
    # Tokenize the input
    inputs = tokenizer(
        sentence1,
        sentence2,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)  # Move inputs to the same device as the model

    # Run the model
    with torch.no_grad():
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        logits = outputs.logits

    # Apply softmax to get probabilities
    probs = F.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs).item()

    return predicted_class, probs


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the selected device
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [20]:
# Generate two similar sentences for testing
sentence1 ="The cat is sleeping on the sofa."
sentence2 = "A cat is curled up and napping on the couch."
 
# Perform inference
predicted_class, probabilities = predict(sentence1, sentence2, model, tokenizer, device)

# Print results
print(f"Sentence 1: {sentence1}")
print(f"Sentence 2: {sentence2}")
print(f"Predicted Class: {labels[predicted_class]}")
print(f"Class Probabilities: {probabilities}")

Sentence 1: The cat is sleeping on the sofa.
Sentence 2: A cat is curled up and napping on the couch.
Predicted Class: not similar
Class Probabilities: tensor([[0.8088, 0.1912]], device='cuda:0')
