In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [11]:
df = pd.read_csv('https://raw.githubusercontent.com/Afag-Ramazanova/Document_Similarity_with_BERT/refs/heads/main/dataset/synthetic/synthetic_data.csv')

ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4


## Cosine Similarity using BERT embedding

In [5]:

from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
def compute_similarity(sentences, model_name='bert-base-uncased', max_length=512):
    """
    Compute the cosine similarity between the first sentence and all others using a pre-trained BERT model.
    
    Args:
        sentences (list of list of str): A list of sentences where the first one is compared to the rest.
        model_name (str): The pre-trained BERT model name (default: 'bert-base-uncased').
        max_length (int): Maximum sequence length for tokenization (default: 128).
        
    Returns:
        numpy.ndarray: Cosine similarity values between the first sentence and the rest.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Tokenization and input preparation
    tokens = {'input_ids': [], 'attention_mask': []}
    for sentence in sentences:
        new_tokens = tokenizer.encode_plus(
            '\n'.join(sentence),
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    
    # Generate embeddings
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state
    attention = tokens['attention_mask']
    
    # Mask embeddings
    mask = attention.unsqueeze(-1).expand(embeddings.shape).float()
    masked_embeddings = embeddings * mask
    
    # Compute mean pooling
    summed = torch.sum(masked_embeddings, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    mean_pooled = summed / counts
    mean_pooled = mean_pooled.detach().numpy()
    
    # Compute cosine similarity
    similarity = cosine_similarity([mean_pooled[0]], mean_pooled[1:])
    return similarity

In [8]:
compute_similarity(['This is the test1', 'This is the test2'])[0]



array([0.99510545], dtype=float32)

## Document Summarization approach with BERT & Cosine Similarity

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Load pre-trained summarization model
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Input text
summary_res = []
for i in range(2):
    text = ';'.join(res_l[i])
    
    # Tokenize and summarize
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=512, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary_res.append(summary)

embedder_model_name = 'sentence-transformers/bert-base-nli-mean-tokens'
embedder_model = SentenceTransformer(embedder_model_name)

embeding_summary = embedder_model.encode(summary_res)
similarity_summary = cosine_similarity(
    [embeding_summary[0]],
    embeding_summary[1:]
)
print("Similarity Percentage = ",similarity_summary[0][0]*100)


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

NameError: name 'res_l' is not defined