In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
df = pd.read_csv(
    "/Users/ramazanovaaa/Documents/Duke files/IDS703 NLP/Document_Similarity_with_BERT/dataset/synthetic/synthetic_data2.csv"
)

In [6]:
df

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,doc1,doc2,is_duplicate
0,0,0,1,2,"It is a truth universally acknowledged, that a...",It is a widely accepted fact that a wealthy si...,
1,1,1,3,4,However little known the feelings or views of ...,No matter how little is known about a man's th...,
2,2,2,5,6,"""My dear Mr. Bennet,"" said his lady to him one...","""Mr. Bennet, my dear,"" his wife remarked one d...",
3,3,3,7,8,Mr. Bennet replied that he had not.,Mr. Bennet responded that he had not heard.,
4,4,4,9,10,"""But it is,"" returned she; ""for Mrs. Long has ...","""But it is true,"" she replied. ""Mrs. Long was ...",
...,...,...,...,...,...,...,...
56,56,56,113,114,Mr. Bennet was so odd a mixture of quick parts...,Mr. Bennet was such a peculiar blend of sharp ...,
57,57,57,115,116,_Her_ mind was less difficult to develope.,Her personality was much easier to understand.,
58,58,58,117,118,"She was a woman of mean understanding, little ...","She was a woman of limited intelligence, scant...",
59,59,59,119,120,When she was discontented she fancied herself ...,"When she felt unhappy, she imagined herself to...",


## Cosine Similarity using BERT embedding

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def compute_similarity(sentences, model_name="bert-base-uncased", max_length=512):
    """
    Compute the cosine similarity between the first sentence and all others using a pre-trained BERT model.

    Args:
        sentences (list of list of str): A list of sentences where the first one is compared to the rest.
        model_name (str): The pre-trained BERT model name (default: 'bert-base-uncased').
        max_length (int): Maximum sequence length for tokenization (default: 128).

    Returns:
        numpy.ndarray: Cosine similarity values between the first sentence and the rest.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Tokenization and input preparation
    tokens = {"input_ids": [], "attention_mask": []}
    for sentence in sentences:
        new_tokens = tokenizer.encode_plus(
            "\n".join(sentence),
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        tokens["input_ids"].append(new_tokens["input_ids"][0])
        tokens["attention_mask"].append(new_tokens["attention_mask"][0])

    tokens["input_ids"] = torch.stack(tokens["input_ids"])
    tokens["attention_mask"] = torch.stack(tokens["attention_mask"])

    # Generate embeddings
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state
    attention = tokens["attention_mask"]

    # Mask embeddings
    mask = attention.unsqueeze(-1).expand(embeddings.shape).float()
    masked_embeddings = embeddings * mask

    # Compute mean pooling
    summed = torch.sum(masked_embeddings, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    mean_pooled = summed / counts
    mean_pooled = mean_pooled.detach().numpy()

    # Compute cosine similarity
    similarity = cosine_similarity([mean_pooled[0]], mean_pooled[1:])
    return similarity

In [9]:
compute_similarity(["This is the test1", "This is the test2"])[0]

array([0.9951054], dtype=float32)

## Document Summarization approach with BERT & Cosine Similarity

In [10]:
res_l = [
    ["Today is a very sunny day.", "It's such a bright and sunny day today."],
    [
        "I am hungry, I will get my meal.",
        "I feel like eating something because I am hungry.",
    ],
    [
        "Can you help me with this project?",
        "I need assistance with this task. Can you help?",
    ],
    [
        "The movie was better than I expected.",
        "I didn’t think the movie would be good, but it surprised me.",
    ],
    [
        "The library is open until 8 PM.",
        "The weather is horrible, it's raining all day.",
    ],
    ["Turn left at the next intersection.", "This cake tastes absolutely delicious."],
    [
        "I am feeling very tired after a long day.",
        "The train leaves at 5 PM. Don't be late!",
    ],
    ["What time does the train leave?", "The movie was better than I expected."],
]

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Load pre-trained summarization model
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Input text
summary_res = []
for i in range(2):
    text = ";".join(res_l[i])

    # Tokenize and summarize
    inputs = tokenizer.encode(
        "summarize: " + text, return_tensors="pt", max_length=1024, truncation=True
    )
    summary_ids = model.generate(
        inputs,
        max_length=512,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary_res.append(summary)

embedder_model_name = "sentence-transformers/bert-base-nli-mean-tokens"
embedder_model = SentenceTransformer(embedder_model_name)

embeding_summary = embedder_model.encode(summary_res)
similarity_summary = cosine_similarity([embeding_summary[0]], embeding_summary[1:])
print("Similarity Percentage = ", similarity_summary[0][0] * 100)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity Percentage =  27.964735


In [12]:
import os
import requests

model_url = (
    "https://drive.google.com/file/d/1FkoiDorRCVekAdzJDTOgQ7-QPhUnligW/view?usp=sharing"
)
local_file = "finalized_model.sav"

if not os.path.exists(local_file):
    print("Downloading model...")
    response = requests.get(model_url)
    with open(local_file, "wb") as file:
        file.write(response.content)
    print("Download complete.")
else:
    print("Model already exists.")

Downloading model...
Download complete.


In [16]:
import joblib

# Load the model
model = joblib.load(local_file)

# Check the type of the model
print(type(model))

EOFError: 

In [17]:
import os

print(f"Downloaded file size: {os.path.getsize('finalized_model.sav')} bytes")

Downloaded file size: 0 bytes


In [18]:
import pickle

with open("finalized_model.sav", "rb") as file:
    model = pickle.load(file)

print("Model loaded successfully!")

EOFError: Ran out of input

In [19]:
import os
import requests

model_url = "https://drive.google.com/file/d/1FkoiDorRCVekAdzJDTOgQ7-QPhUnligW/view?usp=sharing"  # Replace with the correct URL
local_file = "finalized_model.sav"

if not os.path.exists(local_file):
    print("Downloading model...")
    try:
        response = requests.get(model_url, stream=True)
        response.raise_for_status()  # Ensure the request was successful
        with open(local_file, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # Stream in chunks
                file.write(chunk)
        print("Download complete.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
else:
    print("Model already exists.")

Model already exists.


In [13]:
import pickle
import pandas as pd
import torch

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def predict(sentence1, sentence2, model, tokenizer, device, max_length=128):
    # Tokenize the input
    inputs = tokenizer(
        sentence1,
        sentence2,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(
        device
    )  # Move inputs to the same device as the model

    # Run the model
    with torch.no_grad():
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        logits = outputs.logits
    # Apply softmax to get probabilities
    probs = F.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs).item()

    return predicted_class, probs


filename = "/kaggle/input/bert_sentenceclassfier/pytorch/default/1/finalized_model.sav"
labels = ["Not Similar", "Similar"]

loaded_model = pickle.load(open(filename, "rb"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)  # Move the model to the selected device

# Generate two similar sentences for testing
sentence1 = "Today is a very sunny day."
sentence2 = "I am hungry, I will get my meal."

# Perform inference
predicted_class, probabilities = predict(
    sentence1, sentence2, loaded_model, tokenizer, device
)

# Print results
print(f"Sentence 1: {sentence1}")
print(f"Sentence 2: {sentence2}")
print(f"Predicted Class: {labels[predicted_class]}")
print(f"Class Probabilities: {probabilities}")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/bert_sentenceclassfier/pytorch/default/1/finalized_model.sav'