# Setup Workspace

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
RANDOM_SEED = 420
dataset_file = 'formated_dataframe.csv'

In [3]:
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x27a7fd3bc70>

# Model Selection [Advanced Pre-Trained Model]

In [4]:
df = None
try:
    %store -r df
except KeyError:
    df = pd.read_csv('formated_dataframe.csv')

In [5]:
train_df, test_df = train_test_split(df, test_size=0.2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
class SentencePairDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        sentence1 = str(self.data.sentence1[index])
        sentence2 = str(self.data.sentence2[index])
        label = int(self.data.label[index])
        inputs = self.tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [7]:
train_dataset = SentencePairDataset(train_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

test_dataset = SentencePairDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)
device.type

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'cuda'

In [9]:
def train(epoch):
    optimizer = AdamW(model.parameters(), lr=1e-5)

    model.train()
    for _, data in enumerate(train_loader, 0):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        labels = data['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(ids, mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


for epoch in range(3):
    train(epoch)

# Save the model
model_path = 'bert_finetuned.pth'
torch.save(model.state_dict(), model_path)



AttributeError: 'DataFrame' object has no attribute 'sentence1'

In [None]:
def get_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

In [None]:
def find_most_similar(query, dataframe):
    query_embedding = get_embedding(query)
    max_similarity = -1
    most_similar_sentence = ""

    for sentence in dataframe['sentence']:
        sentence_embedding = get_embedding(sentence)
        similarity = cosine_similarity(query_embedding, sentence_embedding)

        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_sentence = sentence

    return most_similar_sentence, max_similarity


In [None]:
query = "Do you want to go to a bar?"
most_similar_sentence, similarity_score = find_most_similar(query, df)
print("Most Similar Sentence:", most_similar_sentence)
print("Similarity Score:", similarity_score)