In [24]:
!pip install sentence-transformers



In [25]:
import time
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

In [26]:
# Define dense retrieval function for Sentence Transformers
def dense_retrieval(query, model, description_embeddings):
    """Rank precomputed description embeddings against a query.

    Args:
        query: Query text passed to ``model.encode``.
        model: Object exposing ``encode(text, convert_to_tensor=True)`` that
            returns the query embedding as a tensor.
        description_embeddings: Tensor of description embeddings; expected to
            hold one embedding per row.

    Returns:
        Tensor of row indices into ``description_embeddings``, ordered from the
        most similar description to the least similar one.
    """
    query_vec = model.encode(query, convert_to_tensor=True)
    # Add a leading batch axis so the query lines up with the embedding rows.
    scores = torch.nn.functional.cosine_similarity(query_vec[None], description_embeddings)
    return torch.argsort(scores, descending=True)


In [27]:
# Define dense retrieval function for BERT
def dense_retrieval_bert(query, descriptions, tokenizer, model):
    """Rank description embeddings by cosine similarity to a BERT-encoded query.

    The query is tokenized, run through ``model`` with gradients disabled, and
    its ``last_hidden_state`` is averaged over the token dimension to obtain a
    single query vector.

    Args:
        query: Query text handed to ``tokenizer``.
        descriptions: Tensor of description embeddings; expected to be 2-D with
            one row per description and the same width as the model's hidden
            states.
        tokenizer: Callable invoked with ``padding=True, truncation=True,
            return_tensors='pt'``; its result is unpacked as keyword arguments
            for ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        Tensor of row indices into ``descriptions``, most similar first.
    """
    encoded_query = tokenizer(query, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        query_outputs = model(**encoded_query)
        # Mean-pool over tokens (dim=1) but keep the leading batch dimension.
        # A (1, hidden) query lines up with the (n, hidden) descriptions along
        # dim=1 and mirrors how dense_retrieval passes its query; a bare 1-D
        # query depends on implicit broadcasting that older PyTorch releases
        # reject with a dimension-out-of-range error.
        query_embedding = query_outputs.last_hidden_state.mean(dim=1)
        similarities = torch.nn.functional.cosine_similarity(query_embedding, descriptions)
    return similarities.argsort(descending=True)

In [28]:
# Experiment 1: Sentence Transformers.
# Start the wall-clock timer for this experiment. The timer starts before the
# dataset and model are loaded, so the measured interval covers loading,
# embedding and retrieval together (plus any pause between running the cells).
start_time = time.time()

In [29]:
# Load the dataset from a CSV file given by a relative path.
# NOTE: despite the variable name, the file is read as CSV (pd.read_csv), not Excel.
# Later cells read the 'Description' column of this frame.
excel_file_path = "datasetfile.csv"
df = pd.read_csv(excel_file_path, encoding='utf-8')

In [None]:
# Load the pretrained Sentence Transformer checkpoint and embed every description
# once up front (returned as a tensor because convert_to_tensor=True), so a query
# only needs one encode call plus a similarity computation.
# NOTE: the name `model` is rebound to a BERT model further down, so this cell has
# to be re-run before the Sentence Transformers retrieval cell is executed again.
model = SentenceTransformer('paraphrase-mpnet-base-v2')
description_embeddings = model.encode(df['Description'].tolist(), convert_to_tensor=True)

In [None]:
# Rank all descriptions against the query with the Sentence Transformer model
# (indices come back ordered from most to least similar), then record the end
# timestamp for the Sentence Transformers experiment.
query = "Trade"
sorted_indices = dense_retrieval(query, model, description_embeddings)
end_time_sentence_transformers = time.time()


In [None]:
# Show the ten best-matching descriptions, best match first.
# `sorted_indices` holds row positions (the order in which descriptions were
# embedded), so rows are looked up by position with .iloc. Plain [] on a Series is
# label-based and would return the wrong row, or raise KeyError, whenever df's
# index is not the default 0..n-1 range (e.g. after filtering rows).
print("Top results using Sentence Transformers:")
for idx in sorted_indices[:10].tolist():
    print(df['Description'].iloc[idx])

In [None]:
# Experiment 2: BERT.
# Start the wall-clock timer for this experiment; everything executed until
# end_time_bert is recorded counts towards the BERT timing.
start_time_bert = time.time()

In [None]:
# Read the same CSV again and rebind `df`. Because this runs after start_time_bert
# is taken, the file read is included in the BERT timing.
# NOTE: despite the variable name, the file is read as CSV (pd.read_csv), not Excel.
excel_file_path = "datasetfile.csv"
df = pd.read_csv(excel_file_path, encoding='utf-8')

In [None]:
# Load the pretrained BERT tokenizer and encoder ('bert-base-uncased').
# NOTE: this rebinds `model`, which previously referred to the Sentence
# Transformer; cells that expect the Sentence Transformer must not be re-run
# after this point without reloading it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Embed every description with BERT, one text per forward pass: tokenize, run the
# model, then average the token states into a single vector stored as a NumPy array.
encoded_descriptions = []
with torch.no_grad():  # inference only, so no autograd bookkeeping is needed
    for description in df['Description'].tolist():
        encoded_input = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**encoded_input)
        # Mean over the token axis, then drop the batch axis of size one.
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        encoded_descriptions.append(embeddings.numpy())

In [None]:
# Wrap each per-description NumPy vector as a tensor, stack them along a new
# leading axis, and expose the stacked result as a NumPy array.
encoded_descriptions_tensors = list(map(torch.from_numpy, encoded_descriptions))
embedding_array = torch.stack(encoded_descriptions_tensors, dim=0).numpy()


In [None]:
# Rank all descriptions against the query with BERT, then record the end timestamp
# for the BERT experiment.
# NOTE: the query here differs from the one used in the Sentence Transformers cell
# ("Trade"), so the two ranked result lists answer different questions.
# NOTE: `sorted_indices` is overwritten; the Sentence Transformers ranking is no
# longer available after this cell runs.
query = "Historical era"
descriptions_tensor = torch.tensor(embedding_array)
sorted_indices = dense_retrieval_bert(query, descriptions_tensor, tokenizer, model)
end_time_bert = time.time()

In [None]:
# Show the ten best-matching descriptions, best match first.
# `sorted_indices` holds row positions (the order in which descriptions were
# embedded), so rows are looked up by position with .iloc. Plain [] on a Series is
# label-based and would return the wrong row, or raise KeyError, whenever df's
# index is not the default 0..n-1 range (e.g. after filtering rows).
print("Top results using BERT:")
for idx in sorted_indices[:10].tolist():
    print(df['Description'].iloc[idx])

In [None]:
# Elapsed wall-clock seconds per experiment (end timestamp minus start timestamp).
# Each interval spans every cell executed between its two time.time() calls, so it
# also includes any pause between running those cells.
time_taken_sentence_transformers = end_time_sentence_transformers - start_time
time_taken_bert = end_time_bert - start_time_bert


In [None]:
# Report the measured duration of each experiment in seconds.
print("Time taken by Sentence Transformers:", time_taken_sentence_transformers, "seconds")
print("Time taken by BERT:", time_taken_bert, "seconds")

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart comparing the measured duration of the two experiments.
labels = ['Sentence Transformers', 'BERT']
times = [time_taken_sentence_transformers, time_taken_bert]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(labels, times, color=['blue', 'orange'])
ax.set_title('Execution Time Comparison')
ax.set_xlabel('Models')
ax.set_ylabel('Time (seconds)')
plt.show()


In [None]:
# (Stray input removed: this cell held a bare undefined name, which raised
# NameError and stopped "Run All" at the end of the notebook.)