# In [None]:
# Implement Transformer for Medical Text QA

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import re
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch

# Read the article table from disk
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Normalise the abstracts: coerce every value to a string, then lower-case it
df['AB'] = df['AB'].astype(str).apply(str.lower)

# Pre-trained BERT tokenizer and encoder, both from the same checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# File used to cache the abstract embeddings between runs
embeddings_path = 'precomputed_embeddings.npy'

# NLTK resources used when cleaning text: English stop words and a Porter stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Function to preprocess and embed text in batches
def preprocess_and_embed_batch(texts, batch_size=32):
    """Embed a collection of texts in fixed-size batches.

    Args:
        texts: A single string or an iterable of strings (for example a
            list or a pandas Series). A bare string is treated as one text.
        batch_size: Maximum number of texts handed to
            ``preprocess_and_embed`` per call. Must be positive.

    Returns:
        A 2-D ``numpy`` array holding one embedding row per input text, in
        input order. Empty input yields an array with zero rows and
        ``model.config.hidden_size`` columns instead of raising.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be sliced into chunks of characters.
    if isinstance(texts, str):
        texts = [texts]

    # Materialise once so slicing below is purely positional, whatever
    # container (list, Series, generator) the caller passed in.
    texts = list(texts)

    if not texts:
        # np.concatenate refuses an empty list; return an empty matrix with
        # the model's embedding width so callers can still stack or save it.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings_list = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Computing embeddings in batches"):
        batch_texts = texts[start:start + batch_size]
        embeddings_list.append(preprocess_and_embed(batch_texts))

    return np.concatenate(embeddings_list, axis=0)

# Function to preprocess and embed text
def preprocess_and_embed(texts):
    """Return the BERT ``[CLS]`` embedding of each text.

    Every text is split into tokens with the module-level tokenizer, tokens
    found in ``stop_words`` are dropped, the rest are Porter-stemmed and
    re-joined with spaces, and the resulting strings are encoded by the
    module-level BERT model.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as ONE text; without this guard it would be iterated
            character by character and embedded as many one-letter texts.

    Returns:
        A 2-D ``numpy`` array with one row per text: the last-layer hidden
        state at sequence position 0 (the ``[CLS]`` token). Empty input
        yields an array with zero rows.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Remove stop words and apply stemming.
    # NOTE(review): the tokens here are the tokenizer's sub-word pieces, so
    # stemming and re-joining them changes how the text is re-tokenized
    # below. Kept as-is so previously saved embeddings stay comparable.
    processed_texts = [
        ' '.join(
            stemmer.stem(token)
            for token in tokenizer.tokenize(text)
            if token not in stop_words
        )
        for text in texts
    ]

    if not processed_texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Tokenize and encode the cleaned text (padded to the longest item in
    # the batch, truncated at 512 tokens).
    inputs = tokenizer(processed_texts, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # Keep the inputs on whatever device the model lives on, so the forward
    # pass also works if the model has been moved off the CPU.
    inputs = inputs.to(model.device)

    # Inference only: no_grad() switches off gradient tracking. It does NOT
    # select a device or move anything to a GPU.
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings for the [CLS] token (sequence position 0).
    embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

    return embeddings

def compute_and_save_embeddings(df, embeddings_path, incremental=False):
    """Compute embeddings for the abstracts in ``df`` and save them to disk.

    Args:
        df: DataFrame with an ``'AB'`` column holding the abstract texts.
        embeddings_path: Path of the ``.npy`` file to write.
        incremental: When true and ``embeddings_path`` already exists, keep
            the stored rows and embed only the abstracts that come after
            them; otherwise every abstract is embedded from scratch.

    Returns:
        None. The resulting array is written to ``embeddings_path``.
    """
    abstracts = df['AB']

    if incremental and os.path.exists(embeddings_path):
        # Load existing embeddings
        existing_embeddings = load_embeddings(embeddings_path)

        # The saved file is a plain array and carries no row labels, so
        # stored rows are matched to DataFrame rows by position: row k of
        # the file is taken to be the embedding of the k-th abstract.
        # NOTE(review): this assumes abstracts are only ever appended to
        # the dataset, never reordered or removed -- confirm with callers.
        new_abstracts = abstracts.iloc[len(existing_embeddings):].tolist()

        if new_abstracts:
            new_embeddings = preprocess_and_embed_batch(new_abstracts)
            updated_embeddings = np.concatenate([existing_embeddings, new_embeddings], axis=0)
        else:
            # Nothing new to process
            updated_embeddings = existing_embeddings
    else:
        # Process all abstracts
        updated_embeddings = preprocess_and_embed_batch(abstracts.tolist())

    # Save updated embeddings
    np.save(embeddings_path, updated_embeddings)

def load_embeddings(embeddings_path):
    """Read the array stored in the ``.npy`` file at ``embeddings_path``."""
    stored = np.load(embeddings_path)
    return stored

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that shares a token with ``query``.

    Query tokens that are stop words or contain no letter or digit (such as
    a trailing ``?``) are ignored when matching; otherwise words like
    "the" or "is" would make almost every sentence count as a match. If
    the query consists only of such tokens, all of its tokens are used.

    Args:
        query: The search query text.
        abstract: The abstract text to scan.

    Returns:
        The first matching sentence, or ``None`` when no sentence matches.
    """
    query_tokens = tokenizer.tokenize(query)

    content_tokens = {
        token
        for token in query_tokens
        if token not in stop_words and any(ch.isalnum() for ch in token)
    }
    if not content_tokens:
        # Nothing but stop words / punctuation: fall back to every token so
        # such queries still behave as before.
        content_tokens = set(query_tokens)

    # Split after '.' or '?' followed by whitespace; the two negative
    # lookbehinds skip some abbreviation-like patterns.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        if not content_tokens.isdisjoint(tokenizer.tokenize(sentence)):
            return sentence

    return None

def search_engine(query, df, embeddings, num_results=5):
    """Rank articles by cosine similarity between the query and abstract embeddings.

    Args:
        query: Free-text question; it is lower-cased before embedding.
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract)
            columns. Row ``i`` (by position) must correspond to
            ``embeddings[i]``.
        embeddings: 2-D array with one embedding row per article.
        num_results: Maximum number of articles to return.

    Returns:
        A list of ``(title, score, answer_sentence)`` tuples ordered by
        descending similarity; ties keep their original row order.
        ``answer_sentence`` is whatever ``extract_answer_sentence`` returns
        for that abstract (possibly ``None``).
    """
    # Preprocess the query
    query = query.lower()

    if len(embeddings) == 0:
        return []

    # Pass the query as a one-element list: a bare string would be iterated
    # character by character and produce one embedding per letter.
    query_embedding = preprocess_and_embed([query])

    # One vectorised call scores the query against every abstract at once;
    # the result has shape (1, n_articles), so take its only row.
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Indices of the most similar abstracts, best first. A stable sort on
    # the negated scores keeps equal scores in their original order.
    top_indices = np.argsort(-similarities, kind='stable')[:num_results]

    # Collect title, score and answer sentence for each selected article
    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        answer_sentence = extract_answer_sentence(query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Build the embedding cache on the first run; later runs reuse the saved file
cache_missing = not os.path.exists(embeddings_path)
if cache_missing:
    compute_and_save_embeddings(df, embeddings_path)

# Read the cached embeddings back from disk
embeddings = load_embeddings(embeddings_path)

# Demo: rank the articles for a sample question and print every hit
query = "What is the treatment for cancer?"
top_articles = search_engine(query, df, embeddings)
for title, score, answer_sentence in top_articles:
    print(
        f"Title: {title}, Score: {score}",
        f"Answer Sentence: {answer_sentence}\n",
        sep="\n",
    )


# In [None]:
# Ask the same question twice -- first with typos, then spelled correctly --
# printing a divider line between the two result lists
demo_queries = (
    "What is the treatent for lung caner?",    # with misspelling
    "What is the treatment for lung cancer?",  # without misspelling
)
for position, query in enumerate(demo_queries):
    if position > 0:
        print("="*100)
    top_articles = search_engine(query, df, embeddings)
    for title, score, answer_sentence in top_articles:
        print(
            f"Title: {title}, Score: {score}",
            f"Answer Sentence: {answer_sentence}\n",
            sep="\n",
        )