In [None]:
# Implement Transformer for Medical Text QA

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import re
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch

# ---- Data -------------------------------------------------------------
# Article table; the 'AB' column (abstract text) is what gets embedded below.
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Normalise abstracts to lower-case strings.
# Note: astype(str) turns missing values into the literal text 'nan'.
df['AB'] = df['AB'].astype(str).str.lower()

# ---- Model ------------------------------------------------------------
# Pre-trained uncased BERT, used both for tokenising and for embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# ---- Cache location ---------------------------------------------------
# File in which the abstract embeddings are stored between runs.
embeddings_path = 'precomputed_embeddings.npy'

# ---- Text-normalisation helpers ----------------------------------------
# English stop-word set and Porter stemmer from NLTK.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Function to preprocess and embed text in batches
def preprocess_and_embed_batch(texts, batch_size=32):
    """Embed ``texts`` in chunks of ``batch_size`` and stack the results.

    Args:
        texts: Iterable of strings (list, pandas Series, generator, ...).
            It is materialised into a list so that chunking is always
            positional and works for inputs without ``len()``.
        batch_size: Number of texts passed to ``preprocess_and_embed`` at once.

    Returns:
        A 2-D NumPy array with one row per input text, in input order.
        For empty input an array of shape ``(0, 0)`` is returned instead of
        letting ``np.concatenate`` fail on an empty list.
    """
    texts = list(texts)
    if not texts:
        # Nothing to embed; the embedding width is unknown without running
        # the model, so return a zero-row placeholder.
        return np.empty((0, 0), dtype=np.float32)

    embeddings_list = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Computing embeddings in batches"):
        batch_texts = texts[i:i + batch_size]
        embeddings_list.append(preprocess_and_embed(batch_texts))

    return np.concatenate(embeddings_list, axis=0)

# Function to preprocess and embed text
def preprocess_and_embed(texts):
    """Return the BERT [CLS] embedding for each text.

    Args:
        texts: A single string or an iterable of strings. A single string is
            treated as a one-element batch; without this it would be iterated
            character by character and every character embedded separately.

    Returns:
        NumPy array with one row per text, taken from position 0 ([CLS]) of
        the model's last hidden state.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Drop stop words and stem the remaining tokens.
    # NOTE(review): the tokens come from the BERT tokenizer, so stemming and
    # re-joining with spaces operates on its sub-word pieces. Left unchanged
    # so that previously cached embeddings stay comparable.
    processed_texts = [
        ' '.join(
            stemmer.stem(token)
            for token in tokenizer.tokenize(text)
            if token not in stop_words
        )
        for text in texts
    ]

    # Tokenize and encode the cleaned text (padded to the longest item,
    # truncated to 512 tokens).
    inputs = tokenizer(processed_texts, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # no_grad() only disables gradient tracking for inference; it does not
    # move anything to a GPU.
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

    return embeddings

def compute_and_save_embeddings(df, embeddings_path, incremental=False):
    """Compute embeddings for ``df['AB']`` and save them to ``embeddings_path``.

    Args:
        df: DataFrame whose 'AB' column holds the texts to embed.
        embeddings_path: Destination ``.npy`` file (overwritten).
        incremental: When True and the file already exists, only rows that
            come after the already-saved ones are embedded and appended.

    The saved file is a plain array with no record of which dataframe rows
    produced it, so incremental mode relies on row order: row ``n`` of the
    array is taken to belong to row ``n`` of ``df``, and new articles are
    assumed to be appended at the end of ``df``.
    """
    if incremental and os.path.exists(embeddings_path):
        # Load existing embeddings
        existing_embeddings = load_embeddings(embeddings_path)

        # Rows beyond the saved count are the ones not embedded yet.
        # (The loaded value is a NumPy array and has no ``.index``.)
        new_abstracts = df['AB'].iloc[len(existing_embeddings):]

        if not new_abstracts.empty:
            new_embeddings = preprocess_and_embed_batch(new_abstracts)
            updated_embeddings = np.concatenate([existing_embeddings, new_embeddings], axis=0)
        else:
            # Nothing new to process
            updated_embeddings = existing_embeddings

    else:
        # Process all abstracts
        updated_embeddings = preprocess_and_embed_batch(df['AB'])

    # Save updated embeddings
    np.save(embeddings_path, updated_embeddings)

def load_embeddings(embeddings_path):
    """Read the array stored at ``embeddings_path`` with ``np.load`` and return it."""
    stored_array = np.load(embeddings_path)
    return stored_array

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that shares a content token with ``query``.

    Args:
        query: Question text.
        abstract: Abstract text to search.

    Returns:
        The first matching sentence, or None when no sentence matches.

    Only content-bearing query tokens are used for matching: stop words and
    pure punctuation tokens (e.g. "the", "?") occur in almost every sentence,
    so matching on them would nearly always return the first sentence. If the
    query has no content tokens at all, every query token is used instead.
    """
    query_tokens = tokenizer.tokenize(query)
    content_tokens = {
        token for token in query_tokens
        if token not in stop_words and any(ch.isalnum() for ch in token)
    }
    if not content_tokens:
        content_tokens = set(query_tokens)

    # Split after '.' or '?' followed by whitespace; the look-behinds avoid
    # splitting inside patterns such as "e.g." or "Dr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        if content_tokens.intersection(tokenizer.tokenize(sentence)):
            return sentence

    return None

def search_engine(query, df, embeddings, num_results=5):
    """Rank the articles in ``df`` by cosine similarity to ``query``.

    Args:
        query: Free-text question.
        df: DataFrame with 'TI' (title) and 'AB' (abstract) columns; row ``n``
            must correspond to row ``n`` of ``embeddings``.
        embeddings: 2-D array of abstract embeddings, one row per article.
        num_results: Maximum number of articles to return.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
        Empty list when there are no embeddings.
    """
    # Preprocess the query
    query = query.lower()

    embeddings = np.asarray(embeddings)
    if embeddings.shape[0] == 0:
        return []

    # Embed the query as a one-element batch. Passing the bare string would
    # embed each character separately, and only the first character's vector
    # would end up being compared.
    query_embedding = preprocess_and_embed([query])

    # One vectorised call instead of one cosine_similarity call per article.
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Highest score first; the stable sort keeps the original row order for ties.
    top_indices = np.argsort(-similarities, kind='stable')[:num_results]

    # Collect title, score and answer sentence for each selected article
    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        answer_sentence = extract_answer_sentence(query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Build the embedding cache on the first run only; later runs reuse the file.
cache_missing = not os.path.exists(embeddings_path)
if cache_missing:
    compute_and_save_embeddings(df, embeddings_path)

# Read the cached embeddings back from disk
embeddings = load_embeddings(embeddings_path)

# Example usage: print title, score and answer sentence for each hit
query = "What is the treatment for cancer?"
top_articles = search_engine(query, df, embeddings)
for title, score, answer_sentence in top_articles:
    print(
        f"Title: {title}, Score: {score}\n"
        f"Answer Sentence: {answer_sentence}\n"
    )


In [None]:
# Run the same question twice -- first with misspellings, then spelled
# correctly -- and print both result lists separated by a rule.
for pass_number, query in enumerate((
    "What is the treatent for lung caner?",    # with misspelling
    "What is the treatment for lung cancer?",  # without misspelling
)):
    if pass_number > 0:
        print("="*100)
    top_articles = search_engine(query, df, embeddings)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")

In [None]:
# Embedding the abstracts using BERT and saving them to a file

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the abstracts
def encode_abstracts_sliding_window(abstracts, window_size=512, stride=256):
    """Embed each abstract with BERT using overlapping token windows.

    Every abstract is tokenized, cut into windows that overlap by
    ``window_size - stride`` tokens, each window is mean-pooled over its
    tokens, and the window vectors of one abstract are averaged.

    Args:
        abstracts: Iterable of texts. ``None``/NaN entries are embedded as
            empty text so that the output stays aligned with the input.
        window_size: Maximum encoded length of one window, including the
            special tokens the tokenizer adds (capped at the tokenizer's
            ``model_max_length``).
        stride: Token offset between the starts of consecutive windows.

    Returns:
        ``torch.Tensor`` with exactly one row per abstract, in input order
        (zero rows for empty input). Callers map row positions back to
        dataframe rows, so emitting one row per *window* -- as an earlier
        version did, including extra rows for empty windows on short texts --
        breaks that mapping. Encodings saved by that earlier version need to
        be regenerated.
    """
    max_length = min(window_size, tokenizer.model_max_length)
    # Reserve two positions for the [CLS] and [SEP] tokens added on encoding;
    # otherwise the last tokens of every full window are truncated away.
    content_size = max(max_length - 2, 1)

    encoded_abstracts = []

    for abstract in tqdm(abstracts, desc="Encoding Abstracts", unit="abstract"):
        # ``x != x`` is only true for NaN (e.g. a missing pandas value).
        if abstract is None or abstract != abstract:
            abstract = ""
        tokens = tokenizer.tokenize(str(abstract))
        total_length = len(tokens)

        # One window always; ceil(overflow / stride) further windows when the
        # text is longer than a single window.
        overflow = max(total_length - content_size, 0)
        num_windows = -(-overflow // stride) + 1

        window_vectors = []
        for start in range(0, num_windows * stride, stride):
            # Extract a window of tokens
            window_tokens = tokens[start:start + content_size]
            if start > 0 and not window_tokens:
                # Possible when stride exceeds the window; nothing left to embed.
                break

            # Convert tokens back to a string
            window_text = tokenizer.convert_tokens_to_string(window_tokens)

            # Tokenize and encode the window
            inputs = tokenizer(window_text, return_tensors="pt", padding=True,
                               truncation=True, max_length=max_length)
            with torch.no_grad():
                outputs = model(**inputs)

            # Mean over the token axis -> shape (1, hidden)
            window_vectors.append(outputs.last_hidden_state.mean(dim=1))

        # Average the windows so this abstract contributes a single row.
        encoded_abstracts.append(torch.cat(window_vectors, dim=0).mean(dim=0, keepdim=True))

    if not encoded_abstracts:
        print("No encoded abstracts found.")
        # torch.cat cannot concatenate an empty list; return a zero-row tensor.
        return torch.empty((0, model.config.hidden_size))
    return torch.cat(encoded_abstracts, dim=0)


# Function to save encoded abstracts
def save_encoded_abstracts(encoded_abstracts, filename):
    """Write ``encoded_abstracts`` to ``filename`` using ``torch.save``."""
    torch.save(obj=encoded_abstracts, f=filename)

# Function to load encoded abstracts
def load_encoded_abstracts(filename):
    """Return the object stored in ``filename``, read back with ``torch.load``."""
    restored = torch.load(filename)
    return restored

# Stage 1: encode each CSV shard (sub_data_1.csv .. sub_data_10.csv) on its
# own and write the result next to it, so a shard's output is on disk as soon
# as that shard is finished.
for i in tqdm(range(1, 11), desc="Processing Parts", unit="part"):
    file_path = f'sub_data_{i}.csv'
    df_part = pd.read_csv(file_path)

    # Embed the 'Combined_Info' text of this shard and persist it
    encoded_abstracts_part = encode_abstracts_sliding_window(df_part['Combined_Info'])
    save_encoded_abstracts(encoded_abstracts_part, f'encoded_data_part_{i}.pt')

# Stage 2: read every shard's encodings back, in shard order
encoded_abstracts_parts = [
    load_encoded_abstracts(f'encoded_data_part_{i}.pt')
    for i in tqdm(range(1, 11), desc="Loading Parts", unit="part")
]

# Stage 3: stack the shards row-wise and store the combined tensor
encoded_abstracts = torch.cat(encoded_abstracts_parts, dim=0)
torch.save(encoded_abstracts, 'encoded_data.pt')



In [None]:
"""retrieving the most similar abstracts to a question and then generating an answer based on those abstracts is a 
reasonable strategy for a Question Answering (QA) system. Although SQuAD (the Stanford Question Answering Dataset)
is typically used for training and evaluating QA models, the same approach can be adapted here:
retrieve the relevant passages or abstracts first, then generate answers from them.

Steps:

    1. Retrieve similar abstracts:
        Use a similarity measure (such as cosine similarity) to retrieve the top N abstracts most similar to a given question from the collection of abstracts.

    2. Generate answers:
        For each retrieved abstract, use a QA model to generate an answer to the question.
        Fine-tune a pre-trained QA model on the specific dataset, taking the structure of the abstracts and questions into account.

    3. Combine answers:
        Aggregate or combine the answers generated from the different abstracts to provide a final answer."""

from transformers import BertForQuestionAnswering, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm import tqdm

# Load pre-trained BERT model and tokenizer
qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


# Tokenize and encode the abstracts
def encode_abstracts_sliding_window(abstracts, window_size=512, stride=256):
    """Embed each text with BERT using overlapping token windows.

    Uses the module-level ``tokenizer`` and ``model`` (the embedding model,
    not ``qa_model``). Every text is tokenized, cut into windows that overlap
    by ``window_size - stride`` tokens, each window is mean-pooled over its
    tokens, and the window vectors of one text are averaged.

    Args:
        abstracts: Iterable of texts. ``None``/NaN entries are embedded as
            empty text so that the output stays aligned with the input.
        window_size: Maximum encoded length of one window, including the
            special tokens the tokenizer adds (capped at the tokenizer's
            ``model_max_length``).
        stride: Token offset between the starts of consecutive windows.

    Returns:
        ``torch.Tensor`` with exactly one row per input text, in input order
        (zero rows for empty input). Callers map row positions back to
        dataframe rows, so emitting one row per *window* -- as an earlier
        version did, including extra rows for empty windows on short texts --
        breaks that mapping.
    """
    max_length = min(window_size, tokenizer.model_max_length)
    # Reserve two positions for the [CLS] and [SEP] tokens added on encoding;
    # otherwise the last tokens of every full window are truncated away.
    content_size = max(max_length - 2, 1)

    encoded_abstracts = []

    for abstract in tqdm(abstracts, desc="Encoding Abstracts", unit="abstract"):
        # ``x != x`` is only true for NaN (e.g. a missing pandas value).
        if abstract is None or abstract != abstract:
            abstract = ""
        tokens = tokenizer.tokenize(str(abstract))
        total_length = len(tokens)

        # One window always; ceil(overflow / stride) further windows when the
        # text is longer than a single window.
        overflow = max(total_length - content_size, 0)
        num_windows = -(-overflow // stride) + 1

        window_vectors = []
        for start in range(0, num_windows * stride, stride):
            # Extract a window of tokens
            window_tokens = tokens[start:start + content_size]
            if start > 0 and not window_tokens:
                # Possible when stride exceeds the window; nothing left to embed.
                break

            # Convert tokens back to a string
            window_text = tokenizer.convert_tokens_to_string(window_tokens)

            # Tokenize and encode the window
            inputs = tokenizer(window_text, return_tensors="pt", padding=True,
                               truncation=True, max_length=max_length)
            with torch.no_grad():
                outputs = model(**inputs)

            # Mean over the token axis -> shape (1, hidden)
            window_vectors.append(outputs.last_hidden_state.mean(dim=1))

        # Average the windows so this text contributes a single row.
        encoded_abstracts.append(torch.cat(window_vectors, dim=0).mean(dim=0, keepdim=True))

    if not encoded_abstracts:
        print("No encoded abstracts found.")
        # torch.cat cannot concatenate an empty list; return a zero-row tensor.
        return torch.empty((0, model.config.hidden_size))
    return torch.cat(encoded_abstracts, dim=0)


# Function to load encoded abstracts
def load_encoded_abstracts(filename):
    """Return the object stored in ``filename``, read back with ``torch.load``."""
    restored = torch.load(filename)
    return restored

# Function to retrieve top k similar abstracts
def retrieve_top_k_abstracts(query, abstracts, df, k=5):
    """Return row positions of the ``k`` encoded abstracts most similar to ``query``.

    Args:
        query: Question text.
        abstracts: 2-D matrix of encoded abstracts; row ``n`` is expected to
            belong to row ``n`` of ``df``.
        df: DataFrame with a 'Combined_Info' column (its index is printed as
            the PMIDs).
        k: Number of results wanted.

    Returns:
        Array of positional row indices into ``df``, best match first, or an
        empty list when nothing can be returned.
    """
    if k <= 0:
        # A ``-0:`` slice would select every row instead of none.
        print("No matching abstracts found.")
        return []

    # Encode the query with the same encoder used for the abstracts
    query_embedding = encode_abstracts_sliding_window([query])

    # Similarity of the (first) query row against every encoded abstract
    similarities = cosine_similarity(query_embedding, abstracts)[0]

    num_rows = len(df)
    if len(similarities) != num_rows:
        # Positions only identify the right article when both have the same
        # rows in the same order; make a mismatch visible instead of silent.
        print(f"Warning: {len(similarities)} encoded rows but {num_rows} dataframe rows; "
              "retrieved positions may not refer to the intended abstracts.")

    # Best first; drop positions that do not exist in df before cutting to k
    ranked = similarities.argsort()[::-1]
    top_k_indices = ranked[ranked < num_rows][:k]

    if len(top_k_indices) == 0:
        print("No matching abstracts found.")
        return []

    # Print some information for debugging (positional lookups, so duplicate
    # PMIDs in the index cannot turn a single cell into a Series)
    selected_texts = df['Combined_Info'].iloc[top_k_indices]
    print("Top k PMIDs:", df.index[top_k_indices].tolist())
    print("Abstract lengths:", [len(text) if isinstance(text, str) else 0 for text in selected_texts])

    return top_k_indices


# Function to generate answers using the QA model
def generate_answers(question, abstracts, df):
    """Extract one answer span per retrieved abstract with the QA model.

    Args:
        question: Question text.
        abstracts: Iterable of positional row indices into ``df``.
        df: DataFrame with a 'Combined_Info' column holding the abstract text.

    Returns:
        List of answer strings, one per entry of ``abstracts`` and in the same
        order. An empty string is returned for a row whose text is missing.
    """
    answers = []
    abstract_column = df['Combined_Info']

    for index in abstracts:
        # Look the row up by position. Going through the PMID label and
        # ``.loc`` returns a Series instead of a string when PMIDs repeat.
        abstract_text = abstract_column.iloc[int(index)]
        if not isinstance(abstract_text, str):
            # Missing text (e.g. NaN): keep the output aligned with the input.
            answers.append("")
            continue

        # Encode question + abstract; if the pair is too long, shorten only
        # the abstract so the question is never cut.
        inputs = qa_tokenizer(question, abstract_text, return_tensors="pt",
                              max_length=512, truncation="only_second")

        # Perform inference with the QA model
        with torch.no_grad():
            outputs = qa_model(**inputs)

        # Most likely start / end positions; end is made exclusive for slicing.
        # If end falls before start the slice is empty and the answer is "".
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1
        answer_ids = inputs["input_ids"][0][answer_start:answer_end].tolist()
        answer = qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(answer_ids))

        answers.append(answer)

    return answers

# Abstract table, with the PMID column used as the row index
df_part = pd.read_csv('data_1.csv', index_col='PMID')
# Alternative source, kept for reference:
# df_part = pd.read_csv('articles.csv', index_col='PMID', usecols=['TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH'])


# Example usage: retrieve the closest abstracts for one question
encoded_abstracts = load_encoded_abstracts('encoded_data.pt')
question = "what is Artificial Intelligence?"
top_k_abstracts = retrieve_top_k_abstracts(
    query=question,
    abstracts=encoded_abstracts,
    df=df_part,
    k=5,
)

# Show each retrieved abstract together with its PMID
print("Top 5 Similar Abstracts:")
for index in top_k_abstracts:
    pmid = df_part.index[index]
    abstract_text = df_part.loc[pmid, 'Combined_Info']
    print("PMID:", pmid)
    print("Abstract:", abstract_text)

# Run the QA model over the retrieved abstracts
answers = generate_answers(question=question, abstracts=top_k_abstracts, df=df_part)

# Show one extracted answer per retrieved abstract
print("\nGenerated Answers:")
for answer in answers:
    print(answer)
# TODO: check whether the retrieved data is correct; it sometimes produces IDs that are not part of the dataset, e.g. query = "who is Chenq?"