In [None]:
# NO NEED TO RUN THIS CELL: THE DATA HAS ALREADY BEEN COLLECTED AND SAVED AS articles.csv

# Data crawler that behaves like a human reader: it steps through the articles one by one and saves each title, author list, abstract and reference list

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

import re
import pandas as pd
from tqdm import tqdm

# Crawl PubMed article pages one after another and dump title, authors,
# abstract and references to CSV files of at most `chunk_size` rows each.

# Navigate to the website
url = "https://pubmed.ncbi.nlm.nih.gov/?term=intelligence+%5BTitle%2Fabstract%5D&filter=simsearch1.fha&filter=years.2013-2023&sort=date&size=200"
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Specify the chunk size
chunk_size = 1000

# Create empty lists to store data
titles = []
authors = []
abstracts = []
references_list = []
not_found_pages = []

# try/finally guarantees the browser is closed even if the crawl aborts.
try:
    driver.get(url)

    # Click the first article to start
    driver.find_element(By.XPATH, "//a[@data-ga-action=1]").click()
    total_articles = driver.find_element(By.XPATH, "//*[@id='adjacent-navigation']/div[2]/a/span[1]/span[2]").text

    # Find the number of total articles: keep every digit (the counter text
    # may contain separators or labels), not just the digits 3-9.
    total_articles = re.sub(r"[^0-9]", '', total_articles)
    article_count = int(total_articles)

    for page in tqdm(range(article_count)):
        # Extract title of the article ('' when missing or hidden)
        try:
            title_element = driver.find_element(By.CLASS_NAME, "heading-title")
            title = title_element.text if title_element.is_displayed() else ''
        except NoSuchElementException:
            title = ''

        # Extract authors of the article; find_elements returns an empty list
        # instead of raising when nothing matches.
        author_list = [author.text for author in driver.find_elements(By.CLASS_NAME, "full-name")]

        # Extract abstract of the article ('' when missing or hidden)
        try:
            abstract_element = driver.find_element(By.ID, "eng-abstract")
            abstract = abstract_element.text if abstract_element.is_displayed() else ''
        except NoSuchElementException:
            abstract = ''

        # Extract the references. Reset per article so a value from the
        # previous page can never leak into this row.
        references = ''
        try:
            reference = driver.find_element(By.ID, "references")
        except NoSuchElementException:
            reference = None
        if reference is not None:
            # A missing "show-all" button must not discard the references
            # that are present, so it is looked up without raising.
            show_all_elements = driver.find_elements(By.CLASS_NAME, "show-all")
            if show_all_elements and show_all_elements[0].is_displayed():
                show_all_elements[0].click()
            if reference.is_displayed():
                try:
                    references = driver.find_element(By.CLASS_NAME, "references-list").text
                except NoSuchElementException:
                    references = ''

        # Append data to lists
        titles.append(title)
        authors.append(author_list)
        abstracts.append(abstract)
        references_list.append(references)

        is_last_article = page + 1 == article_count
        if (page + 1) % chunk_size == 0 or is_last_article:
            # Create a DataFrame
            df = pd.DataFrame({
                'Title': titles,
                'Authors': authors,
                'Abstracts': abstracts,
                'References': references_list,
            })

            # Save DataFrame to CSV. Ceiling division gives the trailing
            # partial chunk its own number instead of overwriting the file
            # of the last full chunk.
            chunk_number = -(-(page + 1) // chunk_size)
            csv_filename = f'pubmed_data_chunk_{chunk_number}.csv'
            df.to_csv(csv_filename, index=False)

            # Clear lists for the next chunk
            titles = []
            authors = []
            abstracts = []
            references_list = []

        # Navigate to the next article (nothing to navigate to after the last one)
        if is_last_article:
            break
        try:
            next_page = driver.find_element(By.XPATH, "//div[@class='next side-link visible']")
            if next_page.is_displayed():
                next_page.click()
            else:
                # A hidden link means the page did not advance; record it.
                not_found_pages.append(page)
                print(f"{page = } not found!")
        except NoSuchElementException:
            not_found_pages.append(page)
            print(f"{page = } not found!")
finally:
    # Close the browser
    driver.quit()

In [None]:
# Data Preprocessing to have a CSV file with the following columns and respective data in each row:
# which is then filtered down to the useful columns and de-duplicated by PMID

"""
df shape before cleaning:(74243, 77)
df shape after cleaning:(57560, 15)

Index(['PMID', 'STAT', 'DRDT', 'CTDT', 'PB', 'DP', 'TI', 'BTI', 'AB', 'CI',
       'FED', 'ED', 'FAU', 'AU', 'AD', 'LA', 'PT', 'PL', 'OTO', 'OT', 'EDAT',
       'CRDT', 'AID', 'OWN', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'PG', 'LID',
       'DEP', 'TA', 'JT', 'JID', 'SB', 'MH', 'MHDA', 'PHST', 'PST', 'SO', 'GR',
       'PMC', 'MID', 'COIS', 'TT', 'RN', 'OID', 'SI', 'ISBN', 'CTI', 'CN',
       'FIR', 'IR', 'AUID', 'EIN', 'CIN', 'PS', 'FPS', 'CON', 'UOF', 'UIN',
       'RIN', 'IRAD', 'EFR', 'OAB', 'OABL', 'PMCR', 'CP', 'ECI', 'DRIN', 'RF',
       'EN', 'ROF', 'RPI', 'RPF', 'DDIN'],
      dtype='object')


Guide to the abbreviations:

    PMID: PubMed IDentifier - Unique identifier for a PubMed record.

    TI: Title - The title of the article.

    AB: Abstract - A brief summary of the article's content.

    PB: Publisher - The organization responsible for publishing the article.

    FAU: Full Author(s) - Full names of the authors.

    FED: Full Editor(s) - Full names of the editors.

    DP: Date of Publication - Date when the article was published.

    OTO: Other Term Owner - Owner of other terms.

    OT: Other Term - Additional terms or keywords associated with the article.

    OWN: Owner - Owner of the article.

    DCOM: Date Completed - Date when NLM completed processing of the record.

    LR: Last Revision - Last revision date.

    JT: Journal Title - Full title of the journal.

    MH: MeSH Terms - Medical Subject Headings.

    ISBN: International Standard Book Number - ISBN of the article.

[Removed]    STAT: Status Tag - Indicates the processing status of the record.

[Removed]    DRDT: Date Revised - Date when the book record was revised.

[Removed]    CTDT: Contribution Date - Date of the book contribution.

[Removed]    BTI: Book Title - Title of the book the record belongs to.

[Removed]    CI: Copyright Information - Information about the copyright holder.

[Removed]    ED: Editor - Abbreviation for the editor.

[Removed]    AU: Author - Abbreviation for the author.

[Removed]    AD: Author's Affiliation - Affiliation or institution of the author.

[Removed]    LA: Language - Language of the article.

[Removed]    PT: Publication Type - Type of publication (e.g., Review, Book Chapter).

[Removed]    PL: Place of Publication - Location where the article was published.

[Removed]    EDAT: Entrez Date - Date the record was added to the Entrez database.

[Removed]    CRDT: Create Date - Date the record was created.

[Removed]    AID: Article Identifier - Identifier associated with the article.

[Removed]    IS: ISSN - International Standard Serial Number of the journal.

[Removed]    VI: Volume - Volume number of the journal.

[Removed]    IP: Issue - Number of the issue, part, or supplement of the journal.

[Removed]    PG: Page - Page number.

[Removed]    LID: Location IDentifier - Identifier for the location of the article.

[Removed]    DEP: Date of Electronic Publication - Date of electronic publication.

[Removed]    TA: Journal Title (ISO abbreviation) - Title abbreviation of the journal.

[Removed]    JID: Journal ID - Identifier for the journal.

[Removed]    SB: Subset - Subset designation.

[Removed]    MHDA: MeSH Date - MeSH date.

[Removed]    PHST: Publication History Status - Publication history status.

[Removed]    PST: Publication Status - Publication status.

[Removed]    SO: Source - Source of the article.

[Removed]    GR: Grant - Grant information.

[Removed]    PMC: PubMed Central ID - Identifier for PubMed Central.

[Removed]    MID: Manuscript ID - Identifier for the manuscript.

[Removed]    COIS: Conflict of Interest Statement - Statement about potential conflicts of interest.

[Removed]    TT: Transliterated Title - Title of the article in its original language.

[Removed]    RN: Registry Number - Registry number.

[Removed]    OID: Other ID - Identification numbers provided by organizations supplying citation data.

[Removed]    SI: Secondary Source ID - Secondary source identifier.

[Removed]    CTI: Collection Title - Title of the collection a book belongs to.

[Removed]    CN: Corporate Author - Corporate (collective) author of the article.

[Removed]    FIR: Full Investigator(s) - Full names of the investigators.

[Removed]    IR: Investigator - Abbreviation for the investigator.

[Removed]    AUID: Author ID - Identifier for the author.

[Removed]    EIN: Erratum In - Citation of the erratum published for this article.

[Removed]    CIN: Comment In - Citation of a comment published about this article.

[Removed]    PS: Personal Name as Subject - Personal name as subject.

[Removed]    FPS: Full Personal Name as Subject - Full personal name as subject.

[Removed]    CON: Comment On - Citation of the article this record comments on.

[Removed]    UOF: Update Of - Citation of the article this record updates.

[Removed]    UIN: Update In - Citation of the article that updates this record.

[Removed]    RIN: Retraction In - Citation of the retraction notice for this article.

[Removed]    IRAD: Investigator Affiliation - Affiliation of the investigator.

[Removed]    EFR: Erratum For - Citation of the article this erratum corrects.

[Removed]    OAB: Other Abstract - Abstract supplied by an organization other than the publisher.

[Removed]    OABL: Other Abstract Language - Language of the other abstract.

[Removed]    PMCR: PubMed Central Release - PubMed Central release information.

[Removed]    CP: Chapter - Chapter of the book.

[Removed]    ECI: Expression of Concern In - Citation of the expression of concern about this article.

[Removed]    DRIN: Dataset Use Reported In - Citation of the article reporting use of this dataset.

[Removed]    RF: Number of References - Number of bibliographic references (review articles).

[Removed]    EN: Edition - Edition of the book.

[Removed]    ROF: Retraction Of - Citation of the article this record retracts.

[Removed]    RPI: Republished In - Citation of the republished version of this article.

[Removed]    RPF: Republished From - Citation of the article this record was republished from.

[Removed]    DDIN: Dataset Described In - Citation of the article describing this dataset.
"""



import re
import pandas as pd

# Read the whole raw export into memory.
with open('../crawler/articles.txt', mode='r', encoding='utf-8') as f:
    input_text = f.read()

# Records are separated by a line that holds nothing but a double quote.
record_separator = re.compile(r'\n"\n')
articles = record_separator.split(input_text.strip())

def extract_data(article):
    """Parse one tagged article record into a ``{tag: value}`` dictionary.

    A field starts on a line of the form ``TAG - value`` (2-4 upper-case
    letters, optional padding, a dash and a space). Lines that do not match
    are treated as continuations of the current field.

    Args:
        article: Text of a single record.

    Returns:
        dict mapping each tag to its value. Values of a tag that occurs
        several times are joined with ``'|'``; wrapped continuation lines are
        joined with a single space.
    """
    data = {}
    current_key = None
    current_value = ''

    def store(key, value):
        # Accumulate repeated tags instead of overwriting earlier values.
        value = value.strip()
        if key in data:
            data[key] += '|' + value
        else:
            data[key] = value

    for line in article.split('\n'):
        # matching the key-value pair
        match = re.match(r'^([A-Z]{2,4})\s*- (.+)$', line)

        if match:
            if current_key:
                store(current_key, current_value)
            current_key, current_value = match.groups()
        elif current_key:
            continuation = line.strip()
            if continuation:
                # Wrapped lines break between words, so re-insert the space
                # that the line break replaced.
                current_value += ' ' + continuation

    # Save the last key-value pair (through the same path, so a repeated
    # final tag is appended rather than replacing the earlier values).
    if current_key:
        store(current_key, current_value)

    return data

# Parse every record into a tag -> value dictionary.
article_data_list = list(map(extract_data, articles))

# Records without an abstract ('AB') are of no use for the search engine.
filtered_data_list = [record for record in article_data_list if 'AB' in record]

# Columns that are kept: df shape before cleaning:(74243, 77)
useful_columns = ['PMID', 'TI', 'AB', 'PB', 'FAU', 'FED', 'DP', 'OTO', 'OT', 'OWN', 'DCOM', 'LR', 'JT', 'MH', 'ISBN']

# One row per PMID (first occurrence wins): df shape after cleaning:(57560, 15)
df = pd.DataFrame(filtered_data_list)
df = df[useful_columns].drop_duplicates(subset='PMID', keep='first')

# Persist the cleaned table.
df.to_csv('../crawler/articles.csv', index=False)


In [None]:
# a basic TF-IDF approach and the pandas library for data manipulation:
# which will search for the top 5 most similar articles to a given query.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the cleaned article table.
data = pd.read_csv('articles.csv')

# Missing abstracts are indexed as empty documents.
abstract_corpus = data['AB'].fillna('')

# Word-level TF-IDF index over the abstracts, English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(abstract_corpus)

def search(query, tfidf_matrix, data, num_results=5):
    """Print the titles of the abstracts most similar to *query*.

    The query is vectorised with the module-level ``tfidf_vectorizer`` and
    compared against every row of *tfidf_matrix*.

    Args:
        query: Free-text search string.
        tfidf_matrix: TF-IDF matrix with one row per row of *data*.
        data: DataFrame with a ``'TI'`` (title) column.
        num_results: How many of the best matches to print (default 5, the
            value that used to be hard-coded).
    """
    query_vector = tfidf_vectorizer.transform([query])
    cosine_similarities = linear_kernel(query_vector, tfidf_matrix).flatten()

    # Rank documents by score, best first.
    document_scores = sorted(
        enumerate(cosine_similarities), key=lambda pair: pair[1], reverse=True
    )

    # Print the top N results (a non-positive N prints nothing).
    for idx, score in document_scores[:max(num_results, 0)]:
        print(f"Title: {data['TI'].iloc[idx]}, Score: {score}")


In [None]:
# Try the word-level TF-IDF search with a sample query.
search(query="IQ scores", tfidf_matrix=tfidf_matrix, data=data)

In [None]:
# Search engine with capability of the spell check and correcting the misspelling

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')

# Article table produced by the preprocessing cell.
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# English stop words as a set for fast membership tests.
stop_words = {word for word in stopwords.words('english')}

def remove_stopwords(text):
    """Return *text* without the whitespace-separated tokens whose lower-case
    form is in the module-level ``stop_words`` set."""
    kept_words = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Spell checker used to repair query tokens.
spell = SpellChecker()

# Lower-case the abstracts, then strip the stop words.
df['AB'] = df['AB'].astype(str).str.lower().apply(remove_stopwords)

# Character n-gram (1-3) TF-IDF index over the abstracts.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

def correct_spelling(query):
    """Return *query* with each whitespace-separated token spell-corrected.

    Uses the module-level ``spell`` checker. When the checker has no
    suggestion for a token it returns ``None``; the original token is kept in
    that case so that joining the tokens cannot fail.

    Args:
        query: Raw query string.

    Returns:
        The corrected query as a single space-joined string.
    """
    corrected_tokens = []
    for token in query.split():
        suggestion = spell.correction(token)
        corrected_tokens.append(suggestion if suggestion else token)

    return ' '.join(corrected_tokens)

def extract_answer_sentence(query, abstract):
    """Return the first sentence of *abstract* that mentions a query term.

    Query terms are the lower-cased, whitespace-separated tokens of *query*
    with surrounding punctuation removed, so "cancer?" matches "cancer".
    Matching is by substring on the lower-cased sentence.

    Args:
        query: Query string.
        abstract: Abstract text.

    Returns:
        The matching sentence, or ``None`` when nothing matches, the query
        has no usable term, or *abstract* is not a string.
    """
    import string  # local import: not part of this cell's import block

    if not isinstance(abstract, str):
        return None

    # Strip punctuation and drop tokens that become empty: an empty string
    # is a substring of every sentence and would match anything.
    stripped = (token.strip(string.punctuation) for token in query.lower().split())
    query_tokens = [token for token in stripped if token]
    if not query_tokens:
        return None

    # Split after '.' or '?', skipping abbreviation-like patterns.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        lowered = sentence.lower()
        if any(token in lowered for token in query_tokens):
            return sentence

    return None


def search_engine(query, df, tfidf_matrix, num_results=5):
    """Rank abstracts against a spell-corrected query.

    Args:
        query: Raw user query (may contain misspellings).
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract) columns,
            row-aligned with *tfidf_matrix*.
        tfidf_matrix: Matrix produced by the module-level ``vectorizer``.
        num_results: Maximum number of hits to return.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
    """
    # A slice of [-0:] would select every row, so handle this explicitly.
    if num_results <= 0:
        return []

    # Correct misspellings in the query
    corrected_query = correct_spelling(query)

    # Vectorize the corrected query using the same TF-IDF vectorizer
    query_vector = vectorizer.transform([corrected_query])

    # Calculate cosine similarity between the query and dataset abstracts
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the indices of the top N most similar abstracts, best first
    top_indices = similarities.argsort()[-num_results:][::-1]

    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        # Use the corrected query so a misspelled term can still locate its
        # sentence in the abstract.
        answer_sentence = extract_answer_sentence(corrected_query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Run a sample query and show the ranked hits.
query = "What is the treatment for cancer?"
top_articles = search_engine(query=query, df=df, tfidf_matrix=tfidf_matrix)
for article in top_articles:
    title, score, answer_sentence = article
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Search engine with capability of the spell check and correcting the misspelling using TF-IDF vectorizer for character-level embeddings
# as well as enriching the search by adding synonyms for the nouns 

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import spacy

# # Load spaCy English language model
# nlp = spacy.load("en_core_web_sm")

# Download NLTK resources
# import nltk
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# Spell checker used to repair query tokens.
spell = SpellChecker()

# Article table produced by the preprocessing cell.
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Lower-case every abstract (non-string cells are stringified first).
df['AB'] = df['AB'].astype(str).str.lower()

# Character n-gram (1-3) TF-IDF index over the abstracts.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

# NLTK WordNet synonym extraction
def get_synonyms(word, pos=None):
    """Return the WordNet lemma names for *word*, without duplicates.

    Args:
        word: Word to look up.
        pos: Optional two-letter POS prefix; ``'NN'`` maps to WordNet nouns
            and ``'VB'`` to verbs, any other non-empty value falls back to
            nouns. ``None`` searches every part of speech.

    Returns:
        List of lemma names in the order WordNet yields them, so that taking
        the first N synonyms gives the same result on every run. Empty when
        *word* is empty or ``None``.
    """
    if not word:
        return []

    # Map POS tags to WordNet POS tags
    pos_mapping = {'NN': 'n', 'VB': 'v'}
    # Named wordnet_pos so the imported nltk.pos_tag is not shadowed.
    wordnet_pos = pos_mapping.get(pos, 'n') if pos else None

    # A dict keeps insertion order while de-duplicating; a set would make
    # the order vary between interpreter runs.
    synonyms = {}
    for syn in wordnet.synsets(word, pos=wordnet_pos):
        for lemma in syn.lemmas():
            synonyms.setdefault(lemma.name(), None)
    return list(synonyms)

# Function to replace words in a sentence with their synonyms
def replace_with_synonyms(sentence, pos_tags, max_synonyms=1):
    """Replace each noun of *sentence* with up to *max_synonyms* synonyms.

    Args:
        sentence: Text to enrich; tokenised with ``word_tokenize``.
        pos_tags: ``(token, tag)`` pairs, matched to the tokens by position.
        max_synonyms: Maximum number of synonyms substituted per noun.

    Returns:
        The space-joined tokens after substitution.
    """
    tokens = word_tokenize(sentence)
    for i, word in enumerate(tokens):
        # Check if the word has a corresponding POS tag
        pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
        # Only nouns (tags starting with 'NN') are enriched.
        if not (pos_tag_word and pos_tag_word.startswith('NN')):
            continue
        # The checker returns None when it has no suggestion; keep the
        # original word rather than passing None to WordNet.
        corrected_word = spell.correction(word) or word
        synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])  # Use the first two characters of the tag
        print(synonyms)
        if synonyms:
            # Replace the word with up to max_synonyms synonyms
            tokens[i] = ' '.join(synonyms[:max_synonyms])
    return ' '.join(tokens)

# # Function to replace words in a sentence with their synonyms
# def replace_with_synonyms(sentence, max_synonyms=1):
#     tokens = word_tokenize(sentence)
    
#     # Use spaCy for part-of-speech tagging
#     pos_tags = [(token.text, token.pos_) for token in nlp(sentence)]
    
#     for i in range(len(tokens)):
#         word = tokens[i]
#         # Check if the word has a corresponding POS tag
#         pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
#         # If the word has a specific POS tag (e.g., noun), get synonyms
#         if pos_tag_word in ['NOUN']:
#             corrected_word = spell.correction(word)
#             synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])
#             print(synonyms)
#             if synonyms:
#                 # Replace the word with up to max_synonyms synonyms
#                 tokens[i] = ' '.join(synonyms[:max_synonyms])
#     return ' '.join(tokens)

def correct_spelling(query):
    """Return *query* with each whitespace-separated token spell-corrected.

    Uses the module-level ``spell`` checker. When the checker has no
    suggestion for a token it returns ``None``; the original token is kept in
    that case so that joining the tokens cannot fail.

    Args:
        query: Raw query string.

    Returns:
        The corrected query as a single space-joined string.
    """
    corrected_tokens = []
    for token in query.split():
        suggestion = spell.correction(token)
        corrected_tokens.append(suggestion if suggestion else token)

    return ' '.join(corrected_tokens)

def extract_answer_sentence(query, abstract):
    """Return the first sentence of *abstract* that mentions a query term.

    Query terms are the lower-cased, whitespace-separated tokens of *query*
    with surrounding punctuation removed, so "cancer?" matches "cancer".
    Matching is by substring on the lower-cased sentence.

    Args:
        query: Query string.
        abstract: Abstract text.

    Returns:
        The matching sentence, or ``None`` when nothing matches, the query
        has no usable term, or *abstract* is not a string.
    """
    import string  # local import: not part of this cell's import block

    if not isinstance(abstract, str):
        return None

    # Strip punctuation and drop tokens that become empty: an empty string
    # is a substring of every sentence and would match anything.
    stripped = (token.strip(string.punctuation) for token in query.lower().split())
    query_tokens = [token for token in stripped if token]
    if not query_tokens:
        return None

    # Split after '.' or '?', skipping abbreviation-like patterns.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        lowered = sentence.lower()
        if any(token in lowered for token in query_tokens):
            return sentence

    return None

def search_engine(query, df, tfidf_matrix, num_results=5, use_synonyms=False, max_synonyms=1):
    """Rank abstracts against a spell-corrected, optionally enriched query.

    Args:
        query: Raw user query (may contain misspellings).
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract) columns,
            row-aligned with *tfidf_matrix*.
        tfidf_matrix: Matrix produced by the module-level ``vectorizer``.
        num_results: Maximum number of hits to return.
        use_synonyms: When true, nouns of the corrected query are replaced
            by WordNet synonyms before vectorising.
        max_synonyms: Maximum number of synonyms substituted per noun.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
    """
    # A slice of [-0:] would select every row, so handle this explicitly.
    if num_results <= 0:
        return []

    # Correct misspellings in the query
    corrected_query = correct_spelling(query)

    # Optionally replace words with synonyms
    if use_synonyms:
        # Perform part-of-speech tagging
        pos_tags = pos_tag(word_tokenize(corrected_query))
        enriched_query = replace_with_synonyms(corrected_query, pos_tags, max_synonyms=max_synonyms)
    else:
        enriched_query = corrected_query

    # Vectorize the enriched query using the same TF-IDF vectorizer
    query_vector = vectorizer.transform([enriched_query])

    # Calculate cosine similarity between the query and dataset abstracts
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the indices of the top N most similar abstracts, best first
    top_indices = similarities.argsort()[-num_results:][::-1]

    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        # Use the corrected query so a misspelled term can still locate its
        # sentence in the abstract.
        answer_sentence = extract_answer_sentence(corrected_query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Run a sample query with synonym enrichment switched on.
query = "What is the treatment for cancer?"
use_synonyms = True  # whether nouns are replaced by synonyms
max_synonyms = 2  # upper bound on synonyms substituted per word
top_articles = search_engine(
    query,
    df,
    tfidf_matrix,
    use_synonyms=use_synonyms,
    max_synonyms=max_synonyms,
)
for article in top_articles:
    title, score, answer_sentence = article
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Three demo runs, separated by a ruler line:
#   1. misspelled query, no synonyms
#   2. correctly spelled query, no synonyms
#   3. correctly spelled query, synonyms enabled
demo_runs = [
    ("What is the treatent for lung caner?", {}),
    ("What is the treatment for lung cancer?", {}),
    ("What is the treatment for lung cancer?", {"use_synonyms": True}),
]
for run_number, (query, options) in enumerate(demo_runs):
    if run_number > 0:
        print("="*100)
    top_articles = search_engine(query, df, tfidf_matrix, **options)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Search engine with capability of the spell check and correcting the misspelling using TF-IDF vectorizer for character-level embeddings
# and edit distance metric-based approach using the Levenshtein distance algorithm
# as well as enriching the search by adding synonyms for the nouns 

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import Levenshtein as lev

# Article table produced by the preprocessing cell.
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Lower-case every abstract (non-string cells are stringified first).
df['AB'] = df['AB'].astype(str).str.lower()

# Character n-gram (1-3) TF-IDF index over the abstracts.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

# NLTK WordNet synonym extraction
def get_synonyms(word, pos=None):
    """Return the WordNet lemma names for *word*, without duplicates.

    Args:
        word: Word to look up.
        pos: Optional two-letter POS prefix; ``'NN'`` maps to WordNet nouns
            and ``'VB'`` to verbs, any other non-empty value falls back to
            nouns. ``None`` searches every part of speech.

    Returns:
        List of lemma names in the order WordNet yields them, so that taking
        the first N synonyms gives the same result on every run. Empty when
        *word* is empty or ``None``.
    """
    if not word:
        return []

    pos_mapping = {'NN': 'n', 'VB': 'v'}
    # Named wordnet_pos so the imported nltk.pos_tag is not shadowed.
    wordnet_pos = pos_mapping.get(pos, 'n') if pos else None

    # A dict keeps insertion order while de-duplicating; a set would make
    # the order vary between interpreter runs.
    synonyms = {}
    for syn in wordnet.synsets(word, pos=wordnet_pos):
        for lemma in syn.lemmas():
            synonyms.setdefault(lemma.name(), None)
    return list(synonyms)

# Function to replace words in a sentence with their synonyms
def replace_with_synonyms(sentence, pos_tags, max_synonyms=1):
    """Replace each noun of *sentence* with up to *max_synonyms* synonyms.

    Args:
        sentence: Text to enrich; tokenised with ``word_tokenize``.
        pos_tags: ``(token, tag)`` pairs, matched to the tokens by position.
        max_synonyms: Maximum number of synonyms substituted per noun.

    Returns:
        The space-joined tokens after substitution (also printed).
    """
    tokens = word_tokenize(sentence)
    for i, word in enumerate(tokens):
        pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
        # Only nouns (tags starting with 'NN') are enriched.
        if not (pos_tag_word and pos_tag_word.startswith('NN')):
            continue
        # The checker returns None when it has no suggestion; keep the
        # original word rather than passing None to WordNet.
        corrected_word = spell.correction(word) or word
        synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])
        if synonyms:
            tokens[i] = ' '.join(synonyms[:max_synonyms])
    enriched_sentence = ' '.join(tokens)
    print(enriched_sentence)
    return enriched_sentence

def correct_spelling_edit_distance(query):
    """Correct every whitespace-separated token of *query* with
    ``correct_with_edit_distance`` and return the space-joined result."""
    return ' '.join(correct_with_edit_distance(token) for token in query.split())

def correct_with_edit_distance(token):
    """Return the vocabulary word closest to *token* by Levenshtein distance.

    Reads the module-level ``vocabulary`` and ``max_edit_distance``.

    Args:
        token: Query token to correct.

    Returns:
        The closest vocabulary word within ``max_edit_distance`` edits, or
        *token* unchanged when it is already in the vocabulary or no word is
        close enough (previously that case raised ``ValueError``).
    """
    # Exact hit: nothing can be closer than distance 0.
    if token in vocabulary:
        return token

    # Single pass: each distance is computed once and the first word with
    # the smallest distance wins.
    best_word = None
    best_distance = max_edit_distance + 1
    for word in vocabulary:
        distance = lev.distance(token, word)
        if distance < best_distance:
            best_word, best_distance = word, distance

    return token if best_word is None else best_word

def extract_answer_sentence(query, abstract):
    """Return the first sentence of *abstract* that mentions a query term.

    Query terms are the lower-cased, whitespace-separated tokens of *query*
    with surrounding punctuation removed, so "cancer?" matches "cancer".
    Matching is by substring on the lower-cased sentence.

    Args:
        query: Query string.
        abstract: Abstract text.

    Returns:
        The matching sentence, or ``None`` when nothing matches, the query
        has no usable term, or *abstract* is not a string.
    """
    import string  # local import: not part of this cell's import block

    if not isinstance(abstract, str):
        return None

    # Strip punctuation and drop tokens that become empty: an empty string
    # is a substring of every sentence and would match anything.
    stripped = (token.strip(string.punctuation) for token in query.lower().split())
    query_tokens = [token for token in stripped if token]
    if not query_tokens:
        return None

    # Split after '.' or '?', skipping abbreviation-like patterns.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        lowered = sentence.lower()
        if any(token in lowered for token in query_tokens):
            return sentence

    return None

def search_engine(query, df, tfidf_matrix, num_results=5, use_synonyms=False, max_synonyms=1):
    """Rank abstracts against an edit-distance-corrected query.

    Args:
        query: Raw user query (may contain misspellings).
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract) columns,
            row-aligned with *tfidf_matrix*.
        tfidf_matrix: Matrix produced by the module-level ``vectorizer``.
        num_results: Maximum number of hits to return.
        use_synonyms: When true, nouns of the corrected query are replaced
            by WordNet synonyms before vectorising.
        max_synonyms: Maximum number of synonyms substituted per noun.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
    """
    # A slice of [-0:] would select every row, so handle this explicitly.
    if num_results <= 0:
        return []

    corrected_query = correct_spelling_edit_distance(query)

    if use_synonyms:
        pos_tags = pos_tag(word_tokenize(corrected_query))
        enriched_query = replace_with_synonyms(corrected_query, pos_tags, max_synonyms=max_synonyms)
    else:
        enriched_query = corrected_query

    query_vector = vectorizer.transform([enriched_query])
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # argsort is ascending: take the tail and reverse for best-first order.
    top_indices = similarities.argsort()[-num_results:][::-1]

    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        # Use the corrected query so a misspelled term can still locate its
        # sentence in the abstract.
        answer_sentence = extract_answer_sentence(corrected_query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Settings read by the correction helpers defined above.
max_edit_distance = 2  # maximum edit distance accepted by the spell-checking
spell = SpellChecker(distance=max_edit_distance)

# Vocabulary: every distinct whitespace-separated token of the abstracts.
all_abstract_text = df['AB'].str.cat(sep=' ').lower()
vocabulary = set(all_abstract_text.split())

# Run a sample query with synonym enrichment switched on.
query = "What is the treatment for cancer?"
use_synonyms = True
max_synonyms = 2

top_articles = search_engine(
    query,
    df,
    tfidf_matrix,
    use_synonyms=use_synonyms,
    max_synonyms=max_synonyms,
)
for article in top_articles:
    title, score, answer_sentence = article
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Three demo runs, separated by a ruler line:
#   1. misspelled query, no synonyms
#   2. correctly spelled query, no synonyms
#   3. correctly spelled query, synonyms enabled
demo_runs = [
    ("What is the treatent for lung caner?", {}),
    ("What is the treatment for lung cancer?", {}),
    ("What is the treatment for lung cancer?", {"use_synonyms": True}),
]
for run_number, (query, options) in enumerate(demo_runs):
    if run_number > 0:
        print("="*100)
    top_articles = search_engine(query, df, tfidf_matrix, **options)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")

In [None]:
# Implement Transformer for Medical Text QA

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import re
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch

# Article table produced by the preprocessing cell.
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Lower-case every abstract (non-string cells are stringified first).
df['AB'] = df['AB'].astype(str).str.lower()

# Pre-trained uncased BERT tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# File in which the abstract embeddings are cached.
embeddings_path = 'precomputed_embeddings.npy'

# English stop words and the Porter stemmer used during preprocessing.
stop_words = {word for word in stopwords.words('english')}
stemmer = PorterStemmer()

# Embed a collection of texts batch by batch
def preprocess_and_embed_batch(texts, batch_size=32):
    """Embed *texts* in consecutive slices of *batch_size* items through
    ``preprocess_and_embed`` and return the per-batch results concatenated
    along axis 0. A progress bar is shown while the batches are computed."""
    batch_starts = range(0, len(texts), batch_size)
    embeddings_list = [
        preprocess_and_embed(texts[start:start + batch_size])
        for start in tqdm(batch_starts, desc="Computing embeddings in batches")
    ]
    return np.concatenate(embeddings_list, axis=0)

# Function to preprocess and embed text
def preprocess_and_embed(texts):
    """Return one [CLS] embedding per input text.

    Args:
        texts: An iterable of strings, or a single string. A bare string is
            treated as one text; iterating it directly would embed every
            character separately.

    Returns:
        NumPy array with one row per text, taken from the first ([CLS])
        position of the model's last hidden state.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Remove stop words and apply stemming
    processed_texts = [
        ' '.join(
            stemmer.stem(token)
            for token in tokenizer.tokenize(text)
            if token not in stop_words
        )
        for text in texts
    ]

    # Tokenize and encode the input text (truncated to 512 tokens)
    inputs = tokenizer(processed_texts, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # Forward pass through the BERT model. no_grad() only disables gradient
    # tracking for inference; it does not select a device.
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

    return embeddings

def compute_and_save_embeddings(df, embeddings_path, incremental=False):
    """Compute the abstract embeddings and save them to *embeddings_path*.

    Args:
        df: DataFrame with an ``'AB'`` column of abstracts.
        embeddings_path: Target ``.npy`` file.
        incremental: When true and the file already exists, only abstracts
            beyond the stored rows are embedded and appended.

    NOTE(review): the saved array has no row labels, so incremental mode
    assumes stored row ``k`` belongs to ``df`` row ``k`` and new articles are
    only ever appended to ``df`` — confirm against how the CSV is updated.
    """
    if incremental and os.path.exists(embeddings_path):
        # Load existing embeddings
        existing_embeddings = load_embeddings(embeddings_path)

        # Identify new abstracts by position: everything after the rows that
        # are already stored (a NumPy array has no ``.index`` to compare).
        new_abstracts = df['AB'].iloc[len(existing_embeddings):]

        if not new_abstracts.empty:
            new_embeddings = preprocess_and_embed_batch(new_abstracts)
            updated_embeddings = np.concatenate([existing_embeddings, new_embeddings], axis=0)
        else:
            # Nothing new to process
            updated_embeddings = existing_embeddings

    else:
        # Process all abstracts
        updated_embeddings = preprocess_and_embed_batch(df['AB'])

    # Save updated embeddings
    np.save(embeddings_path, updated_embeddings)

def load_embeddings(embeddings_path):
    """Load and return the array stored at *embeddings_path* via ``np.load``."""
    stored_embeddings = np.load(embeddings_path)
    return stored_embeddings

def extract_answer_sentence(query, abstract):
    """Return the first sentence of *abstract* that shares at least one
    tokenizer token with *query*, or ``None`` when no sentence does."""
    query_tokens = tokenizer.tokenize(query)

    # Split after '.' or '?', skipping abbreviation-like patterns.
    for candidate in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract):
        candidate_tokens = tokenizer.tokenize(candidate)
        for token in query_tokens:
            if token in candidate_tokens:
                return candidate

    return None

def search_engine(query, df, embeddings, num_results=5):
    """Rank abstracts by cosine similarity of their BERT embeddings.

    Args:
        query: Query string; lower-cased before embedding.
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract) columns,
            row-aligned with *embeddings*.
        embeddings: 2-D array with one abstract embedding per row.
        num_results: Maximum number of hits to return.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
    """
    # Preprocess the query
    query = query.lower()

    # Embed the query as ONE text: passing the bare string would make the
    # embedder iterate over its characters.
    query_embedding = preprocess_and_embed([query])

    # One vectorised call instead of a Python loop over every abstract.
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Descending order; the stable sort keeps the lower row index first
    # among equal scores.
    top_indices = np.argsort(-similarities, kind='stable')[:max(num_results, 0)]

    # Return the titles, scores, and answer sentences of the top N most relevant articles
    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        answer_sentence = extract_answer_sentence(query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Build the embedding cache on first use only.
embeddings_are_cached = os.path.exists(embeddings_path)
if not embeddings_are_cached:
    compute_and_save_embeddings(df, embeddings_path)

# Read the cached embeddings back from disk.
embeddings = load_embeddings(embeddings_path)

# Run a sample query and show the ranked hits.
query = "What is the treatment for cancer?"
top_articles = search_engine(query, df, embeddings)
for article in top_articles:
    title, score, answer_sentence = article
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Two demo runs, separated by a ruler line:
#   1. misspelled query
#   2. correctly spelled query
demo_queries = [
    "What is the treatent for lung caner?",
    "What is the treatment for lung cancer?",
]
for run_number, query in enumerate(demo_queries):
    if run_number > 0:
        print("="*100)
    top_articles = search_engine(query, df, embeddings)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")