# Web Crawler Data Collection

In [None]:
# NO NEED TO RUN THIS CELL: THE DATA HAS ALREADY BEEN COLLECTED AND SAVED UNDER articles.csv

# Data crawler that browses like a human: it goes through the articles one by one and saves the titles, authors, abstracts and references to CSV files in chunks

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

import re
import pandas as pd
from tqdm import tqdm

# Navigate to the website
url = "https://pubmed.ncbi.nlm.nih.gov/?term=intelligence+%5BTitle%2Fabstract%5D&filter=simsearch1.fha&filter=years.2013-2023&sort=date&size=200"
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.get(url)

# Click the first article to start
driver.find_element(By.XPATH, "//a[@data-ga-action=1]").click()
counter_text = driver.find_element(By.XPATH, "//*[@id='adjacent-navigation']/div[2]/a/span[1]/span[2]").text

# Find the number of total articles.
# The previous pattern r"[^3-9]" also deleted the digits 0, 1 and 2 from the
# count itself. Take the last number in the label instead (thousands
# separators allowed), so a leading position counter cannot leak into it.
# NOTE(review): assumes the label ends with the total, e.g. "of 74,243" - confirm.
number_groups = re.findall(r"\d[\d,]*", counter_text)
if not number_groups:
    raise ValueError(f"Could not read the article count from {counter_text!r}")
# Kept as a string of digits; the crawl loop converts it with int().
total_articles = number_groups[-1].replace(",", "")

# Specify the chunk size (number of articles written per CSV file)
chunk_size = 1000

# Buffers for the current chunk; they are emptied after every CSV write
titles = []
authors = []
abstracts = []
references_list = []
# Loop indices for which no "next article" link could be found
not_found_pages = []

# Convert once instead of on every loop iteration
article_count = int(total_articles)


def _visible_text(by, value):
    """Return the text of the first matching element, or '' if it is missing or hidden."""
    try:
        element = driver.find_element(by, value)
    except NoSuchElementException:
        return ''
    # A hidden element previously left the WebElement object itself in the data
    return element.text if element.is_displayed() else ''


def _references_text():
    """Return the text of the reference list, or '' when the article has none."""
    try:
        section = driver.find_element(By.ID, "references")
    except NoSuchElementException:
        return ''

    # The "show all" button is optional; its absence must not discard the references
    for button in driver.find_elements(By.CLASS_NAME, "show-all")[:1]:
        if button.is_displayed():
            button.click()

    if not section.is_displayed():
        return ''
    try:
        return driver.find_element(By.CLASS_NAME, "references-list").text
    except NoSuchElementException:
        return ''


try:
    for page in tqdm(range(article_count)):
        is_last_article = page + 1 == article_count

        # Extract title, authors, abstract and references of the article
        title = _visible_text(By.CLASS_NAME, "heading-title")
        # find_elements returns an empty list (it never raises) when nothing matches
        author_list = [author.text for author in driver.find_elements(By.CLASS_NAME, "full-name")]
        abstract = _visible_text(By.ID, "eng-abstract")
        references = _references_text()

        # Append data to lists
        titles.append(title)
        authors.append(author_list)
        abstracts.append(abstract)
        references_list.append(references)

        if (page + 1) % chunk_size == 0 or is_last_article:
            # Create a DataFrame
            data = {
                'Title': pd.Series(titles),
                'Authors': pd.Series(authors),
                'Abstracts': pd.Series(abstracts),
                'References': pd.Series(references_list)
            }
            df = pd.DataFrame(data)

            # Save DataFrame to CSV. Ceiling division gives the trailing partial
            # chunk its own number; floor division reused the number of the
            # previous full chunk and overwrote that file.
            chunk_number = (page + chunk_size) // chunk_size
            csv_filename = f'pubmed_data_chunk_{chunk_number}.csv'
            df.to_csv(csv_filename, index=False)

            # Clear lists for the next chunk
            titles = []
            authors = []
            abstracts = []
            references_list = []

        # Navigate to the next article (there is none after the last one)
        if not is_last_article:
            try:
                next_page = driver.find_element(By.XPATH, "//div[@class='next side-link visible']")
                if next_page.is_displayed():
                    next_page.click()
            except NoSuchElementException:
                not_found_pages.append(page)
                print(f"{page = } not found!")
finally:
    # Close the browser even when the crawl is interrupted by an error
    driver.quit()

# Preprocessing the data

In [None]:
# Data preprocessing: parse the raw export into a CSV file with one article per row and one column per field tag (listed below).
# The result is then filtered to keep only the useful columns and de-duplicated on PMID.

"""
df shape before cleaning:(74243, 77)
df shape after cleaning:(57560, 15)

Index(['PMID', 'STAT', 'DRDT', 'CTDT', 'PB', 'DP', 'TI', 'BTI', 'AB', 'CI',
       'FED', 'ED', 'FAU', 'AU', 'AD', 'LA', 'PT', 'PL', 'OTO', 'OT', 'EDAT',
       'CRDT', 'AID', 'OWN', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'PG', 'LID',
       'DEP', 'TA', 'JT', 'JID', 'SB', 'MH', 'MHDA', 'PHST', 'PST', 'SO', 'GR',
       'PMC', 'MID', 'COIS', 'TT', 'RN', 'OID', 'SI', 'ISBN', 'CTI', 'CN',
       'FIR', 'IR', 'AUID', 'EIN', 'CIN', 'PS', 'FPS', 'CON', 'UOF', 'UIN',
       'RIN', 'IRAD', 'EFR', 'OAB', 'OABL', 'PMCR', 'CP', 'ECI', 'DRIN', 'RF',
       'EN', 'ROF', 'RPI', 'RPF', 'DDIN'],
      dtype='object')


Guide to the abbreviations:

    PMID: PubMed IDentifier - Unique identifier for a PubMed record.

    TI: Title - The title of the article.

    AB: Abstract - A brief summary of the article's content.

    PB: Publisher - The organization responsible for publishing the article.

    FAU: Full Author(s) - Full names of the authors.

    FED: Full Editor(s) - Full names of the editors.

    DP: Date of Publication - Date when the article was published.

    OTO: Other Term Owner - Owner of other terms.

    OT: Other Term - Additional terms or keywords associated with the article.

    OWN: Owner - Organization that supplied the citation data.

    DCOM: Date Completed - Date NLM completed processing of the citation.

    LR: Last Revision - Last revision date.

    JT: Journal Title - Full title of the journal.

    MH: MeSH Terms - Medical Subject Headings.

    ISBN: International Standard Book Number - ISBN of the article.

[Removed]    STAT: Status - Indicates the status of the publication.

[Removed]    DRDT: Date Revised - Date the (book) record was last revised.

[Removed]    CTDT: Contribution Date - Date of the book contribution.

[Removed]    BTI: Book Title - Title of the book the record belongs to.

[Removed]    CI: Copyright Information - Information about the copyright holder.

[Removed]    ED: Editor - Abbreviation for the editor.

[Removed]    AU: Author - Abbreviation for the author.

[Removed]    AD: Author's Affiliation - Affiliation or institution of the author.

[Removed]    LA: Language - Language of the article.

[Removed]    PT: Publication Type - Type of publication (e.g., Review, Book Chapter).

[Removed]    PL: Place of Publication - Location where the article was published.

[Removed]    EDAT: Entrez Date - Date the record was added to the Entrez database.

[Removed]    CRDT: Create Date - Date the record was created.

[Removed]    AID: Article Identifier - Identifier associated with the article.

[Removed]    IS: ISSN - International Standard Serial Number of the journal.

[Removed]    VI: Volume - Volume number of the journal.

[Removed]    IP: Issue - Issue, part, or supplement number of the journal.

[Removed]    PG: Page - Page number.

[Removed]    LID: Location IDentifier - Identifier for the location of the article.

[Removed]    DEP: Date of Electronic Publication - Date of electronic publication.

[Removed]    TA: Journal Title (ISO abbreviation) - Title abbreviation of the journal.

[Removed]    JID: Journal ID - Identifier for the journal.

[Removed]    SB: Subset - Subset designation.

[Removed]    MHDA: MeSH Date - MeSH date.

[Removed]    PHST: Publication History Status - Publication history status.

[Removed]    PST: Publication Status - Publication status.

[Removed]    SO: Source - Source of the article.

[Removed]    GR: Grant - Grant information.

[Removed]    PMC: PubMed Central ID - Identifier for PubMed Central.

[Removed]    MID: Manuscript ID - Identifier for the manuscript.

[Removed]    COIS: Conflict of Interest Statement - Statement about potential conflicts of interest.

[Removed]    TT: Transliterated Title - Title of the article in its original language.

[Removed]    RN: Registry Number - Registry number.

[Removed]    OID: Other ID - Identifier assigned by an organization other than NLM.

[Removed]    SI: Secondary Source ID - Secondary source identifier.

[Removed]    CTI: Collection Title - Title of the book collection.

[Removed]    CN: Corporate Author - Corporate (collective) author name.

[Removed]    FIR: Full Investigator(s) - Full names of the investigators.

[Removed]    IR: Investigator - Abbreviation for the investigator.

[Removed]    AUID: Author ID - Identifier for the author.

[Removed]    EIN: Erratum In - Citation of a published erratum to the article.

[Removed]    CIN: Comment In - Citation of an article that comments on this one.

[Removed]    PS: Personal Name as Subject - Personal name as subject.

[Removed]    FPS: Full Personal Name as Subject - Full personal name as subject.

[Removed]    CON: Comment On - Citation of the article this one comments on.

[Removed]    UOF: Update Of - Citation of the article this one updates.

[Removed]    UIN: Update In - Citation of an article that updates this one.

[Removed]    RIN: Retraction In - Citation of the retraction notice for the article.

[Removed]    IRAD: Investigator Affiliation - Affiliation of the investigator.

[Removed]    EFR: Erratum For - Citation of the article this erratum corrects.

[Removed]    OAB: Other Abstract - Abstract supplied by an organization other than the publisher.

[Removed]    OABL: Other Abstract Language - Language of the other abstract.

[Removed]    PMCR: PubMed Central Release - PubMed Central release information.

[Removed]    CP: Clinical Progress - Clinical progress.

[Removed]    ECI: Expression of Concern In - Citation of an expression of concern about the article.

[Removed]    DRIN: Dataset Use Reported In - Citation of an article reporting use of this dataset.

[Removed]    RF: Number of References - Number of bibliographic references (review articles).

[Removed]    EN: Edition - Edition of the book.

[Removed]    ROF: Retraction Of - Citation of the article this notice retracts.

[Removed]    RPI: Republished In - Citation of the republished version of the article.

[Removed]    RPF: Republished From - Citation of the article this one was republished from.

[Removed]    DDIN: Dataset Described In - Citation of an article describing this dataset.
"""



import re
import pandas as pd

# Read the whole raw export into memory
with open('../crawler/articles.txt', 'r', encoding='utf-8') as f:
    input_text = f.read()

# Records are separated by a line that holds nothing but a double quote
record_separator = re.compile(r'\n"\n')
articles = record_separator.split(input_text.strip())

# Define a function to extract data from each article
def extract_data(article):
    """Parse one tagged record into a ``{tag: value}`` dictionary.

    A field starts on a line of the form ``TAG - value`` where ``TAG`` is two
    to four capital letters. Lines that do not start a new field are treated
    as continuations of the current one.

    Args:
        article: Text of a single record, lines separated by ``'\\n'``.

    Returns:
        dict mapping each tag to its value. Wrapped lines are joined with a
        single space, and the values of a repeated tag are joined with ``'|'``
        in order of appearance.
    """
    data = {}
    current_key = None
    current_parts = []

    def store(key, parts):
        # A space keeps the words on either side of a line wrap apart
        # (joining with '' fused them together).
        value = ' '.join(parts).strip()
        if key in data:
            data[key] += '|' + value
        else:
            data[key] = value

    for line in article.split('\n'):
        # matching the key-value pair
        match = re.match(r'^([A-Z]{2,4})\s*- (.+)$', line)

        if match:
            if current_key is not None:
                store(current_key, current_parts)
            current_key, first_part = match.groups()
            current_parts = [first_part]
        elif current_key is not None:
            # Continuation line; blank lines add nothing
            continuation = line.strip()
            if continuation:
                current_parts.append(continuation)

    # Save the last field through the same path, so a repeated final tag is
    # appended instead of overwriting what was already collected
    if current_key is not None:
        store(current_key, current_parts)

    return data

# Parse every record into a dictionary of field tags
article_data_list = list(map(extract_data, articles))

# Records without an abstract ('AB') are dropped
filtered_data_list = list(filter(lambda record: 'AB' in record, article_data_list))

# Columns worth keeping:  df shape before cleaning:(74243, 77)
useful_columns = ['PMID', 'TI', 'AB', 'PB', 'FAU', 'FED', 'DP', 'OTO', 'OT', 'OWN', 'DCOM', 'LR', 'JT', 'MH', 'ISBN']

# Build the table, restrict it to the useful columns and keep the first
# occurrence of every PMID : df shape after cleaning:(57560, 15)
df = (
    pd.DataFrame(filtered_data_list)
    .loc[:, useful_columns]
    .drop_duplicates(subset='PMID', keep='first')
)

# Write the cleaned table next to the raw export
df.to_csv('../crawler/articles.csv', index=False)


In [None]:
# A basic TF-IDF search built with scikit-learn, using pandas for data handling.
# It prints the 5 articles whose abstracts are most similar to a given query.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Read the cleaned article table
data = pd.read_csv('articles.csv')

# Missing abstracts become empty documents so the vectorizer accepts them
abstract_corpus = data['AB'].fillna('')

# Word-level TF-IDF with English stop words removed
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(abstract_corpus)

# Function to search for queries
def search(query, tfidf_matrix, data, top_n=5):
    """Print the titles of the articles most similar to ``query``.

    Args:
        query: Free-text query string.
        tfidf_matrix: TF-IDF matrix of the abstracts, produced by the
            module-level ``tfidf_vectorizer``; row i belongs to ``data`` row i.
        data: DataFrame with a ``'TI'`` (title) column.
        top_n: Number of results to print (defaults to 5, as before).
    """
    query_vector = tfidf_vectorizer.transform([query])
    cosine_similarities = linear_kernel(query_vector, tfidf_matrix).flatten()

    # A stable argsort on the negated scores ranks from best to worst and keeps
    # equal scores in row order, without building and sorting a Python list of
    # (index, score) pairs for every document.
    ranked_indices = (-cosine_similarities).argsort(kind='stable')[:max(top_n, 0)]

    # Print the top N results
    for idx in ranked_indices:
        print(f"Title: {data['TI'].iloc[idx]}, Score: {cosine_similarities[idx]}")


In [None]:
# Run the basic TF-IDF search for a sample query
example_query = "IQ scores"
search(example_query, tfidf_matrix, data)

# Spell Checker and Word Suggestion

In [None]:
# Search engine with capability of the spell check and correcting the misspelling

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests
stop_words = {*stopwords.words('english')}

# Read the cleaned article table
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated words found in ``stop_words``.

    The comparison is case-insensitive; the remaining words keep their
    original form and are joined with single spaces.
    """
    kept_words = (word for word in text.split() if word.lower() not in stop_words)
    return ' '.join(kept_words)

# Lower-case every abstract (missing ones become the string 'nan') and strip
# the stop words
df['AB'] = df['AB'].astype(str).str.lower().map(remove_stopwords)

# Spell checker used to repair the query words
spell = SpellChecker()

# Character n-grams (1 to 3 characters) tolerate small spelling differences
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

def correct_spelling(query):
    """Return ``query`` with each whitespace-separated word spell-corrected.

    Uses the module-level ``spell`` checker. A word for which the checker has
    no suggestion is kept unchanged.
    """
    corrected_tokens = []
    for token in query.split():
        suggestion = spell.correction(token)
        # correction() may return None for unknown words; joining a None would
        # raise a TypeError, so fall back to what the user typed.
        corrected_tokens.append(token if suggestion is None else suggestion)

    return ' '.join(corrected_tokens)

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that shares a word with ``query``.

    Words are compared case-insensitively and as whole words, with punctuation
    ignored, so "cancer?" in the query matches "cancer" in a sentence and "is"
    no longer matches inside "this".

    Returns:
        The matching sentence, or None when no sentence shares a word with the
        query (or the query contains no words).
    """
    # Word characters only: trailing "?" or "," must not be part of a token
    query_tokens = set(re.findall(r'\w+', query.lower()))
    if not query_tokens:
        return None

    # Split after "." or "?" followed by whitespace, except after patterns
    # such as "e.g." or "Dr."
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        sentence_words = re.findall(r'\w+', sentence.lower())
        if not query_tokens.isdisjoint(sentence_words):
            return sentence

    return None


def search_engine(query, df, tfidf_matrix, num_results=5):
    """Rank the abstracts against a spell-corrected query.

    Args:
        query: Free-text query; it is spell-corrected before vectorizing.
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract) columns.
        tfidf_matrix: Matrix produced by the module-level ``vectorizer``;
            row i belongs to ``df`` row i.
        num_results: Maximum number of articles to return.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
        Empty when ``num_results`` is not positive.
    """
    # similarities[-0:] would select every row, so handle this case up front
    if num_results <= 0:
        return []

    # Correct misspellings in the query
    corrected_query = correct_spelling(query)

    # Vectorize the corrected query using the same TF-IDF vectorizer
    query_vector = vectorizer.transform([corrected_query])

    # Calculate cosine similarity between the query and dataset abstracts
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Indices of the top N most similar abstracts, best first
    top_indices = similarities.argsort()[-num_results:][::-1]

    # Look for the answer sentence with both spellings: the corrected words are
    # the ones expected in the abstracts, and the original words are a safety
    # net in case the checker "corrected" a valid domain term.
    answer_query = f"{query} {corrected_query}"

    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        answer_sentence = extract_answer_sentence(answer_query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Demonstration: search with a sample question and print every hit
query = "What is the treatment for cancer?"
top_articles = search_engine(query, df, tfidf_matrix)
for hit in top_articles:
    title, score, answer_sentence = hit
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


### TF-IDF searches for exact matches, and thus suffers from misspellings and synonyms.
The purpose of this code cell is to mitigate both problems by correcting misspelled query words and by replacing the nouns of the query with WordNet synonyms.

In [None]:
# Search engine with capability of the spell check and correcting the misspelling using TF-IDF vectorizer for character-level embeddings
# as well as enriching the search by adding synonyms for the nouns 

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import spacy

# # Load spaCy English language model
# nlp = spacy.load("en_core_web_sm")

# Download NLTK resources
# import nltk
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# Read the cleaned article table
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Lower-case every abstract (missing ones become the string 'nan')
df['AB'] = df['AB'].astype(str).str.lower()

# Spell checker used to repair the query words
spell = SpellChecker()

# Character n-grams (1 to 3 characters) tolerate small spelling differences
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

# NLTK WordNet synonym extraction
def get_synonyms(word, pos=None):
    """Return the WordNet lemma names for ``word`` without duplicates.

    Args:
        word: Word to look up.
        pos: Optional two-letter tag prefix; ``'NN'`` selects nouns and
            ``'VB'`` verbs. Any other non-empty value is treated as a noun,
            and None searches all parts of speech.

    Returns:
        List of lemma names in the order WordNet yields them. The order is the
        same on every run, so slicing the first k synonyms is reproducible.
    """
    # Map POS tags to WordNet POS tags
    pos_mapping = {'NN': 'n', 'VB': 'v'}
    # Named wordnet_pos so it does not shadow the imported nltk pos_tag function
    wordnet_pos = pos_mapping.get(pos, 'n') if pos else None

    # A dict keeps insertion order. The set used before iterated in an order
    # that depends on string hashing, which can differ between interpreter
    # runs, so the "first" synonyms were arbitrary.
    synonyms = {}
    for syn in wordnet.synsets(word, pos=wordnet_pos):
        for lemma in syn.lemmas():
            synonyms.setdefault(lemma.name(), None)
    return list(synonyms)

# Function to replace words in a sentence with their synonyms
def replace_with_synonyms(sentence, pos_tags, max_synonyms=1):
    """Replace the nouns of ``sentence`` with their WordNet synonyms.

    Args:
        sentence: Text to enrich; it is tokenized with ``word_tokenize``.
        pos_tags: ``(token, tag)`` pairs for the same sentence, aligned with
            the tokens by position.
        max_synonyms: Maximum number of synonyms substituted for each noun.
            With a value of 0 or less the sentence tokens are left unchanged.

    Returns:
        The tokens joined with single spaces, nouns replaced by their synonyms.
    """
    tokens = word_tokenize(sentence)
    for i, word in enumerate(tokens):
        # Check if the word has a corresponding POS tag
        pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
        # Only nouns (tags starting with 'NN') are enriched
        if not (pos_tag_word and pos_tag_word.startswith('NN')):
            continue

        # correction() may return None for unknown words; keep the word then
        corrected_word = spell.correction(word) or word
        synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])  # Use the first two characters of the tag
        print(synonyms)
        # An empty slice would silently erase the word from the query
        if synonyms and max_synonyms > 0:
            tokens[i] = ' '.join(synonyms[:max_synonyms])
    return ' '.join(tokens)

# # Function to replace words in a sentence with their synonyms
# def replace_with_synonyms(sentence, max_synonyms=1):
#     tokens = word_tokenize(sentence)
    
#     # Use spaCy for part-of-speech tagging
#     pos_tags = [(token.text, token.pos_) for token in nlp(sentence)]
    
#     for i in range(len(tokens)):
#         word = tokens[i]
#         # Check if the word has a corresponding POS tag
#         pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
#         # If the word has a specific POS tag (e.g., noun), get synonyms
#         if pos_tag_word in ['NOUN']:
#             corrected_word = spell.correction(word)
#             synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])
#             print(synonyms)
#             if synonyms:
#                 # Replace the word with up to max_synonyms synonyms
#                 tokens[i] = ' '.join(synonyms[:max_synonyms])
#     return ' '.join(tokens)

def correct_spelling(query):
    """Return ``query`` with each whitespace-separated word spell-corrected.

    Uses the module-level ``spell`` checker. A word for which the checker has
    no suggestion is kept unchanged.
    """
    corrected_tokens = []
    for token in query.split():
        suggestion = spell.correction(token)
        # correction() may return None for unknown words; joining a None would
        # raise a TypeError, so fall back to what the user typed.
        corrected_tokens.append(token if suggestion is None else suggestion)

    return ' '.join(corrected_tokens)

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that shares a word with ``query``.

    Words are compared case-insensitively and as whole words, with punctuation
    ignored, so "cancer?" in the query matches "cancer" in a sentence and "is"
    no longer matches inside "this".

    Returns:
        The matching sentence, or None when no sentence shares a word with the
        query (or the query contains no words).
    """
    # Word characters only: trailing "?" or "," must not be part of a token
    query_tokens = set(re.findall(r'\w+', query.lower()))
    if not query_tokens:
        return None

    # Split after "." or "?" followed by whitespace, except after patterns
    # such as "e.g." or "Dr."
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        sentence_words = re.findall(r'\w+', sentence.lower())
        if not query_tokens.isdisjoint(sentence_words):
            return sentence

    return None

def search_engine(query, df, tfidf_matrix, num_results=5, use_synonyms=False, max_synonyms=1):
    """Rank the abstracts against a spell-corrected, optionally enriched query.

    Args:
        query: Free-text query; it is spell-corrected before vectorizing.
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract) columns.
        tfidf_matrix: Matrix produced by the module-level ``vectorizer``;
            row i belongs to ``df`` row i.
        num_results: Maximum number of articles to return.
        use_synonyms: When True, the nouns of the corrected query are replaced
            with WordNet synonyms before vectorizing.
        max_synonyms: Maximum number of synonyms substituted for each noun.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
        Empty when ``num_results`` is not positive.
    """
    # similarities[-0:] would select every row, so handle this case up front
    if num_results <= 0:
        return []

    # Correct misspellings in the query
    corrected_query = correct_spelling(query)

    # Optionally replace words with synonyms
    if use_synonyms:
        # Perform part-of-speech tagging
        pos_tags = pos_tag(word_tokenize(corrected_query))
        enriched_query = replace_with_synonyms(corrected_query, pos_tags, max_synonyms=max_synonyms)
    else:
        enriched_query = corrected_query

    # Vectorize the enriched query using the same TF-IDF vectorizer
    query_vector = vectorizer.transform([enriched_query])

    # Calculate cosine similarity between the query and dataset abstracts
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Indices of the top N most similar abstracts, best first
    top_indices = similarities.argsort()[-num_results:][::-1]

    # Look for the answer sentence with both spellings: the corrected words are
    # the ones expected in the abstracts, and the original words are a safety
    # net in case the checker "corrected" a valid domain term.
    answer_query = f"{query} {corrected_query}"

    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        answer_sentence = extract_answer_sentence(answer_query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Demonstration: search with synonym enrichment switched on
query = "What is the treatment for cancer?"
use_synonyms = True  # toggles the synonym replacement of query nouns
max_synonyms = 2  # upper bound on the synonyms substituted per noun
top_articles = search_engine(
    query,
    df,
    tfidf_matrix,
    use_synonyms=use_synonyms,
    max_synonyms=max_synonyms,
)
for hit in top_articles:
    title, score, answer_sentence = hit
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Compare three configurations of the same search; consecutive runs are
# separated by a ruler line in the output.
comparison_runs = [
    # (query, use_synonyms)
    ("What is the treatent for lung caner?", False),    # misspelled query, no synonyms
    ("What is the treatment for lung cancer?", False),  # correctly spelled query, no synonyms
    ("What is the treatment for lung cancer?", True),   # correctly spelled query, synonyms on
]

for run_number, (query, synonyms_enabled) in enumerate(comparison_runs):
    if run_number > 0:
        print("="*100)
    top_articles = search_engine(query, df, tfidf_matrix, use_synonyms=synonyms_enabled)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")


# Syntactic method: Levenshtein distance algorithm

In [None]:
# Search engine with capability of the spell check and correcting the misspelling using TF-IDF vectorizer for character-level embeddings
# and edit distance metric-based approach using the Levenshtein distance algorithm
# as well as enriching the search by adding synonyms for the nouns 

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import Levenshtein as lev

# Read the cleaned article table
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Lower-case every abstract (missing ones become the string 'nan')
df['AB'] = df['AB'].astype(str).str.lower()

# Character n-grams (1 to 3 characters) tolerate small spelling differences
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

# NLTK WordNet synonym extraction
def get_synonyms(word, pos=None):
    """Return the WordNet lemma names for ``word`` without duplicates.

    Args:
        word: Word to look up.
        pos: Optional two-letter tag prefix; ``'NN'`` selects nouns and
            ``'VB'`` verbs. Any other non-empty value is treated as a noun,
            and None searches all parts of speech.

    Returns:
        List of lemma names in the order WordNet yields them. The order is the
        same on every run, so slicing the first k synonyms is reproducible.
    """
    pos_mapping = {'NN': 'n', 'VB': 'v'}
    # Named wordnet_pos so it does not shadow the imported nltk pos_tag function
    wordnet_pos = pos_mapping.get(pos, 'n') if pos else None

    # A dict keeps insertion order. The set used before iterated in an order
    # that depends on string hashing, which can differ between interpreter
    # runs, so the "first" synonyms were arbitrary.
    synonyms = {}
    for syn in wordnet.synsets(word, pos=wordnet_pos):
        for lemma in syn.lemmas():
            synonyms.setdefault(lemma.name(), None)
    return list(synonyms)

# Function to replace words in a sentence with their synonyms
def replace_with_synonyms(sentence, pos_tags, max_synonyms=1):
    """Replace the nouns of ``sentence`` with their WordNet synonyms.

    Args:
        sentence: Text to enrich; it is tokenized with ``word_tokenize``.
        pos_tags: ``(token, tag)`` pairs for the same sentence, aligned with
            the tokens by position.
        max_synonyms: Maximum number of synonyms substituted for each noun.
            With a value of 0 or less the sentence tokens are left unchanged.

    Returns:
        The tokens joined with single spaces, nouns replaced by their synonyms.
        The same string is also printed.
    """
    tokens = word_tokenize(sentence)
    for i, word in enumerate(tokens):
        pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
        # Only nouns (tags starting with 'NN') are enriched
        if not (pos_tag_word and pos_tag_word.startswith('NN')):
            continue

        # correction() may return None for unknown words; keep the word then
        corrected_word = spell.correction(word) or word
        synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])
        # An empty slice would silently erase the word from the query
        if synonyms and max_synonyms > 0:
            tokens[i] = ' '.join(synonyms[:max_synonyms])

    enriched_sentence = ' '.join(tokens)
    print(enriched_sentence)
    return enriched_sentence

def correct_spelling_edit_distance(query):
    """Correct every whitespace-separated word of ``query`` by edit distance.

    Each word is passed to ``correct_with_edit_distance`` and the results are
    joined with single spaces.
    """
    return ' '.join(map(correct_with_edit_distance, query.split()))

def correct_with_edit_distance(token):
    """Return the vocabulary word closest to ``token`` by Levenshtein distance.

    Reads the module-level ``vocabulary`` (a set of lower-case words) and
    ``max_edit_distance``.

    Returns:
        The lower-cased token when it is already in the vocabulary; otherwise
        the closest vocabulary word within ``max_edit_distance`` edits, with
        ties broken alphabetically; otherwise ``token`` unchanged.
    """
    # The vocabulary is lower-case, so compare in lower case. Otherwise "What"
    # is one edit away from both "what" and "that".
    lowered = token.lower()

    # Known word: no need to scan the whole vocabulary
    if lowered in vocabulary:
        return lowered

    # Compute each distance once and keep the word next to it for tie-breaking
    scored_words = ((lev.distance(lowered, word), word) for word in vocabulary)
    candidates = [pair for pair in scored_words if pair[0] <= max_edit_distance]

    # min() on an empty list raises ValueError; keep the user's word instead
    if not candidates:
        return token

    # Tuples compare by distance first, then alphabetically, so the choice
    # does not depend on the iteration order of the set
    return min(candidates)[1]

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that shares a word with ``query``.

    Words are compared case-insensitively and as whole words, with punctuation
    ignored, so "cancer?" in the query matches "cancer" in a sentence and "is"
    no longer matches inside "this".

    Returns:
        The matching sentence, or None when no sentence shares a word with the
        query (or the query contains no words).
    """
    # Word characters only: trailing "?" or "," must not be part of a token
    query_tokens = set(re.findall(r'\w+', query.lower()))
    if not query_tokens:
        return None

    # Split after "." or "?" followed by whitespace, except after patterns
    # such as "e.g." or "Dr."
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        sentence_words = re.findall(r'\w+', sentence.lower())
        if not query_tokens.isdisjoint(sentence_words):
            return sentence

    return None

def search_engine(query, df, tfidf_matrix, num_results=5, use_synonyms=False, max_synonyms=1):
    """Rank the abstracts against an edit-distance-corrected query.

    Args:
        query: Free-text query; every word is replaced by its closest
            vocabulary word before vectorizing.
        df: DataFrame with ``'TI'`` (title) and ``'AB'`` (abstract) columns.
        tfidf_matrix: Matrix produced by the module-level ``vectorizer``;
            row i belongs to ``df`` row i.
        num_results: Maximum number of articles to return.
        use_synonyms: When True, the nouns of the corrected query are replaced
            with WordNet synonyms before vectorizing.
        max_synonyms: Maximum number of synonyms substituted for each noun.

    Returns:
        List of ``(title, score, answer_sentence)`` tuples, best match first.
        Empty when ``num_results`` is not positive.
    """
    # similarities[-0:] would select every row, so handle this case up front
    if num_results <= 0:
        return []

    corrected_query = correct_spelling_edit_distance(query)

    if use_synonyms:
        pos_tags = pos_tag(word_tokenize(corrected_query))
        enriched_query = replace_with_synonyms(corrected_query, pos_tags, max_synonyms=max_synonyms)
    else:
        enriched_query = corrected_query

    query_vector = vectorizer.transform([enriched_query])
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # Indices of the top N most similar abstracts, best first
    top_indices = similarities.argsort()[-num_results:][::-1]

    # Look for the answer sentence with both spellings: the corrected words are
    # the ones expected in the abstracts, and the original words are a safety
    # net in case the correction picked the wrong vocabulary word.
    answer_query = f"{query} {corrected_query}"

    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        answer_sentence = extract_answer_sentence(answer_query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Demonstration: configure the edit-distance correction and run a search
query = "What is the treatment for cancer?"
use_synonyms = True
max_synonyms = 2
max_edit_distance = 2  # largest edit distance accepted for a correction
spell = SpellChecker(distance=max_edit_distance)

# Every distinct whitespace-separated word that occurs in the abstracts
vocabulary = set()
for abstract_text in df['AB']:
    vocabulary.update(abstract_text.lower().split())

top_articles = search_engine(
    query,
    df,
    tfidf_matrix,
    use_synonyms=use_synonyms,
    max_synonyms=max_synonyms,
)
for hit in top_articles:
    title, score, answer_sentence = hit
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Compare three configurations of the same search; consecutive runs are
# separated by a ruler line in the output.
comparison_runs = [
    # (query, use_synonyms)
    ("What is the treatent for lung caner?", False),    # misspelled query, no synonyms
    ("What is the treatment for lung cancer?", False),  # correctly spelled query, no synonyms
    ("What is the treatment for lung cancer?", True),   # correctly spelled query, synonyms on
]

for run_number, (query, synonyms_enabled) in enumerate(comparison_runs):
    if run_number > 0:
        print("="*100)
    top_articles = search_engine(query, df, tfidf_matrix, use_synonyms=synonyms_enabled)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")

### Transform the abstracts into contextual embeddings using a pre-trained BERT model (`bert-base-uncased`, [CLS] token vector) and calculate the cosine similarity between the query embedding and the abstract embeddings.

In [None]:
# Implement Transformer for Medical Text QA

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import re
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch

# Read the cleaned article table
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Lower-case every abstract (missing ones become the string 'nan')
df['AB'] = df['AB'].astype(str).str.lower()

# Tokenizer and encoder come from the same pre-trained checkpoint
pretrained_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_checkpoint)
model = BertModel.from_pretrained(pretrained_checkpoint)

# File in which the abstract embeddings are cached
embeddings_path = 'precomputed_embeddings.npy'

# Text normalisation helpers: English stop words and a Porter stemmer
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Function to preprocess and embed text in batches
def preprocess_and_embed_batch(texts, batch_size=32):
    """Embed ``texts`` in slices of ``batch_size`` and stack the results.

    Each slice is passed to ``preprocess_and_embed``; the per-slice arrays are
    concatenated along the first axis. A progress bar is shown while the
    slices are processed.
    """
    batch_starts = range(0, len(texts), batch_size)
    embeddings_list = [
        preprocess_and_embed(texts[start:start + batch_size])
        for start in tqdm(batch_starts, desc="Computing embeddings in batches")
    ]
    return np.concatenate(embeddings_list, axis=0)

# Function to preprocess and embed text
def preprocess_and_embed(texts):
    """Return one BERT [CLS] embedding per text.

    Args:
        texts: An iterable of strings, or a single string (treated as a
            one-element batch).

    Returns:
        NumPy array with one row per input text, taken from position 0 of the
        model's last hidden state.
    """
    # Iterating a bare string yields its characters, which would embed every
    # character as a separate "text"
    if isinstance(texts, str):
        texts = [texts]

    # Remove stop words and apply stemming
    processed_texts = [
        ' '.join(stemmer.stem(token) for token in tokenizer.tokenize(text) if token not in stop_words)
        for text in texts
    ]

    # Tokenize and encode the input text (truncated to 512 tokens, padded per batch)
    inputs = tokenizer(processed_texts, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # Inference only: no_grad() turns off gradient tracking. It does not move
    # anything to the GPU.
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

    return embeddings

def compute_and_save_embeddings(df, embeddings_path, incremental=False):
    """Compute the abstract embeddings and save them to ``embeddings_path``.

    Args:
        df: DataFrame with an ``'AB'`` (abstract) column.
        embeddings_path: ``.npy`` file the embedding matrix is written to.
        incremental: When True and the file already exists, only the abstracts
            beyond the rows already stored are embedded and appended.
    """
    if incremental and os.path.exists(embeddings_path):
        # Load existing embeddings
        existing_embeddings = load_embeddings(embeddings_path)

        # The saved matrix is a plain NumPy array (it has no .index), with row
        # i holding the embedding of df row i, so the new abstracts are the
        # rows past its length.
        # NOTE(review): assumes new articles are appended at the end of df and
        # existing rows keep their order - confirm.
        new_abstracts = df['AB'].iloc[len(existing_embeddings):]

        if not new_abstracts.empty:
            new_embeddings = preprocess_and_embed_batch(new_abstracts)
            updated_embeddings = np.concatenate([existing_embeddings, new_embeddings], axis=0)
        else:
            # Nothing new to process
            updated_embeddings = existing_embeddings

    else:
        # Process all abstracts
        updated_embeddings = preprocess_and_embed_batch(df['AB'])

    # Save updated embeddings
    np.save(embeddings_path, updated_embeddings)

def load_embeddings(embeddings_path):
    """Read the embedding array stored at ``embeddings_path`` with ``np.load``."""
    stored_embeddings = np.load(embeddings_path)
    return stored_embeddings

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` sharing a tokenizer token with ``query``.

    Both the query and each sentence are split with the module-level
    ``tokenizer``; None is returned when no sentence shares a token.
    """
    wanted_tokens = tokenizer.tokenize(query)
    candidate_sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    def mentions_query(candidate):
        present_tokens = tokenizer.tokenize(candidate)
        return any(token in present_tokens for token in wanted_tokens)

    # The generator is lazy, so sentences after the first match are never tokenized
    return next((candidate for candidate in candidate_sentences if mentions_query(candidate)), None)

def search_engine(query, df, embeddings, num_results=5):
    """Rank the rows of ``df`` by cosine similarity between ``query`` and ``embeddings``.

    Args:
        query: The search query text (lower-cased before embedding).
        df: DataFrame with ``'TI'`` and ``'AB'`` columns; row ``i`` must
            correspond to ``embeddings[i]``.
        embeddings: 2-D collection of abstract embeddings, one row per row
            of ``df``.
        num_results: Maximum number of hits to return.

    Returns:
        A list of ``(title, score, answer_sentence)`` tuples, best match
        first. Empty when ``embeddings`` has no rows.
    """
    if len(embeddings) == 0:
        return []

    # Preprocess the query
    query = query.lower()

    # Wrap the query in a list so it is embedded as ONE document; a bare
    # string would be iterated character by character by the embedder.
    query_embedding = preprocess_and_embed([query])

    # A single vectorized call replaces one cosine_similarity call per row
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Get the indices of the top N most similar abstracts
    top_indices = sorted(
        range(len(similarities)),
        key=lambda i: similarities[i],
        reverse=True,
    )[:num_results]

    # Collect title, score and answer sentence of the top N articles
    top_articles = []
    for i in top_indices:
        row = df.iloc[i]
        answer_sentence = extract_answer_sentence(query, row['AB'])
        top_articles.append((row['TI'], similarities[i], answer_sentence))

    return top_articles

# Build the embedding file on the first run; later runs reuse it
embeddings_missing = not os.path.exists(embeddings_path)
if embeddings_missing:
    compute_and_save_embeddings(df, embeddings_path)

# Read the stored embeddings back from disk
embeddings = load_embeddings(embeddings_path)

# Demo: print the best matches for one question
query = "What is the treatment for cancer?"
top_articles = search_engine(query, df, embeddings)
for hit in top_articles:
    title, score, answer_sentence = hit
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Ask the same question twice and print both result lists, separated by a
# rule of "=" characters.
lung_cancer_queries = (
    "What is the treatent for lung caner?",    # with misspelling
    "What is the treatment for lung cancer?",  # without misspelling
)
for position, query in enumerate(lung_cancer_queries):
    if position > 0:
        print("="*100)
    top_articles = search_engine(query, df, embeddings)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")

In [None]:
# Combine several columns of the dataset into a single text column
import pandas as pd

# Read the original CSV file, indexed by PMID
df_part = pd.read_csv(
    'articles.csv',
    index_col='PMID',
    usecols=['PMID', 'TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH'],
)

# (label, column) pairs in the order they appear in the combined text
labelled_columns = [
    ('Title: ', 'TI'),
    ('Abstract: ', 'AB'),
    ('Authors: ', 'FAU'),
    ('Data of Publication: ', 'DP'),
    ('Terms or keywords associated with the article: ', 'OT'),
    ('Journal Title: ', 'JT'),
    ('Medical subject headings: ', 'MH'),
]

# Missing values become the literal text 'None'; fields are joined by newlines
labelled_fields = [
    label + df_part[column].fillna('None')
    for label, column in labelled_columns
]
combined_info = labelled_fields[0]
for field in labelled_fields[1:]:
    combined_info = combined_info + '\n' + field

# New frame sharing the PMID index, holding only the combined column
new_df = pd.DataFrame(index=df_part.index)
new_df['Combined_Info'] = combined_info

# Save the new DataFrame to a CSV file
new_df.to_csv('combined_data.csv')

In [None]:
# Splitting the large CSV file into smaller chunks
import pandas as pd

def split_csv(input_csv, output_prefix, chunk_size):
    """Split a CSV file into consecutive chunks of at most ``chunk_size`` rows.

    Chunk ``n`` (1-based) is written to ``f"{output_prefix}_{n}.csv"`` without
    the index column, and a message is printed for every file written.

    Args:
        input_csv: Path of the CSV file to split.
        output_prefix: Prefix of the generated file names.
        chunk_size: Maximum number of rows per chunk; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Read the large CSV file into a pandas DataFrame
    df = pd.read_csv(input_csv)

    # Ceiling division: "len // size + 1" would add an empty trailing chunk
    # whenever the row count is an exact multiple of chunk_size. An empty
    # input still yields one header-only file.
    num_chunks = max(1, (len(df) + chunk_size - 1) // chunk_size)

    # Write each chunk as a separate CSV file
    for i in range(num_chunks):
        chunk = df.iloc[i * chunk_size:(i + 1) * chunk_size]
        output_csv = f"{output_prefix}_{i + 1}.csv"
        chunk.to_csv(output_csv, index=False)
        print(f"Chunk {i + 1} saved to {output_csv}")

# Split data_1.csv into 1000-row files named sub_data_<n>.csv
input_csv_path, output_prefix, chunk_size = (
    'data_1.csv',  # path to the large CSV file
    'sub_data',    # prefix for the output CSV files
    1000,          # number of rows per chunk
)

split_csv(input_csv=input_csv_path, output_prefix=output_prefix, chunk_size=chunk_size)

# Embedding the abstracts using the pre-trained BERT model

In [None]:
# Embedding the abstracts using BERT and saving them to a file

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the abstracts
def encode_abstracts_sliding_window(abstracts, window_size=512, stride=256):
    """Encode every abstract into ONE mean-pooled BERT vector.

    Long texts are cut into overlapping token windows; each window is
    encoded with the module-level ``tokenizer``/``model`` and mean-pooled
    over its tokens, and the window vectors of an abstract are averaged.
    Returning exactly one row per abstract keeps row ``i`` of the result
    aligned with item ``i`` of ``abstracts``, which callers rely on when
    they map similarity indices back to DataFrame rows.

    Args:
        abstracts: Iterable of texts. Non-string items (e.g. NaN) are
            encoded as empty text so that alignment is preserved.
        window_size: Maximum window length in tokens, including the two
            special tokens ([CLS], [SEP]) the tokenizer adds.
        stride: Token offset between the starts of consecutive windows.

    Returns:
        A tensor with one row per abstract; an empty ``(0, hidden_size)``
        tensor when ``abstracts`` is empty.

    Raises:
        ValueError: If ``stride`` is not positive.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    # Reserve room for [CLS] and [SEP] so truncation never drops real tokens
    content_size = max(1, window_size - 2)

    encoded_abstracts = []

    for abstract in tqdm(abstracts, desc="Encoding Abstracts", unit="abstract"):
        text = abstract if isinstance(abstract, str) else ''
        tokens = tokenizer.tokenize(text)
        total_length = len(tokens)

        # Short texts need a single window. Longer texts get starts
        # 0, stride, 2*stride, ... until a window reaches the last token.
        if total_length <= content_size:
            window_starts = [0]
        else:
            window_starts = range(0, total_length - content_size + stride, stride)

        window_vectors = []
        for start in window_starts:
            # Extract a window of tokens and turn it back into text
            window_tokens = tokens[start:start + content_size]
            window_text = tokenizer.convert_tokens_to_string(window_tokens)

            # Tokenize and encode the window
            inputs = tokenizer(window_text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                outputs = model(**inputs)

            window_vectors.append(outputs.last_hidden_state.mean(dim=1))

        # Average the window vectors into a single (1, hidden) row
        encoded_abstracts.append(torch.cat(window_vectors, dim=0).mean(dim=0, keepdim=True))

    if not encoded_abstracts:
        print("No encoded abstracts found.")
        # torch.cat raises on an empty list; return an empty matrix instead
        return torch.empty((0, model.config.hidden_size))
    return torch.cat(encoded_abstracts, dim=0)


# Function to save encoded abstracts
def save_encoded_abstracts(encoded_abstracts, filename):
    """Write ``encoded_abstracts`` to ``filename`` using ``torch.save``."""
    torch.save(obj=encoded_abstracts, f=filename)

# Function to load encoded abstracts
def load_encoded_abstracts(filename):
    """Read the object stored in ``filename`` using ``torch.load``."""
    restored = torch.load(filename)
    return restored

# Encode sub_data_1.csv .. sub_data_10.csv one at a time and store each
# result in its own .pt file
part_numbers = range(1, 11)
for i in tqdm(part_numbers, desc="Processing Parts", unit="part"):
    file_path = f'sub_data_{i}.csv'
    df_part = pd.read_csv(file_path)
    encoded_abstracts_part = encode_abstracts_sliding_window(df_part['Combined_Info'])
    save_encoded_abstracts(encoded_abstracts_part, f'encoded_data_part_{i}.pt')

# Read the stored parts back in the same order
encoded_abstracts_parts = [
    load_encoded_abstracts(f'encoded_data_part_{part}.pt')
    for part in tqdm(part_numbers, desc="Loading Parts", unit="part")
]

# Stack all parts row-wise and persist the combined tensor
encoded_abstracts = torch.cat(encoded_abstracts_parts, dim=0)
torch.save(encoded_abstracts, 'encoded_data.pt')



# Question Answering System using BERT pretrained on SQuAD

In [None]:
"""Retrieving the abstracts most similar to a question and then generating an answer from those abstracts is a
reasonable strategy for a Question Answering (QA) system. Although SQuAD (the Stanford Question Answering Dataset)
is typically used for training and evaluating QA models, the same approach can be adapted here:
retrieve the relevant passages or abstracts first, then generate answers from them.

Steps:

    1-Retrieve Similar Abstracts:
        Use a method (such as cosine similarity) to retrieve the top N most similar abstracts to a given question from your collection of abstracts.

    2-Generate Answers:
        For each of the retrieved abstracts, use a QA model to generate answers to the question.
        Fine-tune a pre-trained QA model on your specific dataset, considering the structure of your abstracts and questions.

    3-Combine Answers:
        Aggregate or combine the answers generated from different abstracts to provide a final answer."""

from transformers import BertForQuestionAnswering, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm import tqdm

# Load pre-trained BERT model and tokenizer
qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


# Tokenize and encode the abstracts
def encode_abstracts_sliding_window(abstracts, window_size=512, stride=256):
    """Encode every abstract into ONE mean-pooled BERT vector.

    Long texts are cut into overlapping token windows; each window is
    encoded with the module-level ``tokenizer``/``model`` and mean-pooled
    over its tokens, and the window vectors of an abstract are averaged.
    Returning exactly one row per abstract keeps row ``i`` of the result
    aligned with item ``i`` of ``abstracts``, which callers rely on when
    they map similarity indices back to DataFrame rows.

    Args:
        abstracts: Iterable of texts. Non-string items (e.g. NaN) are
            encoded as empty text so that alignment is preserved.
        window_size: Maximum window length in tokens, including the two
            special tokens ([CLS], [SEP]) the tokenizer adds.
        stride: Token offset between the starts of consecutive windows.

    Returns:
        A tensor with one row per abstract; an empty ``(0, hidden_size)``
        tensor when ``abstracts`` is empty.

    Raises:
        ValueError: If ``stride`` is not positive.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    # Reserve room for [CLS] and [SEP] so truncation never drops real tokens
    content_size = max(1, window_size - 2)

    encoded_abstracts = []

    for abstract in tqdm(abstracts, desc="Encoding Abstracts", unit="abstract"):
        text = abstract if isinstance(abstract, str) else ''
        tokens = tokenizer.tokenize(text)
        total_length = len(tokens)

        # Short texts need a single window. Longer texts get starts
        # 0, stride, 2*stride, ... until a window reaches the last token.
        if total_length <= content_size:
            window_starts = [0]
        else:
            window_starts = range(0, total_length - content_size + stride, stride)

        window_vectors = []
        for start in window_starts:
            # Extract a window of tokens and turn it back into text
            window_tokens = tokens[start:start + content_size]
            window_text = tokenizer.convert_tokens_to_string(window_tokens)

            # Tokenize and encode the window
            inputs = tokenizer(window_text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                outputs = model(**inputs)

            window_vectors.append(outputs.last_hidden_state.mean(dim=1))

        # Average the window vectors into a single (1, hidden) row
        encoded_abstracts.append(torch.cat(window_vectors, dim=0).mean(dim=0, keepdim=True))

    if not encoded_abstracts:
        print("No encoded abstracts found.")
        # torch.cat raises on an empty list; return an empty matrix instead
        return torch.empty((0, model.config.hidden_size))
    return torch.cat(encoded_abstracts, dim=0)


# Function to load encoded abstracts
def load_encoded_abstracts(filename):
    """Read the object stored in ``filename`` using ``torch.load``."""
    restored = torch.load(filename)
    return restored

# Function to retrieve top k similar abstracts
def retrieve_top_k_abstracts(query, abstracts, df, k=5):
    """Return the row positions of the ``k`` encoded abstracts most similar to ``query``.

    Args:
        query: The question text.
        abstracts: Matrix of encoded abstracts; row ``i`` is expected to
            belong to row ``i`` of ``df``.
        df: DataFrame with a ``'Combined_Info'`` column (used for the debug
            printout only).
        k: Number of positions to return.

    Returns:
        Positions ordered from most to least similar, or ``[]`` when ``k``
        is not positive or no usable position remains.
    """
    if k <= 0:
        # A slice of [-0:] would select every row instead of none
        return []

    # Encode the query using the sliding window approach (as before)
    query_embedding = encode_abstracts_sliding_window([query])

    # Only the first query row is ranked against the encoded abstracts
    similarities = cosine_similarity(query_embedding[:1], abstracts)

    # Get the indices of the top k most similar abstracts
    top_k_indices = similarities.argsort()[0, -k:][::-1]

    # Positions are mapped to df rows by position; an encoding matrix with
    # more rows than df would otherwise raise IndexError below.
    in_range = top_k_indices < len(df)
    if not in_range.all():
        print(f"Warning: encoded abstracts have {similarities.shape[1]} rows but the "
              f"DataFrame has {len(df)}; ignoring positions without a matching row.")
        top_k_indices = top_k_indices[in_range]

    if len(top_k_indices) == 0:
        print("No matching abstracts found.")
        return []

    # Print some information for debugging (positional lookup, so repeated
    # index labels cannot turn a cell into a Series)
    matched_rows = df.iloc[top_k_indices]
    print("Top k PMIDs:", matched_rows.index.tolist())
    print("Abstract lengths:", [len(text) for text in matched_rows['Combined_Info']])

    return top_k_indices


# Function to generate answers using the QA model
def generate_answers(question, abstracts, df):
    """Run the QA model on each selected row of ``df`` and collect the answer texts.

    Args:
        question: The question text.
        abstracts: Iterable of row positions into ``df``.
        df: DataFrame whose ``'Combined_Info'`` column provides the context.

    Returns:
        One answer string per position, in input order.
    """

    def answer_from(position):
        # Position -> PMID label -> combined text of that article
        pmid = df.index[position]
        context = df.loc[pmid, 'Combined_Info']

        # Encode question and context as one pair, capped at 512 tokens
        encoded = qa_tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

        with torch.no_grad():
            prediction = qa_model(**encoded)

        # Most likely start token and (exclusive) end token of the answer span
        span_start = torch.argmax(prediction.start_logits)
        span_stop = torch.argmax(prediction.end_logits) + 1
        span_ids = encoded["input_ids"][0][span_start:span_stop]
        span_tokens = qa_tokenizer.convert_ids_to_tokens(span_ids)
        return qa_tokenizer.convert_tokens_to_string(span_tokens)

    return [answer_from(position) for position in abstracts]

# Load the dataset with PMID as the index column
df_part = pd.read_csv('data_1.csv', index_col='PMID')
# df_part = pd.read_csv('articles.csv', index_col='PMID', usecols=['TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH'])

# Retrieve the five best-matching rows for a sample question
encoded_abstracts = load_encoded_abstracts('encoded_data.pt')
question = "what is Artificial Intelligence?"
top_k_abstracts = retrieve_top_k_abstracts(question, encoded_abstracts, df_part, k=5)

# Show PMID and combined text of every retrieved row
print("Top 5 Similar Abstracts:")
for pmid in (df_part.index[position] for position in top_k_abstracts):
    print("PMID:", pmid)
    print("Abstract:", df_part.loc[pmid, 'Combined_Info'])

answers = generate_answers(question, top_k_abstracts, df_part)

# Show one generated answer per retrieved row
print("\nGenerated Answers:")
for generated in answers:
    print(generated)
# [TODO] check that the retrieved rows are correct: some queries
# (e.g. query = "who is Chenq?") produce IDs that are not part of the dataset

## Data Preparation

In [None]:
# # Combine different columns of the dataset into one column
# import pandas as pd

# # Read the original CSV file
# df_part = pd.read_csv('articles.csv', index_col='PMID', usecols=['PMID', 'TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH'])

# # Create a new DataFrame with the desired structure
# new_df = pd.DataFrame(index=df_part.index)

# # Combine the information into a single column
# new_df['CD'] = (
#     'PMID: ' + df_part.index.astype(str) + ', ' +
#     'Abstract: ' + df_part['AB'].fillna('None') + ', ' +
#     'Title: ' + df_part['TI'].fillna('None') + ', ' +
#     'Authors: ' + df_part['FAU'].fillna('None') + ', ' +
#     'Data of Publication: ' + df_part['DP'].fillna('None') + ', ' +
#     'Terms or keywords associated with the article: ' + df_part['OT'].fillna('None') + ', ' +
#     'Journal Title: ' + df_part['JT'].fillna('None') + ', ' +
#     'Medical subject headings: ' + df_part['MH'].fillna('None') + ', '# +
#     # 'Abstract: ' + df_part['AB'].fillna('None')
# )
# new_df['source'] = 'https://pubmed.ncbi.nlm.nih.gov/' + df_part.index.astype(str)
# # Save the new DataFrame to a CSV file
# new_df.to_csv('additional_data.csv')

In [None]:
# # Combine different columns of the dataset into one column
# import pandas as pd

# # Read the original CSV file
# df_part = pd.read_csv('articles.csv', index_col='PMID', usecols=['PMID', 'TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH'])

# # Create a new DataFrame with the desired structure
# new_df = pd.DataFrame(index=df_part.index)

# # Combine the information into a single column
# new_df['CD'] = (
#     'PMID: ' + df_part.index.astype(str) + '\n' +
#     'Abstract: ' + df_part['AB'].fillna('None') + '\n' +
#     'Title: ' + df_part['TI'].fillna('None') + '\n' +
#     'Authors: ' + df_part['FAU'].fillna('None') + ',\n' +
#     'Data of Publication: ' + df_part['DP'].fillna('None') + '\n' +
#     'Terms or keywords associated with the article: ' + df_part['OT'].fillna('None') + '\n' +
#     'Journal Title: ' + df_part['JT'].fillna('None') + '\n' +
#     'Medical subject headings: ' + df_part['MH'].fillna('None') + '\n'# +
#     # 'Abstract: ' + df_part['AB'].fillna('None')
# )
# new_df['source'] = 'https://pubmed.ncbi.nlm.nih.gov/' + df_part.index.astype(str)


# import pandas as pd

# # # Read your DataFrame from a CSV file
# # df = pd.read_csv('your_dataframe.csv')

# # Function to filter out lines ending with 'None' from a given text
# def filter_lines(text):
#     lines = text.split('\n')
#     filtered_lines = [line for line in lines if not line.strip().endswith('None')]
#     return ', '.join(filtered_lines)

# # Apply the filtering function to each row in the 'CD' column
# new_df['CD'] = new_df['CD'].apply(filter_lines)

# # Save the new DataFrame to a CSV file
# new_df.to_csv('additional_data.csv')
# # Print the DataFrame with the filtered 'CD' column
# new_df.head()


Unnamed: 0_level_0,CD,source
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1
24278995,"PMID: 24278995, Abstract: CLINICAL CHARACTERIS...",https://pubmed.ncbi.nlm.nih.gov/24278995
25529590,"PMID: 25529590, Abstract: This study utilized ...",https://pubmed.ncbi.nlm.nih.gov/25529590
25529585,"PMID: 25529585, Abstract: Interpretation of th...",https://pubmed.ncbi.nlm.nih.gov/25529585
25284715,"PMID: 25284715, Abstract: Despite the importan...",https://pubmed.ncbi.nlm.nih.gov/25284715
25265311,"PMID: 25265311, Abstract: Neuropsychological a...",https://pubmed.ncbi.nlm.nih.gov/25265311


In [None]:
# # # [TODO]Add source of each article using https://pubmed.ncbi.nlm.nih.gov/PMID

# docs['source'] = 'https://pubmed.ncbi.nlm.nih.gov/' + docs['PMID'].astype(str)
# # save the data
# docs.to_csv('data_0.csv', index=False)

In [None]:
# [TODO]Upload data to the Hugging Face Datasets library
# [TODO]Load data from hugging face datasets
# [TODO]Add source of each article using https://pubmed.ncbi.nlm.nih.gov/PMID
import pandas as pd
import os
from transformers import LlamaTokenizer

docs = pd.read_csv('additional_data.csv')
# docs['Combined_Info'] = docs['Combined_Info'].str.replace('|', ' ')

# Hugging Face access token, taken from the environment
hf_auth = os.environ.get('HF_AUTH')
#[TODO] tokenize for GPT-3.5 Turbo

tokenizer = LlamaTokenizer.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf",
    use_auth_token=hf_auth,
)


def token_len(text):
    """Return how many token ids ``tokenizer.encode`` produces for ``text``."""
    return len(tokenizer.encode(text))


# Optional statistics: token count of every document in the 'CD' column
token_counts = list(map(token_len, docs['CD']))
min_tokens = min(token_counts)
avg_tokens = int(sum(token_counts) / len(token_counts))
max_tokens = max(token_counts)
print(f"""Min: {min_tokens}
Avg: {avg_tokens}
Max: {max_tokens}""")



Min: 75
Avg: 560
Max: 31071


In [None]:
# tokenizer.convert_ids_to_tokens(tokenizer("PMID 24278995 Title CASK Disorders Authors Moog Ute Kutsche Kerstin Data of Publication 1993 Terms or keywords associated with the article Intellectual Disability and Microcephaly with Pontine and Cerebellar Hypoplasia MICPCH XLinked Intellectual Disability XLID with or without Nystagmus Peripheral plasma membrane").input_ids)

In [None]:
# Priority is by the length of chunk and overlap,
# if they don't exceed the default values, the separator will be used
from langchain.text_splitter import NLTKTextSplitter,CharacterTextSplitter, RecursiveCharacterTextSplitter

# chunk_size and chunk_overlap are measured with token_len, i.e. in tokens
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=40,
    length_function=token_len,
    # separators=['\n\n', '\n', ' ', '']
)

# Test the text splitter on the first document and show every chunk.
# Looping avoids an IndexError when a document yields fewer than three
# chunks and labels each chunk with its own number.
chunks = text_splitter.split_text(docs['CD'][0])
print(f"length of chunk: {len(chunks)}")
for chunk_number, chunk in enumerate(chunks):
    print(f"Content of chunk{chunk_number}:\n{chunk}")
    # len(chunk) is the length in characters, not in tokens
    print(f'the length of chunk {chunk_number} is:', len(chunk))
    print("*"*100)

length of chunk: 3
Content of chunk0:
PMID: 24278995, Abstract: CLINICAL CHARACTERISTICS: CASK disorders include a spectrum of phenotypes in both females and males. Two main types of clinical presentation are seen: Microcephalywith pontine and cerebellar hypoplasia (MICPCH), generally associated withpathogenic loss-of-function variants in CASK. X-linked intellectual disability(XLID) with or without nystagmus, generally associated with hypomorphic CASKpathogenic variants. MICPCH is typically seen in females with moderate-to-severeintellectual disability, progressive microcephaly with or without ophthalmologicanomalies, and sensorineural hearing loss. Most are able to sit independently;20%-25% attain the ability to walk; language is nearly absent in most. Neurologicfeatures may include axial hypotonia, hypertonia/spasticity of the extremities,and dystonia or other movement disorders. Nearly 40% have seizures by age tenyears. Behaviors may include sleep disturbances, hand stereotypies, an

In [None]:
# import re

# def remove_punctuation(text):
#     # Define the pattern to match punctuation
#     punctuation_pattern = r'[^\w\s]'
    
#     # Use regex to substitute punctuation with an empty string
#     text_without_punctuation = re.sub(punctuation_pattern, '', text)
    
#     return text_without_punctuation

# # Example usage:
# text = "PMID: 24278995, Title: CASK Disorders., Authors: Moog, Ute Kutsche, Kerstin, Data of Publication: 1993, Terms or keywords associated with the article: Intellectual Disability and Microcephaly with Pontine and Cerebellar Hypoplasia (MICPCH) X-Linked Intellectual Disability (XLID) with or without Nystagmus Peripheral plasma membrane"
# cleaned_text = remove_punctuation(text)
# print(cleaned_text)


In [None]:
from tqdm import tqdm

# import re

# def remove_punctuation(text):
#     # Define the pattern to match punctuation
#     punctuation_pattern = r'[^\w\s]'
    
#     # Use regex to substitute punctuation with an empty string
#     text_without_punctuation = re.sub(punctuation_pattern, '', text)
    
#     return text_without_punctuation

# One record per chunk: id is "<PMID>-<chunk number>", plus chunk text and
# the source URL of the article it came from
documents = []
article_rows = zip(docs['PMID'], docs['CD'], docs['source'])
for pmid, doc, source in tqdm(article_rows):
    # chunks = text_splitter.split_text(remove_punctuation(doc))
    documents.extend(
        {
            'id': f"{pmid}-{i}",
            'text': chunk,
            'source': source,
        }
        for i, chunk in enumerate(text_splitter.split_text(doc))
    )

len(documents)

57560it [1:01:47, 15.52it/s]


93776

In [None]:
# Store the chunk records as CSV, then reload them from that file
import pandas as pd
chunks_csv = 'data_llama_recursive_1200_40.csv'
pd.DataFrame(documents).to_csv(chunks_csv, index=False)
data = pd.read_csv(chunks_csv)
data.head()

Unnamed: 0,id,text,source
0,24278995-0,"PMID: 24278995, Abstract: CLINICAL CHARACTERIS...",https://pubmed.ncbi.nlm.nih.gov/24278995
1,24278995-1,"testing. Rarely, affectedmales have a mosaic p...",https://pubmed.ncbi.nlm.nih.gov/24278995
2,24278995-2,who inherit the pathogenic variantwill typical...,https://pubmed.ncbi.nlm.nih.gov/24278995
3,25529590-0,"PMID: 25529590, Abstract: This study utilized ...",https://pubmed.ncbi.nlm.nih.gov/25529590
4,25529585-0,"PMID: 25529585, Abstract: Interpretation of th...",https://pubmed.ncbi.nlm.nih.gov/25529585


In [None]:
# Find the top 5 relevant chunks for a given query using TF-IDF scores
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load your CSV data
# additional_docs = pd.read_csv('additional_data.csv')

# Word-level TF-IDF vectorizer: a character-based vectorizer cannot find
# the names of the authors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'].fillna(''))


def search(query, tfidf_matrix, data):
    """Print id and score of the best-scoring rows of ``data`` for ``query``.

    At most five rows are printed; rows whose score is zero are skipped.
    The query is vectorized with the module-level ``tfidf_vectorizer``.
    """
    query_vector = tfidf_vectorizer.transform([query])
    scores = linear_kernel(query_vector, tfidf_matrix).flatten()
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    for idx, score in ranked[:5]:
        if score == 0:
            continue
        print(f"ID: {data['id'].iloc[idx]}, Score: {score}")


In [None]:
# Example query
search("who is Moog?", tfidf_matrix, data)

ID: 28973399-1, Score: 0.15922436691226732
ID: 24278995-2, Score: 0.11699660887872798
ID: 30642688-0, Score: 0.09687613918535377
ID: 34698500-1, Score: 0.08939465596638295
ID: 35434225-0, Score: 0.0858795629247108


In [None]:
# BM25 implementation; not recommended,
# as it produces the same results but takes much longer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse


class BM25(object):
    """BM25 scorer built on the vocabulary and IDF of a ``TfidfVectorizer``.

    ``b`` and ``k1`` are the BM25 parameters used in ``transform``: ``b``
    weights the document-length term and ``k1`` scales the term-frequency
    saturation.
    """

    def __init__(self, b=0.75, k1=1.6):
        # norm=None / smooth_idf=False: raw, unsmoothed IDF values are needed
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit vocabulary and IDF on documents ``X`` and store the average document length."""
        self.vectorizer.fit(X)
        # Parent-class transform (see the "apply CountVectorizer" note in
        # transform) gives term counts instead of TF-IDF weights
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Row sums are per-document term counts; their mean is stored as avdl
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """Return one BM25 score per document in ``X`` for the query string ``q``.

        ``fit`` must have been called first, since ``self.avdl`` and the
        fitted vectorizer are read here.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Per-document sum of term counts as a flat array
        len_X = X.sum(1).A1
        # Single-row count matrix of the query, unpacked to that one row
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        # (keeps only the columns of terms that occur in the query)
        X = X.tocsc()[:, q.indices]
        # Denominator: tf + k1 * (1 - b + b * doc_len / avdl)
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
        # to idf(t) = log [ n / df(t) ] by subtracting 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        # Numerator: tf * idf * (k1 + 1)
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        # Sum the per-term contributions for every document
        return (numer / denom).sum(1).A1

# Example usage

query = "who is Moog?"
# search(query, bm25_matrix, data)


def search(query, data):
    """Fit a fresh ``BM25`` on ``data['text']`` and print the best rows for ``query``.

    At most five rows are printed; rows whose score is zero are skipped.
    """
    scorer = BM25()
    corpus = data['text']
    scorer.fit(corpus)
    ranked = sorted(
        enumerate(scorer.transform(query, corpus)),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for idx, score in ranked[:5]:
        if score == 0:
            continue
        print(f"ID: {data['id'].iloc[idx]}, Score: {score}")


search(query, data)


ID: 24278995-2, Score: 15.563824888287513
ID: 28973399-1, Score: 13.237401367851781
ID: 30642688-0, Score: 11.251462229953042
ID: 35434225-0, Score: 9.726458452054661
ID: 34698500-1, Score: 9.553797854351048


In [None]:
import re

import pandas as pd
from rank_bm25 import BM25Okapi

# Load your CSV data
# additional_docs = pd.read_csv('additional_data.csv')


def _tokenize(text):
    """Lower-case ``text`` and return its alphanumeric word tokens.

    Plain ``str.split`` keeps case and punctuation attached, so a query
    token such as "Moog?" could never match "Moog," in a document.
    """
    return re.findall(r"\w+", text.lower())


# Tokenize the text data (missing texts become empty token lists)
tokenized_texts = [_tokenize(text) for text in data['text'].fillna('')]

# Create a BM25 model
bm25 = BM25Okapi(tokenized_texts)


def search(query, bm25, tokenized_texts, data):
    """Print id and score of the best BM25 matches for ``query``.

    The query is tokenized with the same ``_tokenize`` used for the corpus
    above. At most five rows are printed; zero scores are skipped.

    Args:
        query: The query text.
        bm25: A fitted ``BM25Okapi`` model.
        tokenized_texts: The tokenized corpus (unused; kept for callers).
        data: DataFrame whose ``'id'`` column labels the corpus rows.
    """
    scores = bm25.get_scores(_tokenize(query))
    top_results = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:5]
    for idx, score in top_results:
        if score != 0:
            print(f"ID: {data['id'].iloc[idx]}, Score: {score}")


# Example usage
query = "who is Moog?"
search(query, bm25, tokenized_texts, data)


ID: 32338619-0, Score: 5.653210806314239
ID: 32428260-0, Score: 5.6218955080845925
ID: 36905588-0, Score: 5.444500880099623
ID: 36072043-0, Score: 5.353867915241998
ID: 24864183-0, Score: 5.302817414684655


## Document Embedding Pipeline

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os
from pinecone import Pinecone, PodSpec
from tqdm import tqdm


In [None]:
# OpenAI embeddings are not used because they cost money.
# NOTE(review): multi-qa-mpnet-base-dot-v1 and all-mpnet-base-v2 look like
# alternative models that were considered - confirm before switching.
import torch

embedding_model = 'sentence-transformers/all-MiniLM-L6-v2' #all-mpnet-base-v2'

# Use the first GPU when one is available; otherwise fall back to the CPU
# instead of failing on machines without CUDA
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
batch_size = 32
embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': batch_size}
)

  return self.fget.__get__(instance, owner)()


In [None]:
# # Load pre-trained BERT model and tokenizer
# embedding_model = BertModel.from_pretrained('bert-base-uncased')
# encoded_abstracts = []
# for i, txt in tqdm(enumerate(data['text'])):
#     # Tokenize and encode the window
#     inputs = tokenizer(txt, return_tensors="pt", padding=True, truncation=True)
#     with torch.no_grad():
#         outputs = embedding_model(**inputs)
#         encoded_abstracts.append(outputs.last_hidden_state.mean(dim=1))
# embeddings= torch.cat(encoded_abstracts, dim=0)
# print("number of docs:",len(embeddings))
# print("dimension of docs:",len(embeddings[0]))

In [None]:

# Embed every chunk text and report how many vectors came back and their size
embeddings = embed_model.embed_documents(data['text'])
doc_count = len(embeddings)
print("number of docs:", doc_count)
vector_size = len(embeddings[0])
print("dimension of docs:", vector_size)

number of docs: 3009
dimension of docs: 384


In [None]:
# embeddings[0]

In [None]:
# initialize Pinecone
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
index_name = 'medical-articles-embeddings'

# Create the index only when it does not exist yet, so re-running this cell
# does not fail on an already-existing index
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=384,  # vector size reported for the embeddings above, len(embeddings[0])
        metric='cosine',
        spec=PodSpec(environment="gcp-starter")
    )

# Describe the index
index = pc.Index(index_name)
index.describe_index_stats()


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:

# batch_size = 32

# for i in tqdm(range(0, len(data), batch_size)):
#     i_end = min(len(data), i+batch_size)
#     batch = data.iloc[i:i_end]
#     ids = [f"{x['id']}" for _, x in batch.iterrows()]
#     texts = [x['text'] for _, x in batch.iterrows()]
#     inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
#     with torch.no_grad():
#         outputs = embedding_model(**inputs)
#         encoded_abstracts.append(outputs.last_hidden_state.mean(dim=1))
#     embeds= torch.cat(encoded_abstracts, dim=0)


#     # get metadata to store in Pinecone
#     metadata = [
#         {'text': x['text'],
#          'source': x['source']} for _, x in batch.iterrows()
#     ]
#     index.upsert(vectors=zip(ids, embeds, metadata))

In [None]:
batch_size = 32

# Embed the chunks 32 rows at a time and upsert each batch into the index
for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]

    ids = [f"{chunk_id}" for chunk_id in batch['id']]
    texts = list(batch['text'])
    embeds = embed_model.embed_documents(texts)

    # metadata stored next to each vector in Pinecone
    metadata = [
        {'text': text, 'source': source}
        for text, source in zip(batch['text'], batch['source'])
    ]
    index.upsert(vectors=zip(ids, embeds, metadata))



100%|██████████| 2931/2931 [32:34<00:00,  1.50it/s]


In [None]:
# Describe the index
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.93776,
 'namespaces': {'': {'vector_count': 93776}},
 'total_vector_count': 93776}

## Question Answering Chain

In [None]:
# Check the scores for the top 5 matches
query = 'who is Moog'

# Embed the query, then ask the index for its five nearest vectors
query_vector = embed_model.embed_query(query)
results = index.query(vector=query_vector, top_k=5, include_metadata=True)
for match in results['matches']:
    rounded_score = round(match['score'], 2)
    print(f"{rounded_score}: {match['metadata']['text']}")

0.34: Matthew R|Levy, Joshua J,, Data of Publication: 2023 Oct 21, Terms or keywords associated with the article: Mohs micrographic surgery|artificial intelligence|clinical research|general dermatology|medical dermatology|oncology, Journal Title: Experimental dermatology,
0.29: PMID: 35104814, Abstract: INTRODUCTION: Moyamoya disease is characterized by progressive stenotic changes in the terminal segment of the internal carotid artery and the development ofabnormal vascular networks called moyamoya vessels. The objective of this reviewwas to provide a holistic view of the epidemiology, etiology, clinical findings,treatment, and pathogenesis of moyamoya disease. A literature search wasperformed in PubMed using the term ""moyamoya disease,"" for articles publisheduntil 2021. RESULTS: Artificial intelligence (AI) clustering was used to classifythe articles into 5 clusters: (1) pathophysiology (23.5%); (2) clinicalbackground (37.3%); (3) imaging (13.2%); (4) treatment (17.3%); and (5) gen

In [None]:
# Wrap the existing Pinecone index in a LangChain vector store.
# NOTE(review): this import rebinds the name `Pinecone`, which an earlier cell
# imported from the `pinecone` package; re-running that cell after this one
# would call the LangChain class instead - consider importing under an alias.
from langchain.vectorstores import Pinecone
# Arguments: the index handle, the query-embedding function, and 'text' -
# presumably the metadata key that holds the chunk text; confirm against the
# LangChain documentation.
vectorstore = Pinecone(index, embed_model.embed_query, 'text')

In [None]:
query = 'what is the cause of CASK Disorders?'

# Fetch the 3 chunks of text most relevant to the query
vectorstore.similarity_search(query, k=3)

[Document(page_content='for a CASK pathogenic variant and in a male who is hemizygousfor a CASK pathogenic variant on molecular genetic testing. Rarely, affectedmales have a mosaic pathogenic variant. MANAGEMENT: Treatment of manifestations:Treatment is symptomatic and includes standard management of developmental delayand intellectual disability issues; medication for seizures; nutritional support;use of physiotherapy; and treatment of abnormal vision or hearing loss. GENETICCOUNSELING: CASK disorders are inherited in an X-linked manner. Risk to thefamily members of a proband with a CASK disorder depends on'),
 Document(page_content='thefamily members of a proband with a CASK disorder depends on the phenotype (i.e.,MICPCH or XLID +/- nystagmus) in the proband. MICPCH. Most affected females andmales represent simplex cases (i.e., the only affected family member) and havethe disorder as the result of a de novo CASK pathogenic variant. Becauseheterozygous females manifest the phenotype, 