## Data Preparation

In [None]:
# Combine several columns of the dataset into a single text column
import pandas as pd

# Read the original CSV file.
# dtype=str keeps every field as text: a column such as DP may contain only
# bare years, which pandas would otherwise infer as numbers, and concatenating
# a str label with a numeric column raises a TypeError.
df_part = pd.read_csv(
    'articles.csv',
    index_col='PMID',
    usecols=['PMID', 'TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH'],
    dtype=str,
)

# Create a new DataFrame that shares the PMID index of the source data
new_df = pd.DataFrame(index=df_part.index)

# Combine the information into a single column, one "Label: value" line per
# field. Missing values are replaced with the literal text 'None'.
# NOTE(review): 'Data of Publication' is presumably a typo for 'Date of
# Publication'; the label is left unchanged because it is part of the
# generated text.
new_df['CD'] = (
    'PMID: ' + df_part.index.astype(str) + '\n' +
    'Abstract: ' + df_part['AB'].fillna('None') + '\n' +
    'Title: ' + df_part['TI'].fillna('None') + '\n' +
    'Authors: ' + df_part['FAU'].fillna('None') + ',\n' +
    'Data of Publication: ' + df_part['DP'].fillna('None') + '\n' +
    'Terms or keywords associated with the article: ' + df_part['OT'].fillna('None') + '\n' +
    'Journal Title: ' + df_part['JT'].fillna('None') + '\n' +
    'Medical subject headings: ' + df_part['MH'].fillna('None') + '\n'
)
# Link back to the PubMed page of each article
new_df['source'] = 'https://pubmed.ncbi.nlm.nih.gov/' + df_part.index.astype(str)



# Function to filter out lines whose value is the 'None' placeholder
def filter_lines(text):
    """Drop placeholder lines from a newline-separated record and join the rest.

    A line is dropped when, ignoring surrounding whitespace and any trailing
    commas, it ends with 'None'. Ignoring the trailing comma matters because
    a line such as 'Authors: None,' would otherwise be kept.

    Args:
        text: Record made of '\\n'-separated lines.

    Returns:
        The kept lines, unmodified, joined with ', '.
    """
    kept_lines = [
        line for line in text.split('\n')
        # rstrip(', ') removes trailing commas/spaces only for the check;
        # the line itself is kept as written.
        if not line.strip().rstrip(', ').endswith('None')
    ]
    return ', '.join(kept_lines)

# Remove the placeholder lines from every combined record
new_df['CD'] = new_df['CD'].map(filter_lines)

# Persist the result to CSV, then preview the first rows
new_df.to_csv('additional_data.csv')
new_df.head()

In [None]:
import pandas as pd
import os
from transformers import LlamaTokenizer, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Reload the combined records from the CSV file
docs = pd.read_csv('additional_data.csv')

# Hugging Face token taken from the environment (None when HF_AUTH is unset)
hf_auth = os.getenv('HF_AUTH')

# Load the tokenizer of the all-distilroberta-v1 sentence-transformers checkpoint.
# Alternative checkpoints noted by the author:
#   sentence-transformers/all-mpnet-base-v2
#   sentence-transformers/all-MiniLM-L6-v2
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-distilroberta-v1')
# Length of a text measured in tokens
def token_len(text):
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``."""
    return len(tokenizer.tokenize(text))


# Optional sanity check: token-count statistics over all combined records
token_counts = [token_len(text) for text in docs['CD']]
min_tokens = min(token_counts)
max_tokens = max(token_counts)
avg_tokens = int(sum(token_counts) / len(token_counts))
print(f"""Min: {min_tokens}
Avg: {avg_tokens}
Max: {max_tokens}""")

In [None]:
# tokenizer.convert_ids_to_tokens(tokenizer("PMID 24278995 Title CASK Disorders Authors Moog Ute Kutsche Kerstin Data of Publication 1993 Terms or keywords associated with the article Intellectual Disability and Microcephaly with Pontine and Cerebellar Hypoplasia MICPCH XLinked Intellectual Disability XLID with or without Nystagmus Peripheral plasma membrane").input_ids)

In [None]:
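# The splitter tries its separators in order and recursively re-splits any piece
# that is still longer than chunk_size; chunk_size and chunk_overlap are measured
# with length_function, i.e. in tokens rather than characters.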
from langchain.text_splitter import NLTKTextSplitter,CharacterTextSplitter, RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    length_function=token_len,
    # separators=['\n\n', '\n', ' ', '']
)

# Smoke-test the splitter on the first record and preview at most three chunks.
# Iterating over a slice avoids the IndexError that fixed indices such as
# chunks[1] / chunks[2] raise when the record yields fewer than three chunks,
# and keeps every label in sync with the chunk that is actually printed.
chunks = text_splitter.split_text(docs['CD'].iloc[0])
print(f"length of chunk: {len(chunks)}")  # this is the number of chunks
for idx, chunk in enumerate(chunks[:3]):
    print(f"Content of chunk{idx}:\n{chunk}")
    # len() counts characters here, not the tokens that chunk_size is measured in
    print(f'the length of chunk {idx} is:', len(chunk))
    print("*"*100)
# print(f"Content of chunk1:\n{chunks[3]}")
# print('the length of chunk 3 is:', len(chunks[3]))
# print("*"*100)
# print(f"Content of chunk1:\n{chunks[4]}")
# print('the length of chunk 4 is:', len(chunks[4]))
# print("*"*100)
# print(f"Content of chunk1:\n{chunks[5]}")
# print('the length of chunk 5 is:', len(chunks[5]))
# print("*"*100)
# print(f"Content of chunk1:\n{chunks[6]}")
# print('the length of chunk 6 is:', len(chunks[6]))
# print("*"*100)
# print(f"Content of chunk1:\n{chunks[7]}")
# print('the length of last chunk is:', len(chunks[7]))

In [None]:
from tqdm import tqdm

# import re

# def remove_punctuation(text):
#     # Define the pattern to match punctuation
#     punctuation_pattern = r'[^\w\s]'
    
#     # Use regex to substitute punctuation with an empty string
#     text_without_punctuation = re.sub(punctuation_pattern, '', text)
    
#     return text_without_punctuation

# Split every record into chunks and collect one dict per chunk.
# zip() over the columns pairs each text with its PMID and source by position,
# and total= tells tqdm how many records there are so it can show real progress
# (tqdm cannot infer a length from an enumerate() object).
documents = []
for pmid, source, doc in tqdm(zip(docs['PMID'], docs['source'], docs['CD']), total=len(docs)):
    # chunk ids are "<PMID>-<position of the chunk within its record>"
    for i, chunk in enumerate(text_splitter.split_text(doc)):
        documents.append({
            'id': f"{pmid}-{i}",
            'text': chunk,
            'resource': source,
        })

len(documents)

In [None]:
# Convert the list of dictionaries to a DataFrame
import pandas as pd
import os

# Chunk table cache: written from `documents` when the file does not exist yet,
# reused as-is otherwise. The table is always read back from the CSV so that
# `data` is parsed the same way on both paths.
CHUNKS_CSV = 'data_distilroberta_recursive_400_50.csv'
if not os.path.exists(CHUNKS_CSV):
    pd.DataFrame(documents).to_csv(CHUNKS_CSV, index=False)
data = pd.read_csv(CHUNKS_CSV)
data.head()

In [None]:
# from sentence_transformers import SentenceTransformer

# # Load the model
# model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

# # Input sentence
# # sentence = "variant has been identifiedin an affected family member, prenatal testing for a pregnancy at increased riskand preimplantation genetic testing for a CASK disorder are possible., Title: CASK Disorders., Authors: Moog, Ute|Kutsche, Kerstin,, Data of Publication: 1993, Terms or keywords associated with the article: Intellectual Disability and Microcephaly with Pontine and Cerebellar Hypoplasia (MICPCH)|X-Linked Intellectual Disability (XLID) with or without Nystagmus|Peripheral plasma membrane protein CASK|CASK|CASK Disorders,"

# # Get the sentence embedding
# embedding = model.encode(sentence)

# # Output the embedded representation
# print(embedding)


In [None]:
# from transformers import AutoTokenizer

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# # Tokenize the sentence
# tokens = tokenizer.tokenize(sentence)

# # Output the tokens
# print(tokens)
# print(len(tokens))


In [None]:
# Find the top 5 relevant articles for a given query using TF-IDF and cosine similarity
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load CSV data
# additional_docs = pd.read_csv('additional_data.csv')

# Create a word-level TF-IDF vectorizer that ignores English stop words.
# A character-based vectorizer could not find the names of the authors.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Missing texts are replaced with '' before fitting
tfidf_matrix = tfidf_vectorizer.fit_transform(data['text'].fillna(''))

# Function to search for queries
def search(query, tfidf_matrix, data, top_n=5):
    """Print the chunks that score highest for ``query`` under TF-IDF.

    Args:
        query: Free-text query string. It is vectorized with the module-level
            ``tfidf_vectorizer``, so ``tfidf_matrix`` must come from that
            same vectorizer.
        tfidf_matrix: TF-IDF matrix of the chunk texts; row i is scored for
            row i of ``data``.
        data: DataFrame with an ``id`` column, positionally aligned with
            ``tfidf_matrix``.
        top_n: Maximum number of results to print (default 5). Values below
            zero are treated as zero.

    Returns:
        None. One "ID: ..., Score: ..." line is printed per result; chunks
        whose score is 0 are skipped.
    """
    query_vector = tfidf_vectorizer.transform([query])
    # linear_kernel is a plain dot product between the query and every row
    scores = linear_kernel(query_vector, tfidf_matrix).flatten()
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Report the best top_n chunks that matched at all
    for idx, score in ranked[:max(top_n, 0)]:
        if score != 0:
            print(f"ID: {data['id'].iloc[idx]}, Score: {score}")


In [None]:
# Example query against the TF-IDF index
search("who is Moog?", tfidf_matrix=tfidf_matrix, data=data)

In [None]:
# BM25 implementation; it is not recommended,
# as it produces the same results but takes much longer to run
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse


class BM25(object):
    """BM25 scorer that reuses a TfidfVectorizer for its vocabulary and IDF values.

    ``b`` and ``k1`` are the constants used in the denominator and numerator
    of the score computed in ``transform``.
    """

    def __init__(self, b=0.75, k1=1.6):
        # norm=None / smooth_idf=False: no vector normalisation, unsmoothed idf
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit the vectorizer to documents X and store their mean length in ``self.avdl``."""
        self.vectorizer.fit(X)
        # Calling transform through super(TfidfVectorizer, ...) bypasses
        # TfidfVectorizer's own transform (the count-only step, see transform below)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Row sums are per-document totals; their mean is the average length
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """Calculate BM25 between query q and documents X.

        Reads ``self.avdl`` and the fitted IDF values, so ``fit`` must have
        been called first. Returns a flat array with one score per row of X.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1  # per-document totals as a flat array
        # [q] is a one-element list, so the result has a single row to unpack
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        # (only the columns of terms that occur in the query are kept)
        X = X.tocsc()[:, q.indices]
        # per-document length term, broadcast across the kept columns
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
        # to idf(t) = log [ n / df(t) ] by subtracting 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        # sum the per-term contributions of each document
        return (numer / denom).sum(1).A1

# Example usage: the query passed to the BM25-based search defined below

query = "who is Moog?"
def search(query, data, top_n=5):
    """Print the chunks that score highest for ``query`` under BM25.

    A new ``BM25`` model is fitted on ``data['text']`` on every call.

    Args:
        query: Free-text query string.
        data: DataFrame with ``id`` and ``text`` columns.
        top_n: Maximum number of results to print (default 5). Values below
            zero are treated as zero.

    Returns:
        None. One "ID: ..., Score: ..." line is printed per result; chunks
        whose score is 0 are skipped.
    """
    # Missing texts (NaN) are not valid documents for the vectorizer, so
    # replace them with empty strings before fitting and scoring.
    texts = data['text'].fillna('')
    bm25 = BM25()
    bm25.fit(texts)
    scores = bm25.transform(query, texts)
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Report the best top_n chunks that matched at all
    for idx, score in ranked[:max(top_n, 0)]:
        if score != 0:
            print(f"ID: {data['id'].iloc[idx]}, Score: {score}")

# Run the BM25 search for the example query
search(query=query, data=data)


In [None]:
import pandas as pd
from rank_bm25 import BM25Okapi

# Load CSV data
# additional_docs = pd.read_csv('additional_data.csv')

# Whitespace-tokenize every chunk text (missing texts become empty token lists)
tokenized_texts = list(map(str.split, data['text'].fillna('')))

# Build the BM25 model over the tokenized chunks
bm25 = BM25Okapi(tokenized_texts)

# Function to search for queries
def search(query, bm25, tokenized_texts, data, top_n=5):
    """Print the chunks that score highest for ``query`` according to ``bm25``.

    Args:
        query: Free-text query string; it is split on whitespace before scoring.
        bm25: Object whose ``get_scores(tokens)`` returns one score per chunk.
        tokenized_texts: Not used by this function; kept so existing calls
            keep working.
        data: DataFrame with an ``id`` column, positionally aligned with the
            scores returned by ``bm25``.
        top_n: Maximum number of results to print (default 5). Values below
            zero are treated as zero.

    Returns:
        None. One "ID: ..., Score: ..." line is printed per result; chunks
        whose score is 0 are skipped.
    """
    scores = bm25.get_scores(query.split())
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Report the best top_n chunks that matched at all
    for idx, score in ranked[:max(top_n, 0)]:
        if score != 0:
            print(f"ID: {data['id'].iloc[idx]}, Score: {score}")

# Example usage: run the rank_bm25-based search for a sample question
query = "who is Moog?"
search(query, bm25=bm25, tokenized_texts=tokenized_texts, data=data)
