In [None]:
!pip install llama-index
!pip install llama-index-vector-stores-faiss faiss-cpu

Collecting llama-index
  Downloading llama_index-0.11.10-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.4.0,>=0.3.1 (from llama-index)
  Downloading llama_index_agent_openai-0.3.4-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-cli<0.4.0,>=0.3.1 (from llama-index)
  Downloading llama_index_cli-0.3.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.12.0,>=0.11.10 (from llama-index)
  Downloading llama_index_core-0.11.10-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-embeddings-openai<0.3.0,>=0.2.4 (from llama-index)
  Downloading llama_index_embeddings_openai-0.2.5-py3-none-any.whl.metadata (686 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.3.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.3.1-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post3-py3-none-any.whl.metadata (8.5 kB)
Collecti

In [46]:
import os
from getpass import getpass
from typing import List

import faiss
import openai
import pandas as pd
import tiktoken
from llama_index.core import ServiceContext, PromptHelper
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import set_global_service_context
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from llama_index.core import Document
from llama_index.core import StorageContext
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.faiss import FaissVectorStore

Data prep functions

In [49]:
def load_pdf_documents(pdf_dir):
    """Load the PDF files located directly inside ``pdf_dir``.

    Only files whose name ends in ``.pdf`` (case-insensitive) are handed to
    ``SimpleDirectoryReader``. Pointing the reader at the whole folder would
    also ingest the spreadsheets that ``convert_excel_csv_to_documents``
    already turns into per-row documents, indexing them twice.

    Args:
        pdf_dir: Directory to scan (not recursive).

    Returns:
        The list returned by ``SimpleDirectoryReader.load_data()``, or an
        empty list when the directory holds no PDF files.
    """
    print("now process pdf files")
    # Sorted so the resulting document order is reproducible across runs.
    pdf_paths = sorted(
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf") and os.path.isfile(os.path.join(pdf_dir, name))
    )
    if not pdf_paths:
        # The reader refuses an empty file list, so report and return early.
        print(f"No PDF files found in {pdf_dir}. Skipping.")
        return []

    # file_extractor can be customized; None keeps the reader's defaults.
    reader = SimpleDirectoryReader(input_files=pdf_paths, file_extractor=None)
    return reader.load_data()

def _row_to_text(row) -> str:
    """Render one DataFrame row as comma-separated "<column> is <value>" pairs."""
    return ", ".join(f"{col} is {val}" for col, val in row.items())


def convert_excel_csv_to_documents(input_dir: str) -> List[Document]:
    """Turn every row of the Excel/CSV files in ``input_dir`` into a ``Document``.

    Each non-empty sheet (Excel) or file (CSV) is read with pandas, missing
    values are replaced by the string ``'N/A'``, and each row becomes one
    ``Document`` whose text lists the row's "<column> is <value>" pairs,
    prefixed with the record number and source file (plus sheet name for Excel).

    Files with any other extension are reported and skipped. A file that fails
    to parse is reported and skipped as well, so one bad file does not abort
    the whole run.

    Args:
        input_dir: Directory to scan (not recursive).

    Returns:
        One ``Document`` per data row, in sorted-filename order.
    """
    documents = []
    # os.listdir order is arbitrary; sort so document order is reproducible.
    for filename in sorted(os.listdir(input_dir)):
        file_path = os.path.join(input_dir, filename)
        lowered = filename.lower()

        if lowered.endswith(('.xlsx', '.xls')):
            try:
                # The context manager closes the workbook handle; handing the
                # open ExcelFile to read_excel avoids re-opening it per sheet.
                with pd.ExcelFile(file_path) as excel_file:
                    sheets = excel_file.sheet_names
                    print(f"Processing Excel file: {filename}, Sheets: {sheets}")

                    for sheet in sheets:
                        df = pd.read_excel(excel_file, sheet_name=sheet)

                        if df.empty:
                            print(f"Sheet '{sheet}' in {filename} is empty. Skipping.")
                            continue

                        df = df.fillna('N/A')

                        # Number records by position so a non-integer index
                        # cannot break the "index + 1" arithmetic.
                        for record_no, (_, row) in enumerate(df.iterrows(), start=1):
                            row_text = (
                                f"Sheet '{sheet}', Record {record_no} from {filename}: "
                                f"{_row_to_text(row)}"
                            )
                            documents.append(Document(text=row_text))

            except Exception as e:
                print(f"Error processing Excel file {filename}: {e}")

        elif lowered.endswith('.csv'):
            try:
                df = pd.read_csv(file_path)

                if df.empty:
                    print(f"CSV file {filename} is empty. Skipping.")
                    continue

                df = df.fillna('N/A')

                print(f"Processing CSV file: {filename}, Rows: {len(df)}")

                for record_no, (_, row) in enumerate(df.iterrows(), start=1):
                    row_text = f"Record {record_no} from {filename}: {_row_to_text(row)}"
                    documents.append(Document(text=row_text))

            except Exception as e:
                print(f"Error processing CSV file {filename}: {e}")

        else:
            print(f"Unsupported file format for file: {filename}. Skipping.")

    print(f"Total Documents Created: {len(documents)}")
    return documents

Data prep

In [48]:
from google.colab import drive
drive.mount('/content/drive')

data_dir = "/content/drive/MyDrive/rag_papers"
!ls "/content/drive/MyDrive/rag_papers"

csv_documents = convert_excel_csv_to_documents(data_dir)
pdf_documents = load_pdf_documents(data_dir)
all_documents = csv_documents + pdf_documents

Unsupported file format for file: Differential-pre-malignant-programs-and-microenvir.pdf. Skipping.
Unsupported file format for file: s41588-022-01088-x (2).pdf. Skipping.
Processing Excel file: ScTypeDB_full.xlsx, Sheets: ['Sheet1']
Total Documents Created: 241


Configuration of parameters

In [58]:
# parameters
# Never store the key in the notebook: reuse one already present in the
# environment, otherwise prompt for it without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")
llm_model = "gpt-4o"
temperature = 0.0  # Set to 0 for deterministic responses
max_tokens = 1024
embed_batch_siz = 100  # name (sic) is read by the configuration cell; keep in sync if renamed
embed_model = "text-embedding-ada-002" # can adjust
chunk_size = 1024 # can adjust
chunk_overlap=20 # can adjust
context_window = 4096
num_output = 1024
embedding_dim = 1536  # 1536 for 'text-embedding-ada-002'; 3072 for 'text-embedding-3-large'
similarity_top_k = 5 # Number of top similar chunks to retrieve, can adjust here
streaming = True         # Set to True for streaming responses

In [59]:
# Configuration
# Registers the LLM, embedding model, text splitter and tokenizer on the global
# `Settings` object, using the values defined in the parameters cell.
# NOTE(review): statement order is kept exactly as written; the Settings
# setters may depend on the LLM being assigned first -- confirm before reordering.

# Chat model used to generate answers.
Settings.llm = OpenAI(
    model= llm_model,
    temperature=temperature,
    max_tokens=max_tokens
)

# Embedding model used for both documents and queries.
Settings.embed_model = OpenAIEmbedding(
    model=embed_model,
    embed_batch_size= embed_batch_siz
)

# Splitter configured with the chunk size / overlap from the parameters cell.
Settings.text_splitter = SentenceSplitter(
    separator=" ",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    paragraph_separator="\n\n\n",
    secondary_chunking_regex="[^,.; ]+[,.; ]?",
)

# Use tiktoken's encoder for the selected LLM as the global tokenizer.
Settings.tokenizer = tiktoken.encoding_for_model(llm_model).encode

Settings.context_window = context_window
Settings.num_output = num_output

# Flat L2 FAISS index created with `embedding_dim`, wrapped in a LlamaIndex vector store.
# presumably embedding_dim must equal the output size of `embed_model` -- verify when changing models.
faiss_index = faiss.IndexFlatL2(embedding_dim)
vector_store = FaissVectorStore(faiss_index)

Prep Query Engine

In [60]:
# Chunk with the splitter configured on Settings. A bare SimpleNodeParser()
# is built with library defaults and ignores the chunk_overlap / separator
# values chosen in the configuration cell.
parser = Settings.text_splitter
nodes = parser.get_nodes_from_documents(all_documents)

# VectorStoreIndex only uses an external vector store when it arrives through a
# StorageContext; a bare `vector_store=` keyword is not a constructor parameter
# and the index would silently fall back to its default in-memory store.
# NOTE(review): vectors are appended to the shared FAISS index, so re-run the
# configuration cell before re-running this one to avoid duplicate entries.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
)

query_engine = index.as_query_engine(
    similarity_top_k=similarity_top_k,
    streaming=streaming,
)

Ask questions

In [61]:
# State Questions here
# Per the author's notes: questions about the Excel/CSV rows can be asked
# directly, without extra hints.
query = "markers for pro-B cell, pre-B cell, their similarity? and markers unique for pre-B? makrer unique for pro-B? Answer each question carefully"

# Response
response = query_engine.query(query)
# A streaming engine returns a response whose text arrives incrementally;
# print(response) would wait for the complete answer before showing anything,
# so stream it when the response object supports that.
if hasattr(response, "print_response_stream"):
    response.print_response_stream()
else:
    print(response)

Markers for Pro-B cells include CD27, IgD, CD24, PTPRC, PAX5, CD24, CD38, CD79A, DNTT, C10orf10, VPREB1, ARPP21, CD99, IGLL1, CD9, CD79B, TCL1A, IGLL5, HLA-DQA1, HLA-DQB1, VPREB3, and IGLL5.

Markers for Pre-B cells include CD19, CD27, IgD, CD24, PTPRC, PAX5, CD24, CD38, CD79A, NSMCE1, PCDH9, ACSM3, CCDC191, TCL1A, CD79B, TCL1A, IGLL5, HLA-DQA1, HLA-DQB1, VPREB3, and IGLL5.

**Similarity:**
Both Pro-B and Pre-B cells share several markers, including CD27, IgD, CD24, PTPRC, PAX5, CD24, CD38, CD79A, TCL1A, CD79B, IGLL5, HLA-DQA1, HLA-DQB1, and VPREB3.

**Markers unique for Pre-B cells:**
- CD19
- NSMCE1
- PCDH9
- ACSM3
- CCDC191

**Markers unique for Pro-B cells:**
- DNTT
- C10orf10
- VPREB1
- ARPP21
- CD99
- IGLL1
- CD9


In [63]:
# State Questions here
# Per the author's notes: for the papers (PDFs), adding hint keywords to the
# question helps retrieval.
query = "markers for SSC, detaily list out all of them. key words to find them might be something similar to: marker, marker gene, upregulate, downregulate, overexpress, underexpress, overrepresent, dominant, don't just restric to genes, also include regulon, transcript factors, if it is regulon and transcript factors, list the main group name, and if later in the paper, there are detail discussion of which gene in that group, also list out. "

# Response
response = query_engine.query(query)
# A streaming engine returns a response whose text arrives incrementally;
# print(response) would wait for the complete answer before showing anything,
# so stream it when the response object supports that.
if hasattr(response, "print_response_stream"):
    response.print_response_stream()
else:
    print(response)

Markers for SSCs include:

1. **Gene Signatures**:
   - **MDK**: Encodes a heparin-binding growth factor transiently expressed in early colonic development.
   - **RXRA/RARA/ALDOB**: Luminal retinoic acid-induced absorptive cell-differentiation genes.
   - **ANAX10/ANXA1**: Rostral identity genes paradoxically increased in absorptive cell differentiation.

2. **Super-Regulons**:
   - **WNT- and Hippo-driven super-regulons**: Marked by activities of MYC, ASCL2, TCF7, and TEAD1.
   - **Interleukin signaling and microbiota interaction super-regulon**: Upregulated transcription factor activities include RELB (nuclear factor kB [NF-kB] signaling), IRF1, IRF6, and IRF7.

3. **Transcription Factors**:
   - **RELB**: Associated with NF-kB signaling.
   - **IRF1, IRF6, IRF7**: Reflecting an immunogenic state.

4. **Inflammasome-related Genes**:
   - **IL18**: Implicated in responses to external pathogens.
   - **Gasdermins**: Further implicated in responses to external pathogens.

5. **Other Re

In [None]:
# more prompts here. 