In [1]:
pip -q install langchain openai tiktoken chromadb 

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
import dotenv
# Populate the process environment from a .env file, then read the OpenAI key
# from it so the secret never has to be written into the notebook.
dotenv.load_dotenv()
open_api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
import pandas as pd
import langchain
import time
import json
import numpy as np
import shutil

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import JSONLoader

In [4]:
# # Load the original JSON
# with open("datasets/xray_articles.json", "r") as file:
#     data = json.load(file)

# # Process each document
# for doc in data:
#     doc['Authors'] = ' , '.join(doc['Authors'])
#     doc['FullText'] = ' , '.join(doc['FullText'])

# # Save the processed JSON
# with open("datasets/xray_articles_processed.json", "w") as file:
#     json.dump(data, file, indent=4)

In [5]:
# The jq expression selects the FullText field of every element in the
# top-level JSON array; each selected value is loaded as one document.
loader = JSONLoader(
    file_path="datasets/xray_articles_processed.json",
    jq_schema='.[].FullText',
    text_content=True,
)
documents = loader.load()

In [6]:
# Cut the loaded documents into chunks of at most 1000 characters, with
# 200 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [7]:
# Number of chunks produced by the splitter
len(texts)

62557

In [8]:
# Inspect a single chunk (index 3) to see what the splitter produced
texts[3]

Document(page_content='memory and cognitive abilities, and it is the most common cause of dementia in older people. In the world, there were more than 50 million AD sufferers in 2018. Worldwide, there will be 131 million persons with AD in 2050, and the socioeconomic cost will be 9.12 trillion dollars. Forgetting recently acquired information, important dates or events, difficulty in performing simple daily works, and repeatedly asking the same questions are all classic early symptoms of AD. In the final stage, patients’ behavioral changes are also observed. The disease strikes the majority of people in their mid-60s. Scientists agree that the root cause of this neurological disease is a combination of genetics, long-term environmental conditions, and lifestyle. Though some medications are available, AD is not curable, and the damage it causes is permanent. The most common cause of death in Alzheimer’s patients is aspiration pneumonia. , Though machine learning (ML) is a very potent te

In [9]:
# Keep only the first five chunks; this small sample is what gets embedded below
first_five_texts = texts[:5]

In [10]:
# Display the selected chunks
first_five_texts

[Document(page_content='Explainable AI-based Alzheimer’s prediction and management using multimodal data , Background , According to the World Health Organization (WHO), dementia is the seventh leading reason of death among all illnesses and one of the leading causes of disability among the world’s elderly people. Day by day the number of Alzheimer’s patients is rising. Considering the increasing rate and the dangers, Alzheimer’s disease should be diagnosed carefully. Machine learning is a potential technique for Alzheimer’s diagnosis but general users do not trust machine learning models due to the black-box nature. Even, some of those models do not provide the best performance because of using only neuroimaging data. , Objective , To solve these issues, this paper proposes a novel explainable Alzheimer’s disease prediction model using a multimodal dataset. This approach performs a data-level fusion using clinical data, MRI segmentation data, and psychological data. However, currently

In [11]:
# Directory where Chroma keeps the index on disk
persist_directory = 'db'

# OpenAI embeddings for now; the plan is to switch to local embeddings later
embedding = OpenAIEmbeddings()

# Embed only the five sample chunks and store them in a persistent collection
vectordb = Chroma.from_documents(
    documents=first_five_texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [12]:
# Ask the store to write its state to persist_directory
vectordb.persist()
# Drop the reference so the next cell has to reload the store from disk
vectordb = None

In [13]:
# Re-open the store from the persist directory, using the same embedding
# function for queries as was used when the chunks were indexed.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [14]:
# Wrap the vector store in a retriever using its default search settings
retriever = vectordb.as_retriever()

In [15]:
# Smoke-test retrieval with a short query
docs = retriever.get_relevant_documents("Alzheimer’s?")
# How many documents the retriever returned
len(docs)


4

In [16]:
# Rebuild the retriever with k=2 passed through as a search argument
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [17]:
# Show the retriever's search mode and search arguments, one per line
print(retriever.search_type, retriever.search_kwargs, sep='\n')

similarity
{'k': 2}


In [18]:
# Build the question-answering chain on a completion model. Source documents
# are returned with each answer so they can be cited.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0, openai_api_key=open_api_key),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [19]:
## Cite sources
def process_llm_response(llm_response):
    """Print a chain answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. ``'result'`` holds the
            answer text; ``'source_documents'`` (optional) holds objects whose
            ``metadata`` dict may carry a ``'source'`` entry.

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # The key is absent (or empty) when the chain does not return source
    # documents; print the header with no entries instead of raising.
    for source in llm_response.get("source_documents") or []:
        # Not every document records where it came from; show a placeholder
        # rather than failing with KeyError halfway through the listing.
        metadata = source.metadata or {}
        print(metadata.get('source', '<unknown source>'))

In [20]:
# Full example: run one query through the chain and print the answer with its sources
query = "Alzheimer’s?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Alzheimer’s Disease (AD) is a chronic, progressive neurodegenerative disease that gradually deteriorates memory and cognitive abilities, and it is the most common cause of dementia in older people. In the world, there were more than 50 million AD sufferers in 2018. Worldwide, there will be 131 million persons with AD in 2050, and the socioeconomic cost will be 9.12 trillion dollars. Forgetting recently acquired information, important dates or events, difficulty in performing simple daily works, and repeatedly asking the same questions are all classic early symptoms of AD. In the final stage, patients’ behavioral changes are also observed. The disease strikes the majority of people in their mid-60s. Scientists agree that the root cause of this neurological disease is a combination of genetics, long-term environmental conditions, and lifestyle. Though some medications are available, AD is not curable, and the damage it causes is permanent. The most common cause of death in Alzheimer’s p

In [21]:
# Break it down: run the same query again and display the raw response dict
# instead of the formatted printout.
query = "Alzheimer’s?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'Alzheimer’s?',
 'result': ' Alzheimer’s Disease (AD) is a chronic, progressive neurodegenerative disease that gradually deteriorates memory and cognitive abilities, and it is the most common cause of dementia in older people. In the world, there were more than 50 million AD sufferers in 2018. Worldwide, there will be 131 million persons with AD in 2050, and the socioeconomic cost will be 9.12 trillion dollars. Forgetting recently acquired information, important dates or events, difficulty in performing simple daily works, and repeatedly asking the same questions are all classic early symptoms of AD. In the final stage, patients’ behavioral changes are also observed. The disease strikes the majority of people in their mid-60s. Scientists agree that the root cause of this neurological disease is a combination of genetics, long-term environmental conditions, and lifestyle. Though some medications are available, AD is not curable, and the damage it causes is permanent. The most 

# Backing up and deleting the DB

In [22]:
# Archive the persisted db/ directory recursively into db.zip
!zip -r db.zip ./db

  adding: db/ (stored 0%)
  adding: db/1064f4ad-d661-40db-998c-826c4f9c5c00/ (stored 0%)
  adding: db/1064f4ad-d661-40db-998c-826c4f9c5c00/data_level0.bin (deflated 85%)
  adding: db/1064f4ad-d661-40db-998c-826c4f9c5c00/length.bin (deflated 13%)
  adding: db/1064f4ad-d661-40db-998c-826c4f9c5c00/link_lists.bin (stored 0%)
  adding: db/1064f4ad-d661-40db-998c-826c4f9c5c00/header.bin (deflated 61%)
  adding: db/chroma.sqlite3 (deflated 73%)


In [23]:
# Clean up: drop the collection from the store, then persist that change.
vectordb.delete_collection()
vectordb.persist()

# Remove the on-disk store. Use the relative persist directory: an absolute
# `/db` would point at the filesystem root and leave the local store in place.
shutil.rmtree(persist_directory, ignore_errors=True)

# Starting again: reloading the DB

In [24]:
# Restore db/ from the archive; -o overwrites existing files without prompting
!unzip -o db.zip

Archive:  db.zip
   creating: db/
   creating: db/1064f4ad-d661-40db-998c-826c4f9c5c00/
  inflating: db/1064f4ad-d661-40db-998c-826c4f9c5c00/data_level0.bin  
  inflating: db/1064f4ad-d661-40db-998c-826c4f9c5c00/length.bin  
 extracting: db/1064f4ad-d661-40db-998c-826c4f9c5c00/link_lists.bin  
  inflating: db/1064f4ad-d661-40db-998c-826c4f9c5c00/header.bin  
  inflating: db/chroma.sqlite3       


In [25]:
# Give the current user read and write permission on the restored sqlite file
!chmod u+rw db/chroma.sqlite3

In [26]:
# Re-open the restored store with a fresh embedding client and Chroma handle.
# NOTE(review): the recorded run failed in this cell with "OperationalError:
# attempt to write a readonly database". Presumably the Chroma client created
# earlier in the same kernel is still attached to the sqlite file that was
# replaced on disk — confirm, e.g. by restarting the kernel before this cell.
persist_directory = 'db'
embedding = OpenAIEmbeddings()

vectordb2 = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

# Retriever over the reloaded store, again with k=2
retriever = vectordb2.as_retriever(search_kwargs={"k": 2})

OperationalError: attempt to write a readonly database

In [None]:
# Chat model (gpt-3.5-turbo) used by the rebuilt QA chain
turbo_llm_chat = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0,
    openai_api_key=open_api_key,
)

# Completion-style model, left on the library's default model name
turbo_llm_completion = OpenAI(temperature=0, openai_api_key=open_api_key)

In [None]:
# Rebuild the question-answering chain, this time on the chat model and the
# retriever over the reloaded store; source documents are still returned.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm_chat,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
# Full example: ask the chat-model chain the same question and print the answer with its sources
query = "Alzheimer’s?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Alzheimer's disease is a progressive brain disorder that affects memory, thinking, and behavior. It is the most common cause of dementia, a general term for memory loss and other cognitive abilities serious enough to interfere with daily life. The exact cause of Alzheimer's disease is not yet fully understood, but it is believed to involve a combination of genetic, lifestyle, and environmental factors. There is currently no cure for Alzheimer's disease, but there are treatments available to help manage symptoms and improve quality of life.


Sources:
