#### Loading the data

In [1]:
import numpy as np
import pandas as pd

In [2]:
maude_fda_data = pd.read_excel('fda_device_data - Subset.xlsx')

In [3]:
maude_fda_data.head()

Unnamed: 0,event_type,date_of_event,product_problems,complaint_txt,follow_up,manufacturer_narrative,device_generic_name,device_manufacturer_name,device_model_number,device_lot_number,device_report_product_code,expiration_date_of_device,date_returned_to_manufacturer,device_availability,device_operator,device_name
0,Malfunction,20200418,"['Battery Problem', 'Power Problem']",INFORMATION RECEIVED BY MEDTRONIC INDICATED TH...,,(B)(4). CURRENTLY IT IS UNKNOWN WHETHER OR NOT...,"ARTIFICIAL PANCREAS DEVICE SYSTEM, THRESHOLD S...",MEDTRONIC PUERTO RICO OPERATIONS CO.,MMT-1715K,HG1B44Y,OZO,,20200429.0,Device was returned to manufacturer,LAY USER/PATIENT,"Automated Insulin Dosing , Threshold Suspend"
1,Malfunction,20200421,"['Excess Flow or Over-Infusion', 'Battery Prob...",INFORMATION RECEIVED BY MEDTRONIC INDICATED TH...,,CURRENTLY IT IS UNKNOWN WHETHER OR NOT THE DEV...,"PUMP, INFUSION, INSULIN, TO BE USED WITH INVAS...",MEDTRONIC MINIMED,MMT-XXX,,OYC,,,No,LAY USER/PATIENT,"Pump, Infusion, Insulin, To Be Used With Invas..."
2,Injury,20200407,"['Failure to Charge', 'Battery Problem']",FOLLOWING THE BATTERY PERFORMANCE ALERT (BPA) ...,,THE RESULTS/METHOD AND CONCLUSION CODES ALONG ...,IMPLANTABLE CARDIOVERTER DEFIBRILLATOR,"ST. JUDE MEDICAL, INC.(CRM-SUNNYVALE)",CD1357-40Q,4443992,LWS,20160131.0,20200422.0,Device was returned to manufacturer,HEALTH PROFESSIONAL,Implantable Cardioverter Defibrillator (Non-Crt)
3,Malfunction,20200402,['Battery Problem'],IT WAS REPORTED THAT THE PUMP BATTERY WAS DEPL...,,NO PRODUCT WAS RETURNED FOR EVALUATION. SHOULD...,CONTINUOUS GLUCOSE MONITOR,TANDEM DIABETES CARE,1000096,,OYC,,,No,LAY USER/PATIENT,"Pump, Infusion, Insulin, To Be Used With Invas..."
4,Malfunction,20200414,['Battery Problem'],IT WAS REPORTED THAT THIS DEVICE TRIPPED ERI O...,,THE DEVICE WAS NOT RETURNED FOR ANALYSIS. THE ...,ICD,BIOTRONIK SE & CO. KG,383594,,LWS,20150228.0,20210302.0,Device was returned to manufacturer,HEALTH PROFESSIONAL,Implantable Cardioverter Defibrillator (Non-Crt)


#### CSV Knowledge Base - Does not work using Phi 

In [4]:
from phi.knowledge.csv import CSVKnowledgeBase
from phi.vectordb.chroma import ChromaDb

In [5]:
maude_knowledge_base = CSVKnowledgeBase(
    path="fda_device_data - Subset.xlsx",
    vector_db=ChromaDb(collection="maude"),
)
# Comment out after first run
#maude_knowledge_base.load(recreate=False)

In [6]:
maude_knowledge_base

CSVKnowledgeBase(reader=CSVReader(chunk=True, chunk_size=3000, separators=['\n', '\n\n', '\r', '\r\n', '\n\r', '\t', ' ', '  '], chunking_strategy=<phi.document.chunking.fixed.FixedSizeChunking object at 0x0000028E0160C040>), vector_db=<phi.vectordb.chroma.chromadb.ChromaDb object at 0x0000028E0160C730>, num_documents=5, optimize_on=1000, driver='knowledge', chunking_strategy=<phi.document.chunking.fixed.FixedSizeChunking object at 0x0000028E0160C040>, path='fda_device_data - Subset.xlsx')

In [7]:
from phi.agent import Agent

agent = Agent(
    knowledge=maude_knowledge_base,
    search_knowledge=True,
)
agent.knowledge.load(recreate=False)

agent.print_response("What are the potential hazards present from the knowledge base?")


Output()

##### Response:  I couldn't find any specific information in the knowledge base regarding potential hazards. If you have more specific details or a certain context in mind, please let me know so I can assist you better! 

In [8]:
!pip install openai lancedb tantivy pypdf sqlalchemy



In [9]:
from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.embedder.openai import OpenAIEmbedder
from phi.knowledge.pdf import PDFUrlKnowledgeBase
from phi.knowledge.csv import CSVKnowledgeBase
from phi.vectordb.lancedb import LanceDb, SearchType

# Create a knowledge base from a PDF
knowledge_base = CSVKnowledgeBase(
    path="H:\Interview Preparation\Coding\GenAI\Tryouts\6-Maude DB Analysis\fda_device_data - Subset.xlsx",
    # Use LanceDB as the vector database
    vector_db=LanceDb(
        table_name="maude",
        uri="tmp/lancedb",
        search_type=SearchType.vector,
        embedder=OpenAIEmbedder(model="text-embedding-3-small"),
    ),
)
# Comment out after first run as the knowledge base is loaded
knowledge_base.load(recreate=False)

agent = Agent(
    model=OpenAIChat(id="gpt-4o"),
    # Add the knowledge base to the agent
    knowledge=knowledge_base,
    show_tool_calls=True,
    markdown=True,
)
agent.print_response("What are the potential problem present from the knowledge base?", stream=True)


Output()

## Langchain RAG- Implementation 

In [None]:
#!pip install unstructured openpyxl chardet iso-639

In [None]:
!pip uninstall iso-639
#pip install iso-639


In [47]:
!pip show iso-639


Name: iso-639
Version: 0.4.5
Summary: Python library for ISO 639 standard
Home-page: https://github.com/noumar/iso639
Author: Mikael Karlsson
Author-email: i8myshoes@gmail.com
License: AGPLv3
Location: h:\interview preparation\coding\genai\tryouts\venv\lib\site-packages
Requires: 
Required-by: 


In [48]:
import iso639

print(hasattr(iso639, "Language"))
print(hasattr(iso639, "LanguageNotFoundError"))


False
False


##### Data Splitting from excel file 

In [10]:
from langchain_community.document_loaders import UnstructuredCSVLoader, UnstructuredExcelLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, RecursiveJsonSplitter

In [11]:
file_path = r"H:\Interview Preparation\Coding\GenAI\Tryouts\6-Maude DB Analysis\fda_device_data - Subset.xlsx"
file_path_1 = r'H:\Interview Preparation\Coding\GenAI\Tryouts\6-Maude DB Analysis\fda_device_data.xlsx'

In [12]:
loader = UnstructuredExcelLoader(file_path_1)
docs = loader.load()
splitted_documents = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=200).split_documents(docs)

AttributeError: module 'iso639' has no attribute 'LanguageNotFoundError'

In [17]:
splitted_documents



In [19]:
len(splitted_documents), splitted_documents[:10]

(25,

##### Creating vector DB from documents using embedding

In [20]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings, OpenAIEmbeddings, HuggingFaceEmbeddings

In [21]:
ollama_embedding = OllamaEmbeddings()
vector_store_db = FAISS.from_documents(splitted_documents, ollama_embedding)
vector_store_db

  ollama_embedding = OllamaEmbeddings()


<langchain_community.vectorstores.faiss.FAISS at 0x2a1e34ccf10>

In [24]:
vector_store_db.save_local("Ollama_maude_vector_store")
#vector_store_db = FAISS.load_local("Ollama_maude_vector_store", ollama_embeddings)

In [25]:
openAI_embedding = OpenAIEmbeddings()
openAI_vector_store_db = FAISS.from_documents(splitted_documents, openAI_embedding)
openAI_vector_store_db

  openAI_embedding = OpenAIEmbeddings()


<langchain_community.vectorstores.faiss.FAISS at 0x2a1e3eff3d0>

In [26]:
openAI_vector_store_db.save_local("OpenAI_maude_vector_store")
#openAI_vector_store_db = FAISS.load_local("OpenAI_maude_vector_store", openAI_embedding)

##### Query Vector Store

In [22]:
query="What are the potential hazards present in the data?"
docs=vector_store_db.similarity_search(query)
docs



In [27]:
query="What are the potential hazards present in the data?"
openai_docs=openAI_vector_store_db.similarity_search(query)
openai_docs



In [23]:
docs_and_score=vector_store_db.similarity_search_with_score(query)
docs_and_score

  18848.082),
  18910.621),
  19340.734),
  19899.428)]

In [29]:
openai_docs_and_score=openAI_vector_store_db.similarity_search_with_score(query)
openai_docs_and_score

  0.48276165),
  0.4829978),
  0.4921661),
  0.49284127)]

##### Creating retrievers & query 

In [37]:
retriever=vector_store_db.as_retriever()
retr_docs=retriever.invoke(query)
retr_docs



In [30]:
openai_retriever=openAI_vector_store_db.as_retriever()
openai_retr_docs=openai_retriever.invoke(query)
openai_retr_docs



#### Document Chain & Context Passing

#### Creating Agentic RAG