In [21]:
## Data Ingestion -- local plain-text file
from langchain_community.document_loaders import TextLoader

# Point the loader at the input file, then read it. The result is kept in
# `text_documents` for the rest of the notebook.
loader = TextLoader(file_path="analysis_input.txt")
text_documents = loader.load()

In [22]:
import os
import warnings

from dotenv import load_dotenv

# Read key/value pairs from a local .env file and export them into this
# process's environment (os.environ).
load_dotenv()

# `os.environ[...] = os.getenv(...)` raises TypeError when the variable is
# missing, because os.getenv returns None and os.environ only accepts str.
# When the variable IS present the assignment is a no-op, since load_dotenv()
# has already exported it. So check for presence instead and surface a clear
# warning rather than an opaque TypeError.
if not os.environ.get("OPENAI_API_KEY"):
    warnings.warn(
        "OPENAI_API_KEY is not set (checked the environment and .env). "
        "OpenAI-backed components will fail until it is provided."
    )

In [23]:
# Web-based loader: fetch a blog post and parse only selected parts of its HTML
import bs4
from langchain_community.document_loaders import WebBaseLoader

# `parse_only` hands BeautifulSoup a strainer so that only elements carrying
# one of the listed CSS classes are parsed.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    },
)

# Assigns to `text_documents`, rebinding the name if it was already set.
text_documents = loader.load()

In [24]:
## PDF reader -> text splitting -> embeddings -> vector stores (Chroma, FAISS)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings

# --- Load the PDF -----------------------------------------------------------
loader = PyPDFLoader("Multimodal_LLMs_Hallucination_Mitigation.pdf")
docs = loader.load()

# --- Split into overlapping chunks ------------------------------------------
# chunk_size / chunk_overlap are passed straight to the splitter; the overlap
# lets neighbouring chunks share some text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(docs)

# --- Embedding model --------------------------------------------------------
# Ollama-served "llama3" is the active embedding model. Alternatives that were
# tried and are currently disabled:
#   embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
#   ollama_embeddings = OllamaEmbeddings(model="llama2")
#   db_ollama = Chroma.from_documents(documents, ollama_embeddings)
embeddings = OllamaEmbeddings(model="llama3")

# --- Chroma vector store + a sample similarity query ------------------------
db = Chroma.from_documents(documents, embeddings)

query = "Who are the authors of Multimodal LLMs Hallucination Mitigation?"
retrieved_results = db.similarity_search(query)
# Show only the text of the top-ranked chunk.
print(retrieved_results[0].page_content)

# --- FAISS vector store -----------------------------------------------------
# NOTE: this rebinds `db`. From here on `db` is the FAISS store, built from
# only the first 15 chunks, not the Chroma store queried above.
db = FAISS.from_documents(documents[:15], embeddings)

Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry,
Amanda Askell, Pamela Mishkin, Jack Clark, et al. Learning transferable visual models from natural language
supervision. InInternational conference on machine learning, pages 8748–8763. PMLR, 2021.
Adam Roberts, Colin Raffel, and Noam Shazeer. How much knowledge can you pack into the parameters of a
language model?arXiv preprint arXiv:2002.08910, 2020.
Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. A-okvqa:
A benchmark for visual question answering using world knowledge. InEuropean conference on computer
vision, pages 146–162. Springer, 2022.
Zhiqing Sun, Sheng Shen, Shengcao Cao, Haotian Liu, Chunyuan Li, Yikang Shen, Chuang Gan, Liang-Yan
Gui, Yu-Xiong Wang, Yiming Yang, et al. Aligning large multimodal models with factually augmented rlhf.
arXiv preprint arXiv:2309.14525, 2023.


In [25]:
## LangServe deployment and Streamlit front end
import streamlit as st
from fastapi import FastAPI
from langserve import add_routes

# --- LangServe: register the vector store's retriever on a FastAPI app ------
app = FastAPI()
add_routes(
    app,
    db.as_retriever(),
    path="/vectorstore",
)

# --- Streamlit: minimal question box ----------------------------------------
st.title("Document Q&A")
query = st.text_input("Enter your question:")

# Only search once the user has typed something (empty input is falsy).
if query:
    results = db.similarity_search(query)
    # Display the text of the top-ranked chunk only.
    st.write(results[0].page_content)


