In [None]:
pip install transformers faiss-cpu PyPDF2 sentence-transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-

# **RAG-based LLM Python application**

In [None]:
import os
import faiss
import numpy as np
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

def load_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages joined with newlines. A page whose
        ``extract_text()`` yields ``None`` or ``""`` contributes an empty
        string.
    """
    reader = PdfReader(file_path)
    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next; ``or ""`` tolerates pages for which
    # nothing could be extracted.
    return "\n".join((page.extract_text() or "") for page in reader.pages)


def create_embeddings(text, embedder, chunk_size=512):
    """Split ``text`` into fixed-size character chunks and index their embeddings.

    Args:
        text: Document text to index.
        embedder: Object whose ``encode(list_of_str, convert_to_tensor=True)``
            returns a tensor-like value exposing ``.cpu().numpy()`` with one
            row per input string.
        chunk_size: Number of characters per chunk (default 512, the value
            that used to be hard-coded).

    Returns:
        ``(chunks, index)``: the list of chunk strings and a
        ``faiss.IndexFlatL2`` holding one vector per chunk, in the same order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``text`` is empty
            or whitespace-only (there would be nothing to index).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text or not text.strip():
        raise ValueError("No text to index: the document is empty")

    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    embeddings = embedder.encode(chunks, convert_to_tensor=True)
    # FAISS expects a contiguous float32 matrix.
    vectors = np.ascontiguousarray(embeddings.cpu().numpy(), dtype=np.float32)

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    return chunks, index

def retrieve_and_generate_answer(query, chunks, index, embedder, qa_pipeline, top_k=3):
    """Retrieve the chunks nearest to ``query`` and run extractive QA over them.

    Args:
        query: The question to answer.
        chunks: Chunk strings, in the order their vectors were added to ``index``.
        index: Search index exposing ``search(vectors, k=...)``.
        embedder: Object whose ``encode([query], convert_to_tensor=True)``
            returns a tensor-like value exposing ``.cpu().numpy()``.
        qa_pipeline: Callable taking ``question=`` and ``context=`` and
            returning a mapping with an ``'answer'`` key.
        top_k: Maximum number of chunks to retrieve (default 3).

    Returns:
        The ``'answer'`` value from the QA pipeline's response.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if not chunks:
        raise ValueError("No chunks to search")

    query_embedding = embedder.encode([query], convert_to_tensor=True).cpu().numpy()

    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently pull in the last chunk.
    k = max(1, min(top_k, len(chunks)))
    _, indices = index.search(query_embedding, k=k)
    relevant_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    context = " ".join(relevant_chunks)
    response = qa_pipeline(question=query, context=context)

    return response['answer']

def main(file_path, query):
    """Answer ``query`` using the contents of the PDF at ``file_path``.

    Loads the PDF text, builds a FAISS index over its chunk embeddings,
    retrieves the chunks closest to the query and runs an extractive
    question-answering pipeline over them.

    Args:
        file_path: Path to a ``.pdf`` file (extension check is case-insensitive).
        query: The question to answer.

    Returns:
        The answer produced by ``retrieve_and_generate_answer``.

    Raises:
        ValueError: If ``file_path`` does not end in ``.pdf``.
    """
    # Only PDFs are supported. Fail fast with a clear message instead of
    # reaching the code below with `document_text` never assigned.
    if not file_path.lower().endswith('.pdf'):
        raise ValueError(f"Unsupported file type (expected a .pdf): {file_path!r}")
    document_text = load_pdf(file_path)

    # Load models and tokenizer
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

    chunks, index = create_embeddings(document_text, embedder)
    answer = retrieve_and_generate_answer(query, chunks, index, embedder, qa_pipeline)

    return answer

# Script entry point: runs the pipeline once on a sample document and question.
if __name__ == "__main__":
    file_path = 'Sivakami (Document).pdf'  # Change this to your document path
    query = "why are Professionals in information technology (IT) essential in today's fast-paced digital world"

    # main() loads the PDF, builds the index and returns the QA answer.
    answer = main(file_path, query)
    print("Answer:", answer)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Answer: To avoid and treat emotional weariness


# **UNIVERSAL CODE**

In [None]:
import os
import faiss
import requests
from bs4 import BeautifulSoup
import numpy as np
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModel

def load_pdf():
    """Prompt for a PDF path and return the text of all its pages.

    Returns:
        The text of all pages joined with newlines. A page whose
        ``extract_text()`` yields ``None`` or ``""`` contributes an empty
        string.
    """
    # Strip stray whitespace that tends to come along with a pasted path.
    file_path = input("Enter path of pdf file : ").strip()
    reader = PdfReader(file_path)
    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next; ``or ""`` tolerates pages for which
    # nothing could be extracted.
    return "\n".join((page.extract_text() or "") for page in reader.pages)

def load_text():
    """Prompt the user for raw text on stdin and return exactly what was typed."""
    return input("Enter text: ")

def load_url(timeout=30):
    """Prompt for a URL, fetch it and return the text of its ``<p>`` elements.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout (default 30).

    Returns:
        The text of every ``<p>`` element, joined with single spaces.

    Raises:
        Whatever ``requests.get`` raises on connection problems, and whatever
        ``response.raise_for_status()`` raises for an error status.
    """
    url = input("Enter url : ").strip()
    # Without a timeout the request can block forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP error statuses instead of indexing an error page as the document.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))

def create_embeddings(text, tokenizer, model, chunk_size=512):
    """Split ``text`` into fixed-size character chunks and index their embeddings.

    Each chunk is tokenized and passed through ``model``; the token vectors
    in ``last_hidden_state`` are averaged over dimension 1 to give one vector
    per chunk.

    Args:
        text: Document text to index.
        tokenizer: Callable invoked as
            ``tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)``
            whose result is unpacked into ``model(**inputs)``.
        model: Callable returning an object with a ``last_hidden_state``
            attribute.
        chunk_size: Number of characters per chunk (default 512, the value
            that used to be hard-coded).

    Returns:
        ``(chunks, index)``: the list of chunk strings and a
        ``faiss.IndexFlatL2`` holding one vector per chunk, in the same order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``text`` is empty
            or whitespace-only (there would be nothing to index).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text or not text.strip():
        raise ValueError("No text to index: the document is empty")

    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)
        outputs = model(**inputs)
        # Mean-pool the token vectors into a single vector for the chunk.
        embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())

    # FAISS expects a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(np.vstack(embeddings), dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return chunks, index

def retrieve_and_generate_answer(query, chunks, index, tokenizer, model, qa_pipeline, top_k=3):
    """Retrieve the chunks nearest to ``query`` and run extractive QA over them.

    The query is embedded the same way as the chunks: tokenized, passed
    through ``model`` and mean-pooled over dimension 1 of ``last_hidden_state``.

    Args:
        query: The question to answer.
        chunks: Chunk strings, in the order their vectors were added to ``index``.
        index: Search index exposing ``search(vectors, k=...)``.
        tokenizer: Callable invoked as
            ``tokenizer(query, return_tensors='pt', truncation=True, padding=True)``.
        model: Callable returning an object with a ``last_hidden_state``
            attribute.
        qa_pipeline: Callable taking ``question=`` and ``context=`` and
            returning a mapping with an ``'answer'`` key.
        top_k: Maximum number of chunks to retrieve (default 3).

    Returns:
        The ``'answer'`` value from the QA pipeline's response.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if not chunks:
        raise ValueError("No chunks to search")

    query_inputs = tokenizer(query, return_tensors='pt', truncation=True, padding=True)
    query_embedding = model(**query_inputs).last_hidden_state.mean(dim=1).detach().numpy()

    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently pull in the last chunk.
    k = max(1, min(top_k, len(chunks)))
    _, indices = index.search(query_embedding, k=k)
    relevant_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    context = " ".join(relevant_chunks)
    response = qa_pipeline(question=query, context=context)

    return response['answer']

# Show the menu of supported input sources, then read the selection as an integer.
print("\nchoose input text format", "1.pdf", "2.text", "3.url", sep="\n")
choice = int(input())

#source.startswith('http://') or source.startswith('https://')
def main():
    """Load a document from the selected source and answer the current query.

    Takes no arguments: it reads the module-level ``choice`` (1 = PDF,
    2 = text, 3 = URL) and ``query`` that the surrounding cell code assigns.

    Returns:
        The answer produced by ``retrieve_and_generate_answer``.

    Raises:
        ValueError: If ``choice`` is not 1, 2 or 3.
    """
    if choice == 1:
        document_text = load_pdf()
    elif choice == 3:
        document_text = load_url()
    elif choice == 2:
        document_text = load_text()
    else:
        # Printing and carrying on would only crash later on the unassigned
        # `document_text`; stop here with an explicit error instead.
        raise ValueError(f"Invalid choice: {choice!r} (expected 1, 2 or 3)")

    # Load models and tokenizer
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

    chunks, index = create_embeddings(document_text, tokenizer, model)
    answer = retrieve_and_generate_answer(query, chunks, index, tokenizer, model, qa_pipeline)

    return answer

if __name__ == "__main__":
    # `query` is assigned at module level: main() takes no arguments and
    # reads it (together with `choice`) as a global.
    query = input("Enter Query : ")

    answer = main()
    print("Answer:", answer)



choose input text format
1.pdf
2.text
3.url
1
Enter Query : why are Professionals in information technology (IT) essential in today's fast-paced digital world
Enter path of pdf file : /content/Sivakami (Document).pdf
Answer: To avoid and treat emotional weariness


In [None]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.

In [None]:
import streamlit as st
import os
import faiss
import requests
from bs4 import BeautifulSoup
import numpy as np
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModel

def load_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages joined with newlines. A page whose
        ``extract_text()`` yields ``None`` or ``""`` contributes an empty
        string.
    """
    reader = PdfReader(file)
    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next; ``or ""`` tolerates pages for which
    # nothing could be extracted.
    return "\n".join((page.extract_text() or "") for page in reader.pages)

def load_text(text):
    """Return ``text`` unchanged (identity function)."""
    return text

def load_url(url, timeout=30):
    """Fetch ``url`` and return the text of its ``<p>`` elements.

    Args:
        url: Address to download.
        timeout: Seconds passed to ``requests.get`` as its timeout (default 30).

    Returns:
        The text of every ``<p>`` element, joined with single spaces.

    Raises:
        Whatever ``requests.get`` raises on connection problems, and whatever
        ``response.raise_for_status()`` raises for an error status.
    """
    # Without a timeout the request can block forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP error statuses instead of indexing an error page as the document.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))

def create_embeddings(text, tokenizer, model, chunk_size=512):
    """Split ``text`` into fixed-size character chunks and index their embeddings.

    Each chunk is tokenized and passed through ``model``; the token vectors
    in ``last_hidden_state`` are averaged over dimension 1 to give one vector
    per chunk.

    Args:
        text: Document text to index.
        tokenizer: Callable invoked as
            ``tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)``
            whose result is unpacked into ``model(**inputs)``.
        model: Callable returning an object with a ``last_hidden_state``
            attribute.
        chunk_size: Number of characters per chunk (default 512, the value
            that used to be hard-coded).

    Returns:
        ``(chunks, index)``: the list of chunk strings and a
        ``faiss.IndexFlatL2`` holding one vector per chunk, in the same order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``text`` is empty
            or whitespace-only (there would be nothing to index).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text or not text.strip():
        raise ValueError("No text to index: the document is empty")

    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)
        outputs = model(**inputs)
        # Mean-pool the token vectors into a single vector for the chunk.
        embeddings.append(outputs.last_hidden_state.mean(dim=1).detach().numpy())

    # FAISS expects a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(np.vstack(embeddings), dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return chunks, index

def retrieve_and_generate_answer(query, chunks, index, tokenizer, model, qa_pipeline, top_k=3):
    """Retrieve the chunks nearest to ``query`` and run extractive QA over them.

    The query is embedded the same way as the chunks: tokenized, passed
    through ``model`` and mean-pooled over dimension 1 of ``last_hidden_state``.

    Args:
        query: The question to answer.
        chunks: Chunk strings, in the order their vectors were added to ``index``.
        index: Search index exposing ``search(vectors, k=...)``.
        tokenizer: Callable invoked as
            ``tokenizer(query, return_tensors='pt', truncation=True, padding=True)``.
        model: Callable returning an object with a ``last_hidden_state``
            attribute.
        qa_pipeline: Callable taking ``question=`` and ``context=`` and
            returning a mapping with an ``'answer'`` key.
        top_k: Maximum number of chunks to retrieve (default 3).

    Returns:
        The ``'answer'`` value from the QA pipeline's response.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if not chunks:
        raise ValueError("No chunks to search")

    query_inputs = tokenizer(query, return_tensors='pt', truncation=True, padding=True)
    query_embedding = model(**query_inputs).last_hidden_state.mean(dim=1).detach().numpy()

    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently pull in the last chunk.
    k = max(1, min(top_k, len(chunks)))
    _, indices = index.search(query_embedding, k=k)
    relevant_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    context = " ".join(relevant_chunks)
    response = qa_pipeline(question=query, context=context)

    return response['answer']

# Streamlit UI
#
# Streamlit re-executes this whole script on every widget interaction, so
# the expensive steps (model loading, index building) are wrapped in cached
# helpers instead of being redone on each rerun.
st.title("Document QA System")

st.sidebar.title("Input Options")
input_option = st.sidebar.selectbox("Select input format:", ("PDF", "Text", "URL"))


@st.cache_resource
def _load_models():
    """Load the tokenizer, embedding model and QA pipeline once and reuse them."""
    cached_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    cached_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    cached_qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    return cached_tokenizer, cached_model, cached_qa


@st.cache_resource(max_entries=8)
def _build_index(text):
    """Chunk and index ``text``; cached per distinct document text."""
    cached_tokenizer, cached_model, _ = _load_models()
    return create_embeddings(text, cached_tokenizer, cached_model)


document_text = ""

if input_option == "PDF":
    uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is not None:
        document_text = load_pdf(uploaded_file)
elif input_option == "Text":
    document_text = st.sidebar.text_area("Enter text:")
elif input_option == "URL":
    url = st.sidebar.text_input("Enter URL:")
    if url:
        try:
            document_text = load_url(url)
        except requests.RequestException as exc:
            # Show network / bad-URL problems in the page instead of a traceback.
            st.error(f"Could not fetch the URL: {exc}")

# Whitespace-only input carries nothing to index, so treat it as "no document".
if document_text and document_text.strip():
    st.write("Document loaded successfully!")

    # Load models and tokenizer (cached across reruns)
    tokenizer, model, qa_pipeline = _load_models()

    chunks, index = _build_index(document_text)

    query = st.text_input("Enter your query:")
    if query:
        answer = retrieve_and_generate_answer(query, chunks, index, tokenizer, model, qa_pipeline)
        st.write("Answer:", answer)


2024-07-18 14:40:13.306 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-07-18 14:40:13.312 Session state does not function when running a script without `streamlit run`
