In [2]:
# Show the kernel's working directory before it is changed below.
%pwd

'c:\\Users\\Chinelo\\Data-Chatbot\\research'

In [3]:
import os

# Move from the notebook folder up to the project root so relative paths
# (e.g. the saved "faiss_index" directory) resolve there.
# The guard keeps this cell idempotent: an unconditional chdir("../") would
# climb one more directory every time the cell is re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'c:\\Users\\Chinelo\\Data-Chatbot'

In [5]:
import os
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from dotenv import load_dotenv
from docx import Document 
from PyPDF2 import PdfReader


In [7]:
# Load environment variables (including GROQ_API_KEY) from a local .env file.
load_dotenv()

# Fail early with a readable message. The previous
# `os.environ[...] = os.getenv(...)` raised an opaque TypeError when the key
# was missing and was a no-op when it was present.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [8]:
def get_text_from_pdf(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources, each passed straight to
            ``PdfReader`` (presumably file paths or binary file objects —
            confirm against the caller).

    Returns:
        str: Text of all pages in document order, then page order, joined
        without a separator. Returns ``""`` when ``pdf_docs`` is empty.
    """
    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Guard against a falsy/None result so one page without
            # extractable text (e.g. an image-only page; behaviour depends on
            # the PyPDF2 version — TODO confirm) cannot abort the whole run.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with += inside the loops.
    return "".join(page_texts)


In [9]:
def get_text_from_docx(docx_docs):
    """Gather paragraph text from each DOCX source, one paragraph per line.

    Every paragraph's text is followed by a newline; documents are processed
    in the order given and the result is a single string.
    """
    lines = []
    for source in docx_docs:
        lines.extend(para.text + "\n" for para in Document(source).paragraphs)
    return "".join(lines)

In [10]:
def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
    """Split text into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 10000, the value
            previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 1000, the value previously hard-coded here.

    Returns:
        list[str]: The chunks; an empty list when ``text`` is empty.
    """
    # NOTE(review): the default chunk size looks large for a sentence-embedding
    # model — confirm it fits the embedding model's input limit.
    if not text:
        # Nothing to split; skip building a splitter.
        return []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [11]:
def get_vector_store(text_chunks):
    """Embed the chunks, build a FAISS index and save it to ``faiss_index``.

    Args:
        text_chunks: List of text chunks to embed and index.

    Returns:
        The FAISS vector store that was built and saved. Earlier versions
        returned ``None``, so existing callers that ignore the result are
        unaffected.

    Raises:
        ValueError: If ``text_chunks`` is empty, i.e. no text was extracted
            from the input documents.
    """
    if not text_chunks:
        # Without any chunks there is nothing to build an index from; raise a
        # readable error here instead of failing inside the vector store.
        raise ValueError(
            "No text chunks to index; check that the documents contain "
            "extractable text."
        )
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # Saved relative to the current working directory.
    vector_store.save_local("faiss_index")
    return vector_store

In [12]:
def process_documents(pdf_docs, docx_docs):
    """Run the ingestion pipeline over PDF and DOCX inputs.

    Text is extracted from whichever of the two collections is non-empty
    (PDF text first, then DOCX text), concatenated, split into chunks and
    handed to ``get_vector_store``.
    """
    extracted = []
    if pdf_docs:
        extracted.append(get_text_from_pdf(pdf_docs))
    if docx_docs:
        extracted.append(get_text_from_docx(docx_docs))

    # Chunk the combined text and build the vector store from the chunks.
    get_vector_store(get_text_chunks("".join(extracted)))

In [13]:
def get_conversational_chain():
    """Build the question-answering chain used to answer from retrieved documents.

    The chain is a "stuff" QA chain around a Groq-hosted chat model, driven by
    a prompt that takes ``context`` and ``question`` variables.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    # Imported locally because the notebook's import cell does not bring
    # load_qa_chain into scope; without this the call below raises NameError.
    from langchain.chains.question_answering import load_qa_chain

    # NOTE(review): the numbered sections in the prompt start at "2." and a
    # stray "?" follows {context}; the prompt text is left unchanged here —
    # confirm whether that is intended.
    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n

    2. **Documents**: 
   - Review the documents thoroughly and focus only on the information presented in them. 
   - Do not introduce any external information not found in the provided documents.
   - If there is insufficient information to provide a direct answer, respond with:
     "Sorry, I couldn't find sufficient information in the provided documents to answer your question. Please ensure that your query is related to the document context, or provide more specific details."

3. **Response Guidelines**:
   - Provide clear, accurate, and concise answers, directly addressing the user’s query with only the information derived from the documents.
   - Maintain a professional tone, using terminology from the documents when appropriate, but ensure that your response is easily understandable to users with basic to intermediate knowledge of the content.
   - If the response requires technical terms, make sure to define them or explain them in simple language.
   - Always use a tone that is respectful, empathetic, and informative. Make sure the user feels heard and supported.
   
4. **When to Ask for Clarification**:
   - If the user's query is ambiguous, overly broad, or could refer to multiple topics, ask the user for further clarification to provide a more accurate response.
   - Examples of follow-up questions: 
     "Could you please clarify what you mean by [specific term]?"
     "Can you provide more details or context regarding [specific topic]?"

    Context:\n {context}?\n
    Question: \n{question}\n  Answer:
    """
    llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.5)

    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # "stuff" places all retrieved documents into the single {context} slot.
    chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

    return chain


In [14]:
def user_input(user_question):
    """Answer a question from the saved FAISS index and show the reply.

    Loads the index saved under ``faiss_index``, retrieves the documents most
    similar to the question, runs them through the QA chain, prints the raw
    response and the reply, and returns the reply text.

    Args:
        user_question: The question to answer.

    Returns:
        str: The chain's ``output_text``. Earlier versions returned ``None``,
        so callers that ignore the result are unaffected.
    """
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # NOTE(review): load_local deserializes a pickled docstore, so only load
    # an index this project created. Newer LangChain releases require an
    # explicit allow_dangerous_deserialization=True opt-in here — confirm
    # against the installed version before adding it.
    new_db = FAISS.load_local("faiss_index", embeddings)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()

    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )

    print(response)
    # The notebook never imports streamlit, so the former st.write(...) call
    # raised NameError after the answer had already been computed. Print the
    # reply and return it instead.
    output_text = response["output_text"]
    print("Reply: ", output_text)
    return output_text


In [18]:
pip install text_processing

Collecting text_processing
  Downloading text_processing-0.0.2.tar.gz (2.9 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: text_processing
  Building wheel for text_processing (setup.py): started
  Building wheel for text_processing (setup.py): finished with status 'done'
  Created wheel for text_processing: filename=text_processing-0.0.2-py3-none-any.whl size=3693 sha256=a89b49e073c14202168399412ae50a1892526b2aa9bcd3c6aa7c94e862342ed1
  Stored in directory: c:\users\chinelo\appdata\local\pip\cache\wheels\e6\b0\52\417909c7fa7a88fa0251391552118fecfb64107c03203c4917
Successfully built text_processing
Installing collected packages: text_processing
Successfully installed text_processing-0.0.2
Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install nbimporter


Collecting nbimporterNote: you may need to restart the kernel to use updated packages.

  Downloading nbimporter-0.3.4-py3-none-any.whl.metadata (252 bytes)
Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
