# Install & Import Libraries

In [10]:
# Install dependencies with %pip so they land in the environment of the running kernel.
# Upper bounds keep the pre-1.0 LangChain package line: the imports in the next cell use
# `langchain.text_splitter` and `langchain.chains`, which are legacy import paths.
%pip install -U "langchain>=0.3,<1.0" "langchain-community>=0.3,<0.4" "langchain-groq<1.0" "langchain-huggingface<1.0" gradio pypdf unstructured python-docx
%pip install faiss-cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [11]:
import os
import shutil
import warnings
import logging

import gradio as gr
from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

# Hide every Python warning and only let ERROR-or-higher log records through
# from the two chatty libraries used in this notebook.
warnings.filterwarnings("ignore")
for _noisy_logger in ("langchain", "transformers"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)


# API Key

In [12]:
from kaggle_secrets import UserSecretsClient

# Client for the Kaggle secrets store (keeps the key out of the notebook source).
user_secrets = UserSecretsClient()

# Groq API key, looked up under the secret label "GROQ_API_KEY"; passed to ChatGroq below.
GROQ_API_KEY = user_secrets.get_secret("GROQ_API_KEY")


# Initialize LLM and Embeddings

In [13]:
# Location where the FAISS index is saved after building and reloaded when answering.
INDEX_PATH = "faiss_index"
# Embedding model shared by index construction and query-time loading of the index.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Chat model that receives the retrieved context plus the user's question.
# temperature=0.0 is presumably chosen to keep answers as repeatable as possible.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    api_key=GROQ_API_KEY,
    temperature=0.0,
    max_retries=2
)

# Document Loader 

In [14]:
def load_documents(file_path):
    """Load one file with the loader that matches its extension.

    The extension is compared case-insensitively. Supported types are
    ``.pdf``, ``.txt``, ``.docx`` and ``.csv``.

    Args:
        file_path: Path of the file to read.

    Returns:
        Whatever the selected loader's ``load()`` returns for that file.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lower()

    # Each branch returns immediately, so the fall-through is the unsupported case.
    if ext == ".pdf":
        return PyPDFLoader(file_path).load()
    if ext == ".txt":
        return TextLoader(file_path).load()
    if ext == ".docx":
        return UnstructuredWordDocumentLoader(file_path).load()
    if ext == ".csv":
        return CSVLoader(file_path).load()

    raise ValueError(f" The file type {ext} is not currently supported.")


# Build the FAISS Index from Uploaded Files

In [15]:
def build_faiss_from_files(files):
    """Split the uploaded files into chunks, embed them and save a FAISS index.

    Every file is loaded with ``load_documents``, split into 500-character
    chunks with a 50-character overlap, and all chunks are indexed together.
    The index is written to ``INDEX_PATH``.

    Args:
        files: Iterable of file paths. ``None`` or an empty list (nothing
            uploaded yet) is handled and reported as a status message.

    Returns:
        A human-readable status string. Failures are reported in the returned
        string rather than raised, so the UI can display them.
    """
    # The click handler can fire before anything was uploaded; report that
    # clearly instead of surfacing a low-level iteration/indexing error.
    if not files:
        return "No files were uploaded. Please select at least one file first."

    try:
        # The splitter configuration does not depend on the file: build it once.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )

        all_docs = []
        for f in files:
            all_docs.extend(text_splitter.split_documents(load_documents(f)))

        # Nothing to index (e.g. files without extractable text): stop before
        # handing an empty list to FAISS.
        if not all_docs:
            return "No text could be extracted from the uploaded files."

        vector_store = FAISS.from_documents(all_docs, embedding_model)
        vector_store.save_local(INDEX_PATH)
        return "Index has been created and saved successfully!"
    except Exception as e:
        # Deliberate catch-all at the UI boundary: show the failure as status text.
        return f" Error while building the index: {str(e)}"


# Answer Questions from the Index

In [16]:
def get_chat_response(question: str) -> str:
    """Answer a question using the three most similar chunks from the saved index.

    Args:
        question: The user's question text.

    Returns:
        The model's answer with surrounding whitespace removed, or a short
        status message when the question is blank, the index does not exist
        yet, or the search returns nothing.
    """
    # A blank question would still trigger an embedding and an LLM call; skip both.
    if not question or not question.strip():
        return "Please enter a question first."

    if not os.path.exists(INDEX_PATH):
        return " The index has not been created yet. Please upload your files first."

    # NOTE(security): allow_dangerous_deserialization=True lets the loader
    # unpickle data from INDEX_PATH. Only use it on an index this notebook
    # wrote itself; never point INDEX_PATH at files from an untrusted source.
    vector_store = FAISS.load_local(
        INDEX_PATH,
        embedding_model,
        allow_dangerous_deserialization=True
    )
    # Each result is a (document, score) pair; the score is not used below.
    results = vector_store.similarity_search_with_score(question, k=3)

    if not results:
        return "No relationship information found in the files."

    context = "\n\n".join(
        f"[ {doc.metadata.get('source','unknown')}] {doc.page_content}"
        for doc, _score in results
    )

    # Arabic prompt. In English: "You are a smart assistant. Use only the
    # following information to answer the question. If you do not find the
    # answer in the context, say: 'I cannot find an answer in the uploaded
    # files'." followed by the labels Context / Question / Answer.
    prompt = f"""
أنت مساعد ذكي. استخدم فقط المعلومات التالية للإجابة على السؤال.
إذا لم تجد الإجابة في السياق، قل: "لا أجد إجابة في الملفات المرفوعة".

السياق:
{context}

السؤال: {question}

الإجابة:
"""
    response = llm.invoke(prompt)
    return response.content.strip()


# Gradio Interface

In [17]:
# Two-tab UI: the first tab builds the index from uploads, the second asks questions.
with gr.Blocks() as demo:

    
    # Tab 1: choose files and build the FAISS index.
    with gr.Tab(" Upload files"):
        # Restricted to the extensions that load_documents knows how to read.
        file_input = gr.File(
            file_types=[".pdf", ".txt", ".docx", ".csv"],
            type="filepath",             
            label="Select files",
            file_count="multiple"       
        )
        build_button = gr.Button(" Index construction")
        build_output = gr.Textbox(label="Status")
        # The returned status string is shown in the "Status" box.
        build_button.click(
            fn=build_faiss_from_files,
            inputs=[file_input],
            outputs=[build_output]
        )
    # Tab 2: send a question to get_chat_response and show its answer.
    with gr.Tab(" Ask a question"):
        question_input = gr.Textbox(label="Write your question here")
        answer_output = gr.Textbox(label="Answer")
        ask_button = gr.Button("Submit question")
        ask_button.click(
            fn=get_chat_response,
            inputs=[question_input],
            outputs=[answer_output] )


In [22]:
# share=True also exposes the app through a temporary public gradio.live URL
# (see the output below), so anyone holding that link can use it.
demo.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
* Running on public URL: https://3cdaa96d7a796e3df6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


