<a href="https://colab.research.google.com/github/Benyormin/SmartDocs-AI/blob/main/SmartDocs_AI_(Advance_RAG).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing requirements

In [1]:
%%writefile requirements.txt
langchain
python-dotenv
streamlit
unstructured[all-docs]
tiktoken
faiss-cpu
libmagic
python-magic
langchain-google-genai
google-generativeai
pyngrok
langchain_community
langchain-google-genai

Writing requirements.txt


In [1]:
!pip install -r requirements.txt --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4

# Setup API Keys

Replace the placeholder values below with your own API key and ngrok token.

In [6]:
%%writefile .env
GOOGLE_API_KEY= "Your_GOOGLE_API_KEY"
NGROK_TOKEN = "Your_NGROK_TOKEN"

Overwriting .env


# Main code

In [3]:
%%writefile app.py
import os
import streamlit as st
import pickle
import time
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import GoogleGenerativeAI
import pandas as pd

from langchain.document_loaders import TextLoader
from langchain.document_loaders import CSVLoader
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.document_loaders import UnstructuredPDFLoader

import os
import tempfile




@st.cache_resource(show_spinner="Loading embedding model...")
def load_embedding_model():
    """Build the sentence-transformers embedding model used for indexing.

    Wrapped in ``st.cache_resource`` so the returned object is reused across
    Streamlit reruns instead of being constructed on every script execution.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

st.title("SmartDocs AI 🤖")
st.sidebar.title("Sources 🛠️")

# Containers for the documents gathered from uploads and from URLs.
uploaded_docs = []
url_docs = []

# The .env file must be read BEFORE the key is looked up; otherwise
# os.getenv() only sees variables the launching process happened to export
# and returns None on a plain `streamlit run app.py`.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Gemini model used to answer questions over the retrieved chunks.
llm = GoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0,
)

# Splitter used to cut loaded documents into overlapping chunks before
# they are embedded.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=1000,
    chunk_overlap=150,
)

# Cached embedding model (see load_embedding_model).
embeddings = load_embedding_model()


def load_uploaded_files(uploaded_files):
    """Parse uploaded files with the loader that matches their extension.

    Every upload is first copied to a temporary file on disk (the loaders are
    given a path), handed to the loader selected by the lower-cased file
    extension, and the temporary file is deleted again whether or not loading
    succeeded. Uploads whose extension has no loader contribute nothing.

    Args:
        uploaded_files: Iterable of upload objects exposing ``.name`` and
            ``.getbuffer()``.

    Returns:
        A flat list with everything the loaders' ``load()`` calls returned,
        in upload order.
    """
    loader_by_extension = {
        "txt": TextLoader,
        "csv": CSVLoader,
        "xls": UnstructuredExcelLoader,
        "xlsx": UnstructuredExcelLoader,
        "pdf": UnstructuredPDFLoader,
    }

    documents = []
    for upload in uploaded_files:
        extension = upload.name.rsplit(".", 1)[-1].lower()

        # delete=False: the file has to outlive the `with` block so the
        # loader can open it by path; it is removed in `finally` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{extension}") as scratch:
            scratch.write(upload.getbuffer())
            scratch_path = scratch.name

        try:
            loader_cls = loader_by_extension.get(extension)
            if loader_cls is not None:
                documents += loader_cls(scratch_path).load()
        finally:
            os.unlink(scratch_path)

    return documents


# Load variables from a local .env file into the process environment.
load_dotenv()


# ---------------------------------------------------------------------------
# URL sources (sidebar)
# ---------------------------------------------------------------------------

# Session state survives Streamlit reruns, so the URL list and the contents of
# the text box are kept there.
if "urls" not in st.session_state:
    st.session_state.urls = []
if "new_url" not in st.session_state:
    st.session_state.new_url = ""


def _add_url():
    """Append the URL typed in the sidebar to the list and empty the text box."""
    candidate = st.session_state.new_url.strip()
    if candidate:
        st.session_state.urls.append(candidate)
        # Remembered so the confirmation can be rendered during the rerun.
        st.session_state.last_added_url = candidate
    # A keyed widget's value can only be reset before the widget is created
    # in a run; callbacks execute before the rerun, so the reset lives here.
    st.session_state.new_url = ""


def _clear_urls():
    """Forget every URL added so far."""
    st.session_state.urls = []


# The text box is bound to st.session_state.new_url through its key.
st.sidebar.text_input("Enter new URL", key="new_url")
st.sidebar.button("➕ Add URL", on_click=_add_url)

# One-shot confirmation for the URL added by the click that caused this rerun.
if "last_added_url" in st.session_state:
    st.success(f"Added URL: {st.session_state.last_added_url}")
    del st.session_state["last_added_url"]

# Show current list of URLs
if st.session_state.urls:
    st.sidebar.markdown("### ✅ Added URLs")
    for idx, url in enumerate(st.session_state.urls, 1):
        st.sidebar.markdown(f"{idx}. {url}")
else:
    st.sidebar.info("No URLs added yet.")

# Clearing through a callback empties the list before the rerun renders it,
# so the sidebar never shows URLs that were just removed.
st.sidebar.button("❌ Clear URLs", on_click=_clear_urls)

urls = st.session_state.get("urls", [])

# Location of the serialized vector store on disk.
file_path = "faiss_store.pkl"


# ---------------------------------------------------------------------------
# File uploads and index building
# ---------------------------------------------------------------------------

uploaded_files = st.sidebar.file_uploader(
    "Upload one or more files (.txt, .csv, .pdf, .xls/.xlsx)",
    type=["txt", "csv", "pdf", "xls", "xlsx"],
    accept_multiple_files=True
)

process_files_btn = st.sidebar.button("Process Sources")

# Single slot in the main area: shows progress text while sources are being
# processed and is then replaced by the question box.
main_placeholder = st.empty()

if process_files_btn:
    # Each status line is shown BEFORE the step it announces.
    uploaded_docs = []
    if uploaded_files:
        main_placeholder.text("Files Loading...Started...✅✅✅")
        uploaded_docs = load_uploaded_files(uploaded_files)

    url_docs = []
    if urls:
        main_placeholder.text("Processing URLs...✅✅✅")
        loader = UnstructuredURLLoader(urls=urls)
        url_docs = loader.load()

    all_docs = uploaded_docs + url_docs
    main_placeholder.text("Text Splitter...Started...✅✅✅")
    docs = text_splitter.split_documents(all_docs)

    if not docs:
        # Building a FAISS index from zero documents raises, so stop here.
        # st.warning (not the placeholder) is used because the placeholder is
        # overwritten by the question box below.
        st.warning(
            "No content could be loaded. Add at least one URL or upload a "
            "file, then press 'Process Sources' again."
        )
    else:
        # create embeddings and save them to a FAISS index
        main_placeholder.text("Embedding Vector Started Building...✅✅✅")
        vectorstore = FAISS.from_documents(docs, embeddings)

        # Save the FAISS index to a pickle file
        with open(file_path, "wb") as f:
            pickle.dump(vectorstore, f)


# ---------------------------------------------------------------------------
# Question answering
# ---------------------------------------------------------------------------

query = main_placeholder.text_input("Question: ")
if query:
    if not os.path.exists(file_path):
        st.info("No index found yet. Add sources and press 'Process Sources' first.")
    else:
        # SECURITY NOTE(review): pickle.load can execute arbitrary code from
        # the file. This is acceptable only because the file is written by
        # this app; never point file_path at data from an untrusted source.
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)

        # The file is closed before the (potentially slow) LLM call.
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        result = chain({"question": query}, return_only_outputs=True)
        # result is read below through its "answer" and "sources" keys

        st.header("Answer")
        st.write(result["answer"])

        # Display sources, if available
        sources = result.get("sources", "")
        if sources:
            st.subheader("Sources:")
            for source in sources.split("\n"):  # one source per line
                st.write(source)

Writing app.py


## Set up the server

In [4]:
from pyngrok import ngrok
from pyngrok import ngrok, conf
from dotenv import load_dotenv
import os


# Read NGROK_TOKEN (and any other settings) from the .env file.
load_dotenv()

# Start from a clean slate: stop any ngrok process left over from an
# earlier run of this cell.
ngrok.kill()

# Authenticate pyngrok with the token taken from the environment.
pyngrok_config = conf.get_default()
pyngrok_config.auth_token = os.getenv("NGROK_TOKEN")

# Open an HTTP tunnel to port 8501, the port shown in the recorded output
# for the Streamlit app.
public_url = ngrok.connect(8501, "http")
print("Streamlit app link:", public_url)



Streamlit app link: NgrokTunnel: "https://a9bb-34-80-48-84.ngrok-free.app" -> "http://localhost:8501"


## Run Streamlit

In [5]:
!streamlit run app.py &>/content/logs.txt &