In [2]:
!pip install langchain langchain-community langchainhub huggingface_hub pandas python-dotenv chromadb

Collecting langchain-community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)
  Downloading types_requests-2.32.0.20241016-py3-none-any.whl.metadata (1.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.p

In [2]:
from huggingface_hub import HfApi
api = HfApi(token=os.getenv("HUGGINGFACE_API_KEY"))
print(api.whoami())  # Should return your user info

{'type': 'user', 'id': '6492d604c416da73cd221350', 'name': 'VaibhavD', 'fullname': 'Vaibhav Desai', 'isPro': False, 'avatarUrl': '/avatars/cb4e2d6b233a3e1a624219288f035fa5.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'hf_token', 'role': 'fineGrained', 'createdAt': '2025-02-15T03:12:10.015Z', 'fineGrained': {'canReadGatedRepos': False, 'global': [], 'scoped': [{'entity': {'_id': '6492d604c416da73cd221350', 'type': 'user', 'name': 'VaibhavD'}, 'permissions': []}]}}}}


In [6]:
!pip install pypdf sentence-transformers

Collecting pypdf
  Downloading pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collec

#**MISTRAL**

In [21]:


# Import required libraries
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import HuggingFaceHub
from langchain.schema import Document  # Import the Document class
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables from .env file
load_dotenv()

# Step 1: Load and preprocess documents
def load_documents(pdf_folder, excel_folder):
    documents = []

    # Load PDFs
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())

    # Load Excel and CSV files
    for excel_file in os.listdir(excel_folder):
        file_path = os.path.join(excel_folder, excel_file)
        if excel_file.endswith(".csv"):
            # Load CSV files
            loader = CSVLoader(file_path)
            documents.extend(loader.load())
        elif excel_file.endswith(".xlsx") or excel_file.endswith(".xls"):
            # Load Excel files using pandas
            df = pd.read_excel(file_path)
            # Convert DataFrame to text
            text = df.to_string(index=False)
            # Create a document object (mimicking LangChain's document format)
            documents.append({"page_content": text, "metadata": {"source": file_path}})

    return documents

# Step 2: Split documents into chunks
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
    return text_splitter.split_documents(documents)

# Step 3: Initialize ChromaDB with LangChain's Chroma integration
def initialize_chroma(documents):
    # Initialize Hugging Face embeddings
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create Chroma vector store
    vector_store = Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db")
    return vector_store

# Step 4: Query similar documents
def query_similar_documents(vector_store, query_text, top_k=5):
    results = vector_store.similarity_search(query_text, k=top_k)
    return results

# Step 5: Remove duplicate documents
def remove_duplicate_documents(documents):
    seen_content = set()
    unique_documents = []
    for doc in documents:
        content = doc.page_content if hasattr(doc, "page_content") else doc["page_content"]
        if content not in seen_content:
            seen_content.add(content)
            unique_documents.append(doc)
    return unique_documents

# Step 6: Combine similar documents
def combine_similar_documents(documents, similarity_threshold=0.8):
    texts = [doc.page_content if hasattr(doc, "page_content") else doc["page_content"] for doc in documents]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    combined_docs = []
    added_indices = set()
    for i in range(len(texts)):
        if i not in added_indices:
            combined_text = texts[i]
            for j in range(i + 1, len(texts)):
                if similarity_matrix[i, j] > similarity_threshold:
                    combined_text += " " + texts[j]
                    added_indices.add(j)
            # Convert the combined text into a Document object
            combined_docs.append(Document(page_content=combined_text))
    return combined_docs

# Step 7: Summarize relevant documents using LangChain (RAG)
def summarize_relevant_documents(vector_store, query_text, top_k=3):
    # Step 1: Retrieve relevant documents
    relevant_docs = query_similar_documents(vector_store, query_text, top_k=top_k)
    print(f"Retrieved {len(relevant_docs)} documents for query: '{query_text}'")

    # Step 2: Remove duplicate documents
    unique_docs = remove_duplicate_documents(relevant_docs)
    print(f"After removing duplicates, {len(unique_docs)} unique documents remain.")

    # Step 3: Combine similar documents
    combined_docs = combine_similar_documents(unique_docs)
    print(f"After combining similar documents, {len(combined_docs)} documents remain.")

    # Step 4: Summarize the combined documents
    summary = summarize_document(combined_docs)
    return summary

# Step 8: Summarize a document using LangChain
def summarize_document(documents):
    # Load Hugging Face API key from .env
    huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")
    if not huggingface_api_key:
        raise ValueError("Hugging Face API key not found in .env file.")

    # Initialize LLM
    llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", huggingfacehub_api_token=huggingface_api_key)
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run(documents)  # Pass the list of documents directly
    return summary

# Example usage
if __name__ == "__main__":
    # Define folder paths
    pdf_folder = "./sample_data/pdfs"
    excel_folder = "./sample_data/excels"

    # Load and preprocess documents
    documents = load_documents(pdf_folder, excel_folder)
    split_docs = split_documents(documents)

    # Initialize ChromaDB
    vector_store = initialize_chroma(split_docs)

    # Query 1: First query
    query1 = "How should be our Financial Planning"
    summary1 = summarize_relevant_documents(vector_store, query1, top_k=3)
    print(f"Summary for query '{query1}':", summary1)



Retrieved 3 documents for query: 'How should be our Financial Planning'
After removing duplicates, 1 unique documents remain.
After combining similar documents, 1 documents remain.




Summary for query 'How should be our Financial Planning': Write a concise summary of the following:


"Write a concise summary of the following:


" Financial planning involves the questions of a firm’s 
long-term growth and profitability and investment 
and financing decisions 
 It focuses on aggressive capital expenditure 
programmes and debt equity mix rather than the 
individual projects and sources of finance. Financial 
planning also involves an interface between the 
corporate policy and financial planning and the trade 
off between financial policy variables. 
 
Financial Planning Cont’ 
 
4"


CONCISE SUMMARY: Financial planning is a strategic process that focuses on a firm's long-term growth and profitability. It involves making investment and financing decisions, particularly concerning capital expenditure programs and debt-equity mix, rather than individual projects or specific financing sources. It also bridges corporate policy and financial planning, balancing various f

# Llama**-3.3**

In [22]:
# Import required libraries
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import HuggingFaceHub
from langchain.schema import Document  # Import the Document class
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables from .env file
load_dotenv()

# Step 1: Load and preprocess documents
def load_documents(pdf_folder, excel_folder):
    documents = []

    # Load PDFs
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())

    # Load Excel and CSV files
    for excel_file in os.listdir(excel_folder):
        file_path = os.path.join(excel_folder, excel_file)
        if excel_file.endswith(".csv"):
            # Load CSV files
            loader = CSVLoader(file_path)
            documents.extend(loader.load())
        elif excel_file.endswith(".xlsx") or excel_file.endswith(".xls"):
            # Load Excel files using pandas
            df = pd.read_excel(file_path)
            # Convert DataFrame to text
            text = df.to_string(index=False)
            # Create a document object (mimicking LangChain's document format)
            documents.append({"page_content": text, "metadata": {"source": file_path}})

    return documents

# Step 2: Split documents into chunks
def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
    return text_splitter.split_documents(documents)

# Step 3: Initialize ChromaDB with LangChain's Chroma integration
def initialize_chroma(documents):
    # Initialize Hugging Face embeddings
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create Chroma vector store
    vector_store = Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db")
    return vector_store

# Step 4: Query similar documents
def query_similar_documents(vector_store, query_text, top_k=5):
    results = vector_store.similarity_search(query_text, k=top_k)
    return results

# Step 5: Remove duplicate documents
def remove_duplicate_documents(documents):
    seen_content = set()
    unique_documents = []
    for doc in documents:
        content = doc.page_content if hasattr(doc, "page_content") else doc["page_content"]
        if content not in seen_content:
            seen_content.add(content)
            unique_documents.append(doc)
    return unique_documents

# Step 6: Combine similar documents
def combine_similar_documents(documents, similarity_threshold=0.8):
    texts = [doc.page_content if hasattr(doc, "page_content") else doc["page_content"] for doc in documents]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    combined_docs = []
    added_indices = set()
    for i in range(len(texts)):
        if i not in added_indices:
            combined_text = texts[i]
            for j in range(i + 1, len(texts)):
                if similarity_matrix[i, j] > similarity_threshold:
                    combined_text += " " + texts[j]
                    added_indices.add(j)
            # Convert the combined text into a Document object
            combined_docs.append(Document(page_content=combined_text))
    return combined_docs

# Step 7: Summarize relevant documents using LangChain (RAG)
def summarize_relevant_documents(vector_store, query_text, top_k=3):
    # Step 1: Retrieve relevant documents
    relevant_docs = query_similar_documents(vector_store, query_text, top_k=top_k)
    print(f"Retrieved {len(relevant_docs)} documents for query: '{query_text}'")

    # Step 2: Remove duplicate documents
    unique_docs = remove_duplicate_documents(relevant_docs)
    print(f"After removing duplicates, {len(unique_docs)} unique documents remain.")

    # Step 3: Combine similar documents
    combined_docs = combine_similar_documents(unique_docs)
    print(f"After combining similar documents, {len(combined_docs)} documents remain.")

    # Step 4: Summarize the combined documents
    summary = summarize_document(combined_docs)
    return summary

# Step 8: Summarize a document using LangChain
def summarize_document(documents):
    # Load Hugging Face API key from .env
    huggingface_api_key = os.getenv("HUGGINGFACE_API_KEY")
    if not huggingface_api_key:
        raise ValueError("Hugging Face API key not found in .env file.")

    # Initialize LLM
    llm = HuggingFaceHub(repo_id="meta-llama/Llama-3.3-70B-Instruct", huggingfacehub_api_token=huggingface_api_key)
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run(documents)  # Pass the list of documents directly
    return summary

# Example usage
if __name__ == "__main__":
    # Define folder paths
    pdf_folder = "./sample_data/pdfs"
    excel_folder = "./sample_data/excels"

    # Load and preprocess documents
    documents = load_documents(pdf_folder, excel_folder)
    split_docs = split_documents(documents)

    # Initialize ChromaDB
    vector_store = initialize_chroma(split_docs)

    # Query 1: First query
    query1 = "stocks"
    summary1 = summarize_relevant_documents(vector_store, query1, top_k=3)
    print(f"Summary for query '{query1}':", summary1)



Retrieved 3 documents for query: 'stocks'
After removing duplicates, 1 unique documents remain.
After combining similar documents, 1 documents remain.




Summary for query 'stocks': Write a concise summary of the following:


"Write a concise summary of the following:


"entities, which is achieved by the pooling of a number of small investments into a large bucket. 
Stock Market is the most suitable investment for the common man as it offers an opportunity to 
invest in a diversified, professionally managed portfolio at a relatively low cost. The review of 
literature has brought to light that  
 Enlistment of corporate securities in more than one stock exchange at the same time 
improves liquidity of securities and functioning of stock exchange- According to Gupta. 
 There is existence of wild speculation in the Indian stock market -According to L.C. 
Gupta."


CONCISE SUMMARY: The stock market is an ideal investment for the average person due to its
diversification and low cost. It's enhanced by simultaneous listing of securities on multiple exchanges,
which boosts liquidity and exchange efficiency, as per Gupta. However, the India