In [None]:
pip install langchain chromadb huggingface_hub sentence-transformers

In [None]:
pip install -U langchain-community

In [None]:
import os

import os

# Provide the Hugging Face API token without hard-coding it in the notebook:
# keep a token that is already exported in the environment, and prompt for one
# (input hidden) only when HF_TOKEN is unset or empty.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Enter Hugging Face token: ").strip()



In [None]:
# # Data Input construction

# 1. Create a folder named data in the same directory as your Python script.
# 2. Save the text below into a file named legal_faqs.txt inside the data folder.



# What is a contract?
# A contract is a legally binding agreement between two or more parties. It outlines the terms and conditions that the parties have agreed upon. For a contract to be valid, it generally needs to include an offer, acceptance of that offer, and consideration (something of value exchanged between the parties).

# What is intellectual property?
# Intellectual property (IP) refers to creations of the mind, such as inventions; literary and artistic works; designs; and symbols, names and images used in commerce. IP is protected in law by, for example, patents, copyright and trademarks, which enable people to earn recognition or financial benefit from what they invent or create.

# What is negligence?
# In legal terms, negligence is a failure to exercise the care that a reasonably prudent person would exercise in similar circumstances. This can result in harm or loss to another person. To prove negligence, one generally needs to show a duty of care existed, that this duty was breached, that the breach caused an injury, and that there were actual damages.

# What is a tort?
# A tort is a civil wrong that causes someone else to suffer loss or harm resulting in legal liability for the person who commits the tortious act. Common torts include negligence, defamation, and trespass. Unlike criminal law, which deals with wrongs against the state, tort law deals with wrongs against individuals.

# What is the difference between a misdemeanor and a felony?
# In the United States legal system, crimes are often classified as either misdemeanors or felonies. Generally, a felony is a more serious crime that can result in a sentence of more than one year in prison. Misdemeanors are less serious and typically carry penalties such as fines or jail time of less than a year. The specific definitions and classifications can vary by jurisdiction.



In [None]:
import os
from typing import List
from getpass import getpass

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import HuggingFaceEndpoint
from huggingface_hub import login

# 1. Authentication with error handling
def authenticate_hf():
    """Log in to Hugging Face and return the token that was used.

    The token is read from the ``HF_TOKEN`` environment variable; when that is
    unset or empty the user is prompted for it with hidden input. Surrounding
    whitespace (e.g. from a copy/paste) is stripped. After a successful
    ``login`` the token is also exported as ``HUGGINGFACEHUB_API_TOKEN``.

    Returns:
        str: the token passed to ``login``.

    Raises:
        SystemExit: with exit code 1 when no token was supplied or when
            ``login`` (or anything else in the process) raised; the reason is
            printed first.
    """
    try:
        token = (os.environ.get("HF_TOKEN") or getpass("Enter HF token: ")).strip()
        if not token:
            # Reject a blank token here instead of sending it to the Hub.
            raise ValueError("no Hugging Face token provided")
        login(token=token, add_to_git_credential=False)
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = token
        return token
    except Exception as e:
        print(f"Authentication failed: {str(e)}")
        # `exit()` is a helper injected by the `site` module for interactive
        # use; raising SystemExit is the equivalent that is always available.
        raise SystemExit(1) from e

# 2. Document processing pipeline
def initialize_rag_system(
    data_path: str = "./data/legal_faqs.txt",
    embedding_model: str = "all-mpnet-base-v2",
    llm_repo_id: str = "google/gemma-7b-it",
    top_k: int = 3,
):
    """Build the retrieval-augmented QA chain over a plain-text FAQ file.

    Steps: check the data file, authenticate with Hugging Face, load and split
    the text, embed the chunks into a Chroma store, then wire a retriever and a
    Hugging Face endpoint LLM into a retrieval chain.

    Args:
        data_path: Path of the text file to index.
        embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
        llm_repo_id: Hub repository id passed to ``HuggingFaceEndpoint``.
        top_k: Number of chunks the retriever returns per query.

    Returns:
        The chain produced by ``create_retrieval_chain``. ``ask_question``
        invokes it with ``{"input": ...}`` and reads the ``"answer"`` key.

    Raises:
        FileNotFoundError: ``data_path`` does not point to a file.
        ValueError: the file produced no text chunks.
        SystemExit: raised by ``authenticate_hf`` when authentication fails.
    """
    # Check the input first so a missing file is reported clearly and before
    # the user is asked for credentials.
    if not os.path.isfile(data_path):
        raise FileNotFoundError(
            f"FAQ data file not found: {data_path!r}. "
            "Create it as described in the data-input cell."
        )

    authenticate_hf()

    # Document loading (explicit encoding so the result does not depend on the
    # platform's default locale encoding).
    documents = TextLoader(data_path, encoding="utf-8").load()

    # Text splitting
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", "? ", "! "]
    )
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        raise ValueError(f"No text chunks could be produced from {data_path!r}")

    # Vector store
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    db = Chroma.from_documents(chunks, embeddings)
    retriever = db.as_retriever(search_kwargs={"k": top_k})

    # LLM initialization
    llm = HuggingFaceEndpoint(
        repo_id=llm_repo_id,
        temperature=0.1,
        task="text-generation",
        max_new_tokens=300,
    )

    # Prompt template: {context} receives the retrieved chunks, {input} the question.
    prompt = ChatPromptTemplate.from_template("""
    Answer the question based only on the context:
    Context: {context}
    Question: {input}
    Answer:
    """)

    # Create document chain
    document_chain = create_stuff_documents_chain(llm, prompt)

    # Create retrieval chain
    return create_retrieval_chain(retriever, document_chain)

# 3. Query handling
def ask_question(qa_system, query: str):
    """Run ``query`` through ``qa_system`` and print the question and its answer.

    ``qa_system`` is invoked with ``{"input": query}`` and the ``"answer"``
    entry of what it returns is printed. Any exception raised along the way is
    reported as ``Query failed: ...`` instead of propagating. Returns ``None``.
    """
    payload = {"input": query}
    try:
        outcome = qa_system.invoke(payload)
        # The question is echoed before the answer is looked up.
        print(f"\nQuestion: {query}")
        print(f"Answer: {outcome['answer']}")
    except Exception as err:
        print(f"Query failed: {err}")

# 4. Main application flow
if __name__ == "__main__":
    # Interactive console loop: build the RAG chain once, then answer
    # questions until the user types "exit", presses Ctrl-C, or stdin ends.
    print("Enterprise FAQ System - Secure RAG v3.0")
    try:
        qa = initialize_rag_system()
    except Exception as e:
        print(f"Initialization failed: {str(e)}")
        # SystemExit instead of the interactive-only `exit()` helper.
        raise SystemExit(1) from e

    print("System ready. Type questions (exit to quit):")
    while True:
        try:
            user_input = input("\n> ").strip()
            if not user_input:
                # Nothing typed: prompt again rather than querying the model.
                continue
            if user_input.lower() == "exit":
                break
            ask_question(qa, user_input)
        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed (e.g. piped input or a non-interactive run).
            print("\nExiting...")
            break

In [None]:
# BitNet introduces a fundamentally new approach to large language models (LLMs) by using extreme quantization — specifically, ternary weights (-1, 0, 1) encoded in just 1.58 bits per parameter. This innovation brings several notable benefits:

# 1. Dramatic Efficiency Gains
# Lower Memory Footprint: BitNet models require up to 7.2 times less memory than traditional 16-bit (FP16) models, enabling them to run efficiently on devices with limited resources, such as smartphones, laptops, and IoT devices.

# Faster Inference: By replacing most floating-point multiplications with simple additions and subtractions, BitNet achieves up to 4.1 times faster inference and up to 8.9 times higher throughput, especially as model size increases.

# Reduced Energy Consumption: BitNet slashes energy usage by 55–96% compared to full-precision LLMs, making it a sustainable choice for both edge devices and data centers.

In [None]:
# BitNet implementation

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub

def load_documents():
    """Return the documents to index.

    Placeholder implementation: returns a single ``Document`` holding a fixed
    sample string. Replace the body with real loading logic.

    Returns:
        list: a one-element list of ``Document`` objects.
    """
    # `Document` is not imported anywhere in this notebook; import it here so
    # the function does not fail with NameError.
    from langchain_core.documents import Document

    return [Document(page_content="Your document content here")]

# Document processing pipeline: load the documents and split them into
# chunks of at most 1000 characters with 100 characters of overlap.
documents = load_documents()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

# Vector store configuration: embed the chunks and index them in Chroma.
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
db = Chroma.from_documents(chunks, embeddings)

# Model initialization with error handling: try the BitNet repository first;
# if constructing the client raises, report it and fall back to gemma-7b-it
# with the same generation settings.
# NOTE(review): this only catches errors raised while constructing the
# HuggingFaceHub object; whether an unavailable model fails here or only on the
# first query is not visible from this cell — confirm.
try:
    bitnet_llm = HuggingFaceHub(
        repo_id="microsoft/bitnet-b1.58-2B-4T",
        model_kwargs={"temperature": 0.1, "max_length": 300}
    )
except Exception as e:
    print(f"BitNet initialization error: {e}")
    bitnet_llm = HuggingFaceHub(
        repo_id="google/gemma-7b-it",
        model_kwargs={"temperature": 0.1, "max_length": 300}
    )

# QA system configuration: a "stuff" RetrievalQA chain whose retriever is
# configured with k=3 (used by ask_question_bitnet below).
qa_bitnet = RetrievalQA.from_chain_type(
    llm=bitnet_llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3})
)

def ask_question_bitnet(query):
    """Answer ``query`` with the module-level ``qa_bitnet`` chain and print it.

    The chain is called through ``invoke`` (the replacement for the deprecated
    ``run``) with the question under the ``"query"`` key; the answer is read
    from the ``"result"`` key of the returned mapping. Any exception is printed
    as ``Query processing error: ...`` instead of propagating.

    Args:
        query: The question text.

    Returns:
        None
    """
    try:
        response = qa_bitnet.invoke({"query": query})
        print(f"[BitNet] Answer: {response['result']}")
    except Exception as e:
        print(f"Query processing error: {e}")

# Interactive interface
if __name__ == "__main__":
    # Interactive console loop for the BitNet chain; ends on "exit", Ctrl-C,
    # or end of input (same exit behaviour as the first FAQ loop).
    print("Enhanced Legal/IT/HR FAQ System")
    while True:
        try:
            user_input = input("> ").strip()
            if not user_input:
                # Nothing typed: prompt again rather than querying the model.
                continue
            if user_input.lower() == 'exit':
                break
            ask_question_bitnet(user_input)
        except (KeyboardInterrupt, EOFError):
            print("\nExiting...")
            break




In [None]:
# Test the connection to the Hugging Face model

In [None]:
from huggingface_hub import login
from getpass import getpass

# Interactive authentication.
# Use a truthiness check rather than catching KeyError: an HF_TOKEN that is set
# to an empty string raises no KeyError, so the prompt would never appear and
# login() would be called with a blank token.
token = os.environ.get("HF_TOKEN") or getpass("Enter Hugging Face token: ")
os.environ["HF_TOKEN"] = token
login(token=token)

# Updated LLM initialization
llm = HuggingFaceHub(
    repo_id="google/gemma-7b-it",
    task="text-generation",
    model_kwargs={
        "temperature": 0.1,
        "max_new_tokens": 300,
        "return_full_text": False
    },
    huggingfacehub_api_token=os.environ["HF_TOKEN"]  # Explicit token
)


In [None]:
import requests
# Connectivity check: send a minimal request to the hosted inference endpoint
# and print the HTTP status code.
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b-it"
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
# requests has no default timeout; set one so an unresponsive endpoint cannot
# hang the cell indefinitely.
response = requests.post(API_URL, headers=headers, json={"inputs": "Hello"}, timeout=30)
print(response.status_code)  # Should return 200

In [None]:
from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint

# A Hub model id belongs in `repo_id`; `endpoint_url` is for the full URL of a
# deployed inference endpoint.
llm = HuggingFaceEndpoint(
    repo_id="google/gemma-7b-it",  # Model ID only
    max_new_tokens=300,
    temperature=0.1,
    huggingfacehub_api_token=os.environ["HF_TOKEN"]
)

In [None]:
from huggingface_hub import whoami
# Prints whatever whoami() returns for the current credentials.
# NOTE(review): presumably this includes account details — consider clearing
# this cell's output before sharing the notebook.
print(whoami())

In [None]:
import requests
# Connectivity check: send a minimal request to the hosted inference endpoint
# and print the HTTP status code.
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b-it"
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
# requests has no default timeout; set one so an unresponsive endpoint cannot
# hang the cell indefinitely.
response = requests.post(API_URL, headers=headers, json={"inputs": "Hello"}, timeout=30)
print(response.status_code)  # Should return 200

In [None]:
# deployment

In [None]:
# Dockerfile
# save the above code file as rag_faq.py
# Debian "buster" is end-of-life, and its system SQLite is presumably older
# than what chromadb requires (verify against the chromadb docs); use the
# bookworm-based image for the same Python version.
FROM python:3.9-slim-bookworm

WORKDIR /app

# Copy the requirements first so the dependency layer is cached between
# builds when only the application code changes.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["python", "rag_faq.py"]

In [None]:
# You would then build and run the container:
docker build -t rag-faq .
# -it: the app reads questions (and, if needed, the token) from the terminal.
# -e HF_TOKEN: forward the token from the host environment into the container.
docker run -it -e HF_TOKEN -p 5000:5000 rag-faq # -p is only needed if you were to add a web interface


In [None]:
# 2. Cloud-Based Deployment:

# Serverless Functions (e.g., AWS Lambda, Google Cloud Functions, Azure Functions): Package your RAG logic into a serverless function that gets triggered by user queries (e.g., via an API endpoint). This is cost-effective for applications with variable traffic. You would typically need to adapt your code to be event-driven.
# Container Orchestration Services (e.g., Kubernetes on AWS EKS, Google GKE, Azure AKS): For more scalable and complex deployments, you can deploy your Dockerized application to a Kubernetes cluster.
# Platform-as-a-Service (PaaS) (e.g., Heroku, Google App Engine, AWS Elastic Beanstalk): These platforms simplify deployment by managing the underlying infrastructure. You typically just need to provide your code and a configuration file.
# Virtual Machines (VMs) (e.g., AWS EC2, Google Compute Engine, Azure Virtual Machines): You can provision a VM and run your application directly on it. This gives you more control over the environment but requires more management.