In [26]:
from IPython.display import display
from IPython.display import Markdown
import textwrap


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [28]:
import google.generativeai as genai

In [30]:
import os
GOOGLE_API_KEY='your api key'
genai.configure(api_key=GOOGLE_API_KEY)

In [31]:
model = genai.GenerativeModel(model_name = "gemini-pro")
model

genai.GenerativeModel(
    model_name='models/gemini-pro',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)

In [32]:
response = model.generate_content("What is an ai agent?")

In [33]:
to_markdown(response.text)

> An AI agent is a computer system that is able to perceive its environment and take actions to achieve its goals. AI agents are often used in robotics, gaming, and other applications where it is necessary for the computer to be able to make decisions and act autonomously.
> 
> There are many different types of AI agents, but they all share some common characteristics. First, AI agents must be able to perceive their environment. This can be done through sensors, cameras, or other input devices. Second, AI agents must be able to reason about their environment and make decisions. This can be done using a variety of techniques, such as machine learning, logic, and probability theory. Finally, AI agents must be able to act on their decisions. This can be done through motors, actuators, or other output devices.
> 
> AI agents are becoming increasingly sophisticated, and they are being used in a wider range of applications. As AI technology continues to develop, AI agents are likely to play an increasingly important role in our lives.

In [36]:
from langchain_google_genai import ChatGoogleGenerativeAI


In [37]:
llm = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key=GOOGLE_API_KEY)

In [38]:
result = llm.invoke("What are the usecases of LLMs?")


In [39]:
to_markdown(result.content)

> **Text Generation:**
> * Content creation (e.g., articles, scripts, poems)
> * Summarization and paraphrasing
> * Chatbots and virtual assistants
> * Language translation
> 
> **Text Analysis:**
> * Sentiment analysis
> * Question answering
> * Text classification
> * Code generation
> 
> **Customer Service and Support:**
> * Automated customer service chatbots
> * Help document generation and search
> * Ticket routing and resolution
> 
> **Business Intelligence:**
> * Market research and analysis
> * Competitive intelligence
> * Financial forecasting
> * Risk assessment
> 
> **Healthcare:**
> * Medical diagnosis and treatment planning
> * Drug discovery and research
> * Patient record analysis
> 
> **Education:**
> * Personalized learning plans
> * Intelligent tutoring systems
> * Essay grading and feedback
> 
> **Entertainment:**
> * Video game storytelling and dialogue
> * Music and poetry generation
> * Virtual reality and augmented reality experiences
> 
> **Research:**
> * Scientific paper generation and analysis
> * Data mining and exploration
> * Hypothesis testing and model building
> 
> **Other:**
> * Code analysis and debugging
> * Legal document review
> * Social media monitoring and analysis
> * Spam and phishing detection

In [41]:
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA



warnings.filterwarnings("ignore")
# restart python kernal if issues with langchain import.

In [42]:
model = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key=GOOGLE_API_KEY,
                             temperature=0.2,convert_system_message_to_human=True)


In [45]:
pdf_loader = PyPDFLoader(r'C:\Users\mukht\Downloads\2410.22153v1.pdf')
pages = pdf_loader.load_and_split()
print(pages[3].page_content)


F1 Score Moderation Aegis Toxicchat RTP_LX XSafety
En Mul En Mul En Mul En Mul En Mul
Aegis-Defensive 66.75 56.40 84.95 78.95 63.83 43.93 86.89 86.59 67.61 73.12
MD-Judge 76.8 67.03 84.62 35.30 81.05 47.40 92.13 43.14 58.62 28.54
LlaMa-Guard-2 75.88 73.54 59.81 55.91 42.14 33.56 40.33 35.80 35.80 32.93
LlaMa-Guard-3 78.21 74.23 68.82 63.66 46.29 42.17 49.4 45.30 42.65 39.38
Table 2: Benchmarking the performance of guardrails on our test suite. Here we report the F1 score for classifying
user prompt safety. En and Mul denotes the performance of English and the non-English, respectively. Number in
bold and underline highlights the best and the second-best performance across different models, respectively.
handling multilingual harmful inputs. Additionally,
although MD-Judge has a better performance on
English across different datasets, its performance
on multilingual inputs is low. Also, for the XSafety
dataset, we observe that the Aegis-Defensive model
performs better on non-English dat

In [46]:
len(pages)

12

In [47]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
context = "\n\n".join(str(p.page_content) for p in pages)
texts = text_splitter.split_text(context)

In [49]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [50]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001",google_api_key=GOOGLE_API_KEY)

In [62]:
vector_index = Chroma.from_texts(texts, embeddings).as_retriever(search_kwargs={"k":5})


In [63]:
qa_chain = RetrievalQA.from_chain_type(
    model,
    retriever=vector_index,
    return_source_documents=True

)

In [66]:
question = "what is guardrail"
result = qa_chain({"query": question})
result["result"]

Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4


'Guardrails are standalone models that can be used to detect and defend against toxic content. They are typically pre-trained on a large dataset of harmful content and can be used to identify and flag potentially harmful content in real-time.'

In [67]:
Markdown(result["result"])

Guardrails are standalone models that can be used to detect and defend against toxic content. They are typically pre-trained on a large dataset of harmful content and can be used to identify and flag potentially harmful content in real-time.

In [10]:
# If needed, install dependencies
# !pip install google-generativeai langchain chromadb tiktoken pypdf docx2txt openpyxl langchain-google-genai ipywidgets

import os
import tempfile
import warnings
import ipywidgets as widgets
from IPython.display import display, clear_output, Markdown, HTML

import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
import docx2txt
import openpyxl

warnings.filterwarnings("ignore")

# -------------------------
# Configuration
# -------------------------

GOOGLE_API_KEY = 'your api key'  # Replace with your actual API key
if GOOGLE_API_KEY == "YOUR_GOOGLE_API_KEY":
    raise ValueError("Please replace 'YOUR_GOOGLE_API_KEY' with your actual Google API key.")

genai.configure(api_key=GOOGLE_API_KEY)
MODEL_NAME = "gemini-pro"
EMBEDDING_MODEL = "models/embedding-001"
CHUNK_SIZE = 10000
CHUNK_OVERLAP = 1000

# -------------------------
# Helper Functions
# -------------------------

def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()
    text = "\n\n".join(p.page_content for p in pages)
    return text, len(pages)

def load_word(file_path):
    text = docx2txt.process(file_path)
    return text, text.count("\n") + 1

def load_excel(file_path):
    wb = openpyxl.load_workbook(file_path, data_only=True)
    all_text = []
    for sheet in wb.sheetnames:
        ws = wb[sheet]
        for row in ws.iter_rows(values_only=True):
            row_text = [str(cell) for cell in row if cell is not None]
            if row_text:
                all_text.append(" ".join(row_text))
    text = "\n".join(all_text)
    return text, text.count("\n") + 1

def process_files(uploaded_files):
    combined_text = ""
    doc_info_list = []
    for uploaded_file in uploaded_files:
        filename = uploaded_file['metadata']['name']
        suffix = filename.split(".")[-1].lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix="."+suffix) as temp_file:
            temp_file.write(uploaded_file['content'])
            temp_file_path = temp_file.name

        try:
            if suffix == "pdf":
                extracted_text, pages_count = load_pdf(temp_file_path)
                doc_info_list.append({"name": filename, "type": "PDF", "length": len(extracted_text)})
            elif suffix in ["doc", "docx"]:
                extracted_text, line_count = load_word(temp_file_path)
                doc_info_list.append({"name": filename, "type": "Word", "length": len(extracted_text)})
            elif suffix in ["xls", "xlsx"]:
                extracted_text, line_count = load_excel(temp_file_path)
                doc_info_list.append({"name": filename, "type": "Excel", "length": len(extracted_text)})
            else:
                extracted_text = ""
                doc_info_list.append({"name": filename, "type": "Unknown", "length": 0})

            combined_text += extracted_text + "\n\n"
        except Exception as e:
            doc_info_list.append({"name": filename, "type": "Error", "length": 0})
            print(f"Error processing {filename}: {str(e)}")
        finally:
            os.remove(temp_file_path)
    return combined_text, doc_info_list

def build_retriever(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    texts = text_splitter.split_text(text)
    embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL, google_api_key=GOOGLE_API_KEY)
    vector_index = Chroma.from_texts(texts, embeddings)
    retriever = vector_index.as_retriever(search_kwargs={"k":5})
    return retriever

def build_qa_chain(retriever):
    model = ChatGoogleGenerativeAI(
        model=MODEL_NAME,
        google_api_key=GOOGLE_API_KEY,
        temperature=0.2,
        convert_system_message_to_human=True
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=model,
        retriever=retriever,
        chain_type="stuff",  # explicitly specify chain type
        return_source_documents=True
    )
    return qa_chain

qa_chain = None
doc_info = []
combined_text = ""
history = []

# -------------------------
# Define Widgets
# -------------------------
file_upload = widgets.FileUpload(
    accept='.pdf,.doc,.docx,.xls,.xlsx',  
    multiple=True,
    description='📄 Upload Files',
    disabled=False,
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='100%')
)

process_button = widgets.Button(
    description='✅ Process Documents',
    button_style='success',
    tooltip='Click to process uploaded documents',
    icon='check'
)

output = widgets.Output()

question_input = widgets.Text(
    value='',
    placeholder='Enter your question here',
    description='❓ Your Question:',
    disabled=False,
    layout=widgets.Layout(width='80%')
)

ask_button = widgets.Button(
    description='💬 Get Answer',
    button_style='info',
    tooltip='Click to get answer to your question',
    icon='search'
)

answer_output = widgets.Output()
show_sources_checkbox = widgets.Checkbox(
    value=False,
    description='📜 Show Source Documents',
    disabled=False
)

history_output = widgets.Output()

def update_history():
    with history_output:
        clear_output()
        if history:
            display(Markdown("### 🗒️ **Question and Answer History:**"))
            for entry in history:
                display(Markdown(f"**Q:** {entry['question']}"))
                display(Markdown(f"**A:** {entry['answer']}"))
        else:
            display(Markdown("**No history yet.**"))

def on_process_button_clicked(b):
    global qa_chain, doc_info, combined_text, history
    with output:
        clear_output()
        if not file_upload.value:
            display(Markdown("**🚫 Please upload at least one document.**"))
            return
        display(Markdown("**⌛ Processing documents... Please wait.**"))
        uploaded_files = []
        for filename, fileinfo in file_upload.value.items():
            uploaded_files.append({
                'metadata': {'name': filename},
                'content': fileinfo['content']
            })
        combined_text, doc_info = process_files(uploaded_files)
        if combined_text.strip():
            display(Markdown("**🔍 Building retriever...**"))
            retriever = build_retriever(combined_text)
            # Test the retriever:
            test_docs = retriever.get_relevant_documents("test")
            if not test_docs:
                display(Markdown("**⚠️ Warning: No documents retrieved for a test query.**"))
            display(Markdown("**🤖 Building QA chain...**"))
            qa_chain = build_qa_chain(retriever)
            display(Markdown("**✅ Document processing complete! You can now ask questions.**"))
            display(Markdown("### 📑 Uploaded Documents:"))
            for info in doc_info:
                display(Markdown(f"- **{info['name']}** (Type: {info['type']}), Extracted length: {info['length']} chars"))
            history = []
            update_history()
        else:
            display(Markdown("**⚠️ No text extracted from the provided documents.**"))

def on_ask_button_clicked(b):
    global qa_chain, history
    with answer_output:
        clear_output()
        if qa_chain is None:
            display(Markdown("**🚫 Please upload and process documents first.**"))
            return
        question = question_input.value.strip()
        if not question:
            display(Markdown("**⚠️ Please enter a valid question.**"))
            return
        display(Markdown("**⌛ Generating answer... Please wait.**"))
        try:
            result = qa_chain({"query": question})
            answer = result["result"]
            source_docs = result.get("source_documents", [])
            history.append({"question": question, "answer": answer})
            update_history()
            display(Markdown(f"**Q:** {question}"))
            display(Markdown(f"**A:** {answer}"))
            if show_sources_checkbox.value and source_docs:
                display(Markdown("**📜 Source Documents:**"))
                for idx, doc in enumerate(source_docs, start=1):
                    snippet = doc.page_content[:200].replace('\n', ' ') + "..."
                    display(Markdown(f"{idx}. {snippet}"))
        except Exception as e:
            display(Markdown(f"**⚠️ Error generating answer:** {str(e)}"))

process_button.on_click(on_process_button_clicked)
ask_button.on_click(on_ask_button_clicked)

upload_box = widgets.VBox([
    widgets.HTML("<h3>📄 Upload Documents</h3>"),
    file_upload,
    process_button,
    output
])

question_box = widgets.HBox([
    question_input,
    ask_button
])

ask_box = widgets.VBox([
    widgets.HTML("<h3>❓ Ask a Question</h3>"),
    question_box,
    show_sources_checkbox,
    answer_output
])

history_section = widgets.VBox([history_output])

app_layout = widgets.VBox([
    widgets.HTML("<h1 style='text-align: center; color: #4B0082;'>📚 RAG-based Document Q&A Interface</h1>"),
    upload_box,
    widgets.HTML("<hr>"),
    ask_box,
    widgets.HTML("<hr>"),
    history_section
])

display(app_layout)

styles = """
<style>
    .widget-label {
        font-weight: bold;
        font-size: 14px;
    }
    .output_wrapper, .output {
        border: 1px solid #ddd;
        padding: 10px;
        border-radius: 4px;
        background-color: #f9f9f9;
    }
    h1, h3 {
        font-family: 'Arial', sans-serif;
    }
    button {
        font-size: 14px;
    }
    .widget-upload {
        border: 2px dashed #4B0082;
        padding: 20px;
        border-radius: 10px;
        background-color: #f0e6ff;
    }
    .widget-text {
        border: 2px solid #4B0082;
        border-radius: 5px;
        padding: 5px;
    }
    .btn-success {
        background-color: #28a745;
        color: white;
    }
    .btn-info {
        background-color: #17a2b8;
        color: white;
    }
    .widget-checkbox {
        font-size: 14px;
    }
</style>
"""
display(HTML(styles))


VBox(children=(HTML(value="<h1 style='text-align: center; color: #4B0082;'>📚 RAG-based Document Q&A Interface<…