In [None]:
!pip install openai langchain langchain-openai langchain-community faiss-cpu requests beautifulsoup4 pandas tqdm --quiet

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import urllib3
from openai import OpenAI, AuthenticationError
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from tqdm import tqdm
import gradio as gr
from kaggle_secrets import UserSecretsClient

# Silence urllib3's InsecureRequestWarning: the PPC page is fetched with
# certificate verification turned off (expired certificate on the source site).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Copy the OpenAI API key from Kaggle Secrets into the process environment;
# OpenAI() below is constructed without an explicit key.
try:
    user_secrets = UserSecretsClient()
    os.environ["OPENAI_API_KEY"] = user_secrets.get_secret("OPENAI_API_KEY")
except Exception as e:
    # Chain the original exception so the underlying cause stays in the traceback.
    raise ValueError(f"Failed to load OPENAI_API_KEY from Kaggle Secrets: {str(e)}. Please ensure the secret is set.") from e

# Verify API key
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is not set. Please configure it in Kaggle Secrets.")

# Initialize OpenAI client and make one inexpensive authenticated request so a
# bad key is reported here rather than in the middle of the pipeline.
client = OpenAI()
try:
    client.models.list()  # Test API key validity
    print("✅ OpenAI API key validated successfully.")
except AuthenticationError as e:
    # openai's AuthenticationError cannot be constructed from a message alone
    # (its constructor also requires `response` and `body`), so building a new
    # one here would fail with TypeError. Print the hint and re-raise the
    # original exception instead.
    print(f"OpenAI authentication failed: {str(e)}. Please check your API key in Kaggle Secrets.")
    raise

PPC_URL = "https://www.pakistani.org/pakistan/legislation/1860/actXLVof1860.html"

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import urllib3

# Page that holds the full text of the Pakistan Penal Code.
PPC_URL = "https://www.pakistani.org/pakistan/legislation/1860/actXLVof1860.html"

# The source site's certificate is expired, so the page is requested without
# verification; mute the warning urllib3 would otherwise emit for that.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_ppc_full_advanced(url=None, timeout=30):
    """Download the Pakistan Penal Code page and split it into section records.

    The page is flattened to plain text, trimmed to the span between the first
    "pakistan penal code" and the last "schedule" (case-insensitive), and then
    walked line by line. A line matching ``Chapter <id>`` sets the current
    chapter; a line starting with ``Section N`` / ``S. N`` / ``N.`` starts a new
    section; every other line is appended to the section being collected.

    Args:
        url: Page to fetch. Defaults to the module-level ``PPC_URL``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        pandas.DataFrame with one row per detected section and the columns
        id, chapter, source, type, language, official_text, summary_text.
        The columns are present even when no section was detected.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    target_url = PPC_URL if url is None else url

    # verify=False is kept on purpose (the site's certificate is expired); the
    # timeout keeps a stalled server from hanging the notebook indefinitely.
    resp = requests.get(target_url, verify=False, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Flatten the whole page to text and collapse runs of blank lines.
    raw_text = soup.get_text(separator="\n")
    raw_text = re.sub(r'\n+', '\n', raw_text).strip()

    # Cut everything before the main title and after the trailing schedule.
    # str.find/rfind return -1 when the marker is absent; used directly as a
    # slice bound that would silently discard (almost) all of the text, so
    # fall back to the start / end of the document instead.
    lowered = raw_text.lower()
    start_idx = lowered.find("pakistan penal code")
    if start_idx == -1:
        start_idx = 0
    end_idx = lowered.rfind("schedule")
    if end_idx <= start_idx:
        end_idx = len(raw_text)
    main_text = raw_text[start_idx:end_idx].strip()

    # Split into non-empty, stripped lines.
    lines = [line.strip() for line in main_text.split("\n") if line.strip()]

    columns = ["id", "chapter", "source", "type", "language", "official_text", "summary_text"]
    data = []
    current_chapter = ""   # most recent chapter heading seen
    section_chapter = ""   # chapter in force when the open section started
    current_section = ""
    current_text = []

    def save_section():
        # Lines collected before the first section header have no section
        # number and are dropped here.
        if current_section and current_text:
            data.append({
                "id": f"PPC_{current_section}",
                "chapter": section_chapter,
                "source": "PPC",
                "type": "article",
                "language": "en",
                "official_text": " ".join(current_text).strip(),
                "summary_text": ""
            })

    for line in lines:
        # Detect Chapter headings
        if re.match(r'Chapter\s+[A-Z0-9]+', line, re.IGNORECASE):
            current_chapter = line
            continue

        # Detect Section numbers in multiple formats
        sec_match = re.match(r'(Section|S\.)\s*(\d+[A-Z]?)', line)
        num_match = re.match(r'^(\d+[A-Z]?)\.', line)  # e.g., "375."
        if sec_match or num_match:
            save_section()
            current_section = sec_match.group(2) if sec_match else num_match.group(1)
            # Pin the chapter now: a section is only saved when the NEXT
            # section header shows up, and by then a new chapter heading may
            # already have replaced current_chapter.
            section_chapter = current_chapter
            current_text = [line]
        else:
            current_text.append(line)

    # Save last section
    save_section()

    return pd.DataFrame(data, columns=columns)

ppc_df = scrape_ppc_full_advanced()

# Fail fast when nothing was parsed: the later cells would otherwise run on an
# empty table and fail further downstream with a much less obvious error.
if ppc_df.empty:
    raise RuntimeError(f"No PPC sections were scraped from {PPC_URL}; check the page layout and the section patterns.")

print(f"Scraped {len(ppc_df)} PPC sections")
ppc_df.head(10)


In [None]:
def summarize_text(text, lang="en"):
    """Summarize one legal text with the chat-completions API.

    Args:
        text: Text to summarize; only its first 2000 characters are sent.
        lang: Language code for the summary. "en" and "ur" are spelled out as
            "English" and "Urdu" in the instruction; any other value is
            inserted capitalized, as given.

    Returns:
        The model's summary with surrounding whitespace removed. If the call
        fails for any reason other than authentication, the error is printed
        and a string starting with "Summary failed due to API error:" is
        returned instead.

    Raises:
        AuthenticationError: re-raised unchanged when the API rejects the key.
    """
    # A bare "En"/"Ur" in the instruction is ambiguous; name the language.
    language_names = {"en": "English", "ur": "Urdu"}
    try:
        language = language_names.get(lang.lower(), lang.capitalize())
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": f"Summarize the following legal text in {language}. Keep it concise (15-80 words)."},
                {"role": "user", "content": text[:2000]}  # Truncate to avoid token limits
            ],
            max_tokens=100,
            temperature=0.3
        )
        return response.choices[0].message.content.strip()
    except AuthenticationError:
        # Re-raise the original exception: openai's AuthenticationError cannot
        # be constructed from a message alone (it also requires `response` and
        # `body`), so wrapping it in a new instance would raise TypeError.
        raise
    except Exception as e:
        print(f"Error summarizing text: {str(e)}")
        return f"Summary failed due to API error: {str(e)}"

# Bilingual summary columns, filled batch by batch below.
ppc_df["summary_en"] = ""
ppc_df["summary_ur"] = ""

batch_size = 8
checkpoint_every = 5  # number of batches between partial CSV saves
total_batches = (len(ppc_df) + batch_size - 1) // batch_size  # ceiling division

for i in tqdm(range(total_batches)):
    start = i * batch_size
    end = min(start + batch_size, len(ppc_df))
    batch_texts = ppc_df.iloc[start:end]["official_text"].tolist()

    try:
        en_summaries = [summarize_text(text, "en") for text in batch_texts]
        ur_summaries = [summarize_text(text, "ur") for text in batch_texts]
    except AuthenticationError:
        # A rejected API key fails for every remaining section too; stop here
        # instead of filling the whole table with "Error" placeholders.
        raise
    except Exception as e:
        print(f"Error in batch {i}: {str(e)}")
        en_summaries = ["Error" for _ in batch_texts]
        ur_summaries = ["Error" for _ in batch_texts]

    # Translate the positional window into row labels so the assignment does
    # not depend on ppc_df having a default 0..n-1 index.
    batch_rows = ppc_df.index[start:end]
    ppc_df.loc[batch_rows, "summary_en"] = en_summaries
    ppc_df.loc[batch_rows, "summary_ur"] = ur_summaries

    # Save partial checkpoint every few batches
    if i % checkpoint_every == 0:
        ppc_df.to_csv("ppc_bilingual_partial.csv", index=False)

# Final save
ppc_df.to_csv("ppc_bilingual.csv", index=False)
print("✅ Saved bilingual PPC dataset as ppc_bilingual.csv")
ppc_df.head()

In [None]:
# Use combined English + Urdu summaries for embeddings; every column of the
# row travels along as the document's metadata.
texts_for_index = (ppc_df["summary_en"] + " " + ppc_df["summary_ur"]).tolist()
metadatas = ppc_df.to_dict(orient="records")

# Initialize the embedding model and build the FAISS vector store.
# Both steps sit in the same try block so an authentication failure from
# either is caught; previously only the constructor was wrapped.
try:
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorstore = FAISS.from_texts(
        texts=texts_for_index,
        embedding=embeddings,
        metadatas=metadatas
    )
except AuthenticationError as e:
    # Re-raise the original exception: openai's AuthenticationError cannot be
    # constructed from a message alone (it also requires `response` and
    # `body`), so wrapping it in a new instance would raise TypeError.
    print(f"OpenAI authentication failed while embedding the PPC summaries: {str(e)}")
    raise

# Save the vector store
vectorstore.save_local("ppc_faiss_index")
print(f"FAISS index built with {vectorstore.index.ntotal} entries.")

In [None]:
# Load the vector store
# SECURITY NOTE: allow_dangerous_deserialization=True opts in to loading
# serialized Python objects from disk. That is acceptable here only because
# "ppc_faiss_index" is written by this notebook's own save_local call; never
# point this at an index file from an untrusted source.
try:
    vectorstore = FAISS.load_local("ppc_faiss_index", embeddings=embeddings, allow_dangerous_deserialization=True)
except Exception as e:
    # Chain the cause so the original traceback is not lost.
    raise Exception(f"Failed to load FAISS index: {str(e)}") from e

# Set up retriever (top 5 matches per query)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Initialize OpenAI LLM
try:
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)
except AuthenticationError as e:
    # Re-raise the original exception: openai's AuthenticationError cannot be
    # constructed from a message alone (it also requires `response` and
    # `body`), so wrapping it in a new instance would raise TypeError.
    print(f"Failed to initialize LLM: {str(e)}")
    raise

# Define prompt template
prompt = PromptTemplate.from_template("""
Answer the question based on the following context from the Pakistan Penal Code:
{context}

Question: {question}
Answer in a clear, concise manner:
""")


def format_context(docs):
    """Render retrieved documents as one context string for the prompt.

    Each document becomes "Section <metadata id>: <page content>"; entries are
    separated by a blank line.
    """
    return "\n\n".join([f"Section {d.metadata['id']}: {d.page_content}" for d in docs])


# Build RAG chain: the incoming question is sent to the retriever (whose
# results are formatted into {context}) and also passed through unchanged as
# {question}.
rag_chain = (
    {"context": retriever | format_context,
     "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Test the RAG chain
query = "What is the punishment for theft in PPC?"
print("Query:", query)
try:
    print("Answer:", rag_chain.invoke(query))
except Exception as e:
    print(f"Error during RAG query: {str(e)}")

In [None]:
def chatbot(query):
    """Answer one user question through the RAG chain.

    Args:
        query: Question text as entered in the UI.

    Returns:
        The chain's answer. A short prompt to enter a question is returned
        when ``query`` is None, empty or only whitespace (the chain is not
        invoked in that case), and "Error: <message>" is returned when the
        chain raises, so the UI always receives a string.
    """
    # Skip retrieval and generation when there is nothing to ask.
    if query is None or not str(query).strip():
        return "Please enter a question about the Pakistan Penal Code."
    try:
        return rag_chain.invoke(query)
    except Exception as e:
        return f"Error: {str(e)}"

# Two-line free-text input for the user's question.
question_box = gr.Textbox(lines=2, placeholder="Ask a question about PPC (in English or Urdu)...")

# Wire the input box to chatbot() and show its reply as plain text.
demo = gr.Interface(
    title="⚖️ PPC RAG Chatbot with OpenAI & LangChain",
    description="Powered by OpenAI for summarization/generation and LangChain for RAG. Supports bilingual queries.",
    fn=chatbot,
    inputs=question_box,
    outputs="text",
)

# NOTE(review): share=True presumably publishes a publicly reachable link;
# confirm that is intended, since every question asked there is answered with
# this notebook's OpenAI key.
demo.launch(share=True)