In [None]:
# Mount Google Drive; later cells read and write under /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
from google.colab import files

# Upload the FCA COBS 4 PDF; the result of files.upload() is kept in `uploaded`.
uploaded = files.upload()

In [None]:
import os
import shutil

# Replace the name below with the exact uploaded filename
local_path = "/content/COBS 4 Communicating with clients, including financial promotions.pdf"
drive_dir = "/content/drive/MyDrive/FCA_Project"
drive_path = "/content/drive/MyDrive/FCA_Project/COBS_4.pdf"   # simpler name

# Create folder if it doesn't exist
os.makedirs(drive_dir, exist_ok=True)

# Move the upload into Drive. After a successful first run the source file no
# longer exists, so a re-run must not call shutil.move again (it would raise).
if os.path.exists(local_path):
    shutil.move(local_path, drive_path)
    print("Saved to Drive at:", drive_path)
elif os.path.exists(drive_path):
    print("Already saved to Drive at:", drive_path)
else:
    raise FileNotFoundError(
        f"Upload not found at {local_path!r}; set local_path to the exact uploaded filename."
    )

In [None]:

from langchain_community.document_loaders import PyPDFLoader

# Path of the COBS 4 PDF on Drive (same location the move cell writes to)
pdf_path = "/content/drive/MyDrive/FCA_Project/COBS_4.pdf"

# Build a loader for that PDF
loader = PyPDFLoader(pdf_path)

# load() returns a sequence kept in `pages` (presumably one entry per PDF page)
pages = loader.load()
print(f"Total pages loaded: {len(pages)}")

# Sanity check: preview the first 500 characters of the first entry
print(pages[0].page_content[:500])

In [None]:
# Section-aware safe chunking + Embeddings + FAISS
# Reads `pages` (produced by the PDF-loading cell) and writes a FAISS index to Drive.
import re
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# all-MiniLM-L6-v2 truncates input beyond 256 word pieces when embedding, so any
# chunk longer than that would silently lose its tail. (The tokenizer itself
# accepts 512, which is why 512 is not the right threshold here.)
MAX_EMBED_TOKENS = 256

# --- Reload tokenizer for MiniLM ---
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)

# Merge pages into one big text
full_text = "\n".join(p.page_content for p in pages)

# Regex split into sections by headers.
# The trailing lazy `.*?` matches nothing, so each captured heading is just the
# section number (e.g. "COBS 4.2"). Text before the first heading (sections[0])
# is not indexed.
sections = re.split(r"(COBS\s+4\.\d+[A-Z]?\s+.*?)", full_text)

# With one capturing group, re.split returns [preamble, head1, body1, head2, body2, ...],
# so headings sit at odd indexes and every heading has a body after it.
docs = []
for raw_heading, raw_body in zip(sections[1::2], sections[2::2]):
    heading = raw_heading.strip()
    text = heading + "\n" + raw_body.strip()
    docs.append(Document(page_content=text, metadata={"section": heading}))

print(f"Initial sections: {len(docs)}")

if not docs:
    # Fail here with a clear message instead of an obscure error inside FAISS.
    raise ValueError("No 'COBS 4.x' section headings found in the PDF text; nothing to index.")

# Sub-chunker for large sections
sub_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,   # measured in characters (length_function=len), far below the token limit
    chunk_overlap=50,
    length_function=len
)

final_docs = []
for doc in docs:
    # encode() adds the special tokens, which also count toward the model limit
    tokens = len(tokenizer.encode(doc.page_content))
    if tokens > MAX_EMBED_TOKENS:
        # Split large sections into smaller chunks, each tagged with its section
        final_docs.extend(
            Document(page_content=chunk, metadata={"section": doc.metadata["section"]})
            for chunk in sub_splitter.split_text(doc.page_content)
        )
    else:
        final_docs.append(doc)

print(f"Final chunks created: {len(final_docs)}")

# Create embeddings + FAISS index
embedding_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)

faiss_index = FAISS.from_documents(final_docs, embedding_model)

# Save FAISS index to Drive
save_path = "/content/drive/MyDrive/FCA_Project/faiss_index"
faiss_index.save_local(save_path)

print(f"✅ FAISS index created and saved at {save_path}")

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Recreate the MiniLM embedder and read the saved index back from Drive.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized data from disk; only point this at index folders you created.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
faiss_index = FAISS.load_local(
    "/content/drive/MyDrive/FCA_Project/faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)

# Spot-check retrieval: top 3 hits for a sample query
docs = faiss_index.similarity_search("misleading promotions", k=3)

for i, hit in enumerate(docs, 1):
    print(f"--- Result {i} ---")
    print("Section:", hit.metadata["section"])
    print("Preview:", hit.page_content[:300], "...\n")

Step 2

In [None]:
# Mount Google Drive for step 2 (the saved FAISS index lives under MyDrive).
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q huggingface_hub transformers accelerate langchain langchain-community faiss-cpu


In [None]:

import os
from getpass import getpass

from huggingface_hub import login

# Never paste the token into the notebook: read it from the HF_TOKEN environment
# variable, or prompt for it without echoing so it is not saved in the cell.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# The index must be queried with the same MiniLM embeddings it was built with.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized data from disk; only use it on index folders you created.
faiss_index = FAISS.load_local(
    "/content/drive/MyDrive/FCA_Project/faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# token=True reuses the credential stored by huggingface_hub.login(); it replaces
# the deprecated `use_auth_token` argument. Passed to both loaders because the
# model repository is gated.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Text-generation pipeline that the LangChain wrapper consumes in the next cell.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,   # temperature / top_p only take effect when sampling is on
    temperature=0.2,
    top_p=0.95
)


In [None]:
# HuggingFacePipeline is imported from langchain_community, like the other
# integrations in this notebook; the `langchain.llms` path is deprecated.
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# return_full_text=False makes the pipeline return only the generated
# continuation, not the prompt followed by the continuation.
llm = HuggingFacePipeline(pipeline=pipe, pipeline_kwargs={"return_full_text": False})

# MMR retrieval: fetch a wider candidate pool, then keep a diverse top-k.
retriever = faiss_index.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 3,       # number of final results
        "fetch_k": 15 # wider pool before filtering for diversity
    }
)



Prompt

In [None]:
# Prompt template for the compliance check. {context} and {question} are the two
# template variables declared on the PromptTemplate built from this string.
# The doubled braces around the JSON example are presumably there so they render
# as literal braces rather than being read as template variables.
# The <JSON>...</JSON> wrapper is what the parsing cells search for.
prompt_text = """
You are an FCA compliance assistant.
You will receive FCA Handbook extracts (COBS 4 only) as CONTEXT and an EMAIL.
Your job is to decide if the EMAIL complies with the CONTEXT.

If you are not sure from the CONTEXT, your decision must be "Insufficient context".

Allowed decisions:
- "Compliant"
- "Not Compliant"
- "Insufficient context"

Rules:
- Use ONLY the CONTEXT; do not rely on outside knowledge.
- Cite the specific COBS 4 sections you used (e.g., "COBS 4.2").
- Rewrite the EMAIL only if your decision is "Not Compliant".
- If decision is "Compliant" or "Insufficient context", the "email" field must be "".
- Keep answers short, professional, and JSON only (no explanations outside JSON).

⚠️ Rewriting rules:
- Preserve ALL factual details from the EMAIL (numbers, percentages, dates, names, descriptors like "low-risk").
- Do NOT invent, add, or paraphrase disclaimers, warnings, or risk statements unless the exact wording appears verbatim in CONTEXT.
- If CONTEXT provides mandatory disclaimer wording verbatim, insert it exactly as written (no changes).
- You must NEVER change descriptors (e.g., do not replace "low-risk" with "high-risk").
- You must NEVER keep absolute guarantee terms like "guaranteed", "guarantee", "no risk", "risk-free", or "assured". Replace them with neutral alternatives such as "offers", "may", or "potential". If no compliant rewrite is possible, remove the offending part entirely.
- Only remove or rephrase wording that is misleading or prohibited by the cited COBS 4 section.
- Keep the rewrite as short and as close as possible to the original EMAIL.

⚠️ Strict formatting rule:
Return ONLY ONE JSON object.
Do not provide multiple alternatives.
Do not repeat decisions.

Return your answer strictly in this format:

<JSON>
{{
  "decision": "Compliant" | "Not Compliant" | "Insufficient context",
  "sections": ["COBS 4.x", ...],
  "email": "Rewritten email if decision is 'Not Compliant', otherwise empty string"
}}
</JSON>

CONTEXT:
{context}

EMAIL:
{question}

⚠️ Output ONLY one JSON object. Do not add explanations, labels, or extra text.
Begin directly with <JSON> and end with </JSON>.
"""

# PromptTemplate was used without ever being imported, which raises NameError
# on a fresh kernel; import it here next to its only use.
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    template=prompt_text,
    input_variables=["context", "question"]
)

# "stuff" chain: the retrieved documents are placed into {context} and the
# incoming query into {question} of the prompt above.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

# Optional sanity check:
print("Prompt input vars:", prompt.input_variables)  # should be ['context', 'question']


In [None]:
import json
import re

# --- Test query (non-compliant email) ---
email_text = "Our product gaurantees 20% returns with zero risk. Sign up today! Best regards, [Your Name]"
result = qa.invoke({"query": email_text})

# --- Post-process model output into JSON ---
# Keep the generated text; the cells that follow print and parse it.
raw_output = result["result"]



In [None]:
 # Show the unparsed model reply held in `raw_output` before JSON extraction.
 # NOTE(review): this line carries a stray leading space; it presumably runs only
 # because IPython dedents cell input — consider removing the indent.
 print(raw_output)

In [None]:
# Collect every <JSON>...</JSON> payload in the reply; DOTALL lets a payload span lines.
matches = re.findall(r"<JSON>(.*?)</JSON>", raw_output, re.DOTALL)

if matches:
    # Only the first block is considered, even if the model emitted several.
    clean_json = matches[0].strip()
    try:
        out = json.loads(clean_json)
    except json.JSONDecodeError as err:
        print("⚠️ JSON parse error:", err)
        print("Candidate:\n", clean_json)
    else:
        print("✅ Parsed JSON:\n", out)
else:
    print("⚠️ No JSON found. Raw output:\n", raw_output)

In [None]:
def check_email(email_text):
    """Run the compliance chain on one email and return the parsed JSON verdict.

    Args:
        email_text: Email body; passed to the ``qa`` chain as its ``query``.

    Returns:
        The decoded content of the first ``<JSON>...</JSON>`` block in the
        model reply. If the reply has no such block, or the block is not valid
        JSON, ``{"decision": "ParseError", "sections": [], "email": ""}`` is
        returned instead of raising, so callers handle one failure shape.
    """
    res = qa.invoke({"query": email_text})
    # DOTALL lets the payload span several lines.
    matches = re.findall(r"<JSON>(.*?)</JSON>", res["result"], re.DOTALL)
    # Same keys as a normal verdict so callers can read them unconditionally.
    parse_error = {"decision": "ParseError", "sections": [], "email": ""}
    if not matches:
        return parse_error
    try:
        # Only the first block is used, even if the model emitted several.
        return json.loads(matches[0].strip())
    except json.JSONDecodeError:
        return parse_error