In [None]:
import os
from pathlib import Path
from typing import List, Dict, Any
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

class ProcessAndChunkPDFs:
    """Read PDF files, split their text into overlapping chunks, and wrap each
    chunk in a ``Document`` that carries file-level and chunk-level metadata.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Store the chunking parameters and build the text splitter.

        Args:
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Separator list: paragraph break, line break, space, then the empty string.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )

    def read_pdf(self, file_path: str) -> Dict[str, Any]:
        """Extract the text of every page of one PDF plus basic file metadata.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A dict with two keys:
            ``"text"`` - each page's text preceded by a ``--- Page N ---`` marker,
            or ``""`` when no page yielded any non-whitespace text (for example an
            image-only PDF), so callers can detect the failure with a truth test.
            ``"metadata"`` - source path, file name, page count, file size and
            document type.
            If reading raises, the error is printed and
            ``{"text": "", "metadata": {}}`` is returned instead.
        """
        try:
            pdf_reader = PdfReader(file_path)
            page_count = len(pdf_reader.pages)

            parts = []
            has_text = False
            for page_num, page in enumerate(pdf_reader.pages):
                # A page may yield nothing; treat that as an empty page instead
                # of letting one bad page abort the whole file.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    has_text = True
                parts.append(f"\n--- Page {page_num + 1} ---\n")
                parts.append(page_text)

            # Page markers alone are not content: report "no text" explicitly.
            text = "".join(parts) if has_text else ""

            # Extract PDF metadata
            metadata = {
                "source": file_path,
                "file_name": os.path.basename(file_path),
                "page_count": page_count,
                "file_size": os.path.getsize(file_path),
                "document_type": "PDF"
            }

            return {"text": text, "metadata": metadata}

        except Exception as e:
            # Best effort: report the failure and let the caller skip this file.
            print(f"Error reading PDF {file_path}: {str(e)}")
            return {"text": "", "metadata": {}}

    def chunk_with_metadata(self, text: str, base_metadata: Dict[str, Any]) -> "List[Document]":
        """Split ``text`` with the configured splitter and wrap each chunk in a ``Document``.

        Args:
            text: Full text to split.
            base_metadata: Metadata copied into every chunk's metadata.

        Returns:
            One ``Document`` per chunk, in split order. Each metadata dict holds
            ``base_metadata`` plus ``chunk_index`` (0-based), ``chunk_size``
            (character length of the chunk) and ``total_chunks``.
        """
        chunks = self.text_splitter.split_text(text)
        total_chunks = len(chunks)
        documents = []

        for chunk_num, chunk in enumerate(chunks):
            metadata = {
                **base_metadata,
                "chunk_index": chunk_num,
                "chunk_size": len(chunk),
                "total_chunks": total_chunks
            }
            documents.append(Document(page_content=chunk, metadata=metadata))

        return documents

    def process_pdf_directory(self, directory_path: str) -> "List[Document]":
        """Read and chunk every ``*.pdf`` file directly inside ``directory_path``.

        Files are processed in sorted path order so the resulting chunk order is
        the same on every run. Files that yield no text are reported and skipped.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            The chunk ``Document`` objects of all readable PDFs, concatenated.
        """
        all_documents = []
        # glob() order is not guaranteed; sort for reproducible output.
        pdf_files = sorted(Path(directory_path).glob("*.pdf"))

        for pdf_file in pdf_files:
            print(f"Processing: {pdf_file.name}")

            # Read PDF
            pdf_data = self.read_pdf(str(pdf_file))

            if pdf_data["text"]:
                # Chunk with metadata
                documents = self.chunk_with_metadata(
                    pdf_data["text"],
                    pdf_data["metadata"]
                )
                all_documents.extend(documents)
                print(f" Extracted {len(documents)} chunks")
            else:
                print("Failed to extract text")

        return all_documents

In [None]:
# Initialize processor
processor = ProcessAndChunkPDFs(chunk_size=1000, chunk_overlap=200)

# The PDF folder can be overridden with the PDF_DATA_DIR environment variable;
# the default is the location this notebook was originally written against.
directory_path = os.environ.get(
    "PDF_DATA_DIR",
    "C:\\Users\\sachi\\Projects\\PythonProjects\\building-rag-pipelines\\PDF_Data",
)
if os.path.exists(directory_path):
    all_docs = processor.process_pdf_directory(directory_path)
else:
    # Say why nothing was loaded, and still define the names so that
    # `documents` always exists for the cells below.
    print(f"PDF directory not found: {directory_path}")
    all_docs = []
documents = all_docs
print(f"\n\nTotal documents processed: {len(all_docs)}")

In [21]:
import os
import pandas as pd

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from  dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
# Prefer the correctly spelled GROQ_API_KEY. The misspelled GORQ_API_KEY is kept
# as a fallback so an existing .env that uses the old name keeps working.
api_key = os.getenv("GROQ_API_KEY") or os.getenv("GORQ_API_KEY")
# Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
# Chat model used for generation; the key read above is passed explicitly.
llm = ChatGroq(
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    groq_api_key=api_key,
    temperature=0.7)


In [None]:
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import FAISS


# Embed every chunk in `documents` with `embedding_model` and index the vectors in FAISS.
vector_store = FAISS.from_documents(
    documents=documents,
    embedding=embedding_model,
)


In [45]:
# Similarity-search retriever over the FAISS index, configured with k=2.
retriver = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)
# Run a single query and keep only the text of each returned chunk.
retrived_docs = [
    hit.page_content
    for hit in retriver.invoke("What is chain of thought Hijacking?")
]

In [46]:
# Show the retrieved chunk texts, separated by a space followed by a newline.
print(*retrived_docs, sep=" \n")

breath and work on this problem step-by-step...”).
Table 1: Attack success rate (ASR, %) on S1 under different CoT length conditions.
Setting Minimal Natural Extended
ASR (%) 27 51 80
The results reveal a clear pattern: longer reasoning traces substantially increase the likelihood of
harmful outputs, with ASR rising from 27% (Minimal) to 80% (Extended). This provides an initial
behavioral clue that refusals in reasoning models degrade as CoT length grows. In Section 4, we
build on this observation by introducingChain-of-Thought Hijacking, a systematic jailbreak that
exploits this vulnerability.
4 CHAIN-OF-THOUGHTHIJACKING: ATTACKDESIGN ANDEMPIRICAL
RESULTS
4.1 JAILBREAK METHODOLOGY
We define a prompt-based jailbreak,CoT Hijacking. The attack prepends a long, benign reasoning
preface to a harmful instruction, followed by a final-answer cue. This structure systematically re-
duces refusals: the benign CoT dilutes the refusal signal while the cue shifts attention to the answer
region. 
