# CSR RAG Project - Step 2: Chunking and Vector Database (FAISS Version)
# This notebook splits the extracted CSR text into overlapping chunks, embeds them with a multilingual sentence-transformers model, and stores the vectors in a FAISS index


## Setup and Imports


In [1]:
import json
import os
import pickle
from datetime import datetime, timezone

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from pydantic.v1.fields import FieldInfo as FieldInfoV1



## Configuration



In [2]:
# %%
# --- Input ---------------------------------------------------------------
# JSON file written by the previous (extraction) step of the project.
INPUT_FILE = "extracted_csr_data.json"

# --- Chunking ------------------------------------------------------------
# Both values are character counts, not token counts: the splitter cell
# measures text with length_function=len.
CHUNK_SIZE, CHUNK_OVERLAP = 1000, 200

# --- Embeddings ----------------------------------------------------------
# Multilingual sentence-transformers model used to embed chunks and queries.
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# --- Vector store --------------------------------------------------------
# Directory the FAISS index is saved to and later reloaded from.
VECTOR_DB_PATH = "faiss_index"


## Load Extracted Data




In [3]:
# %%
# Read the JSON produced by the extraction step fully into memory.
print("Loading extracted CSR data...")
with open(INPUT_FILE, mode="r", encoding="utf-8") as source:
    extracted_data = json.loads(source.read())

document_count = len(extracted_data)
print(f"‚úÖ Loaded {document_count} documents")

Loading extracted CSR data...
‚úÖ Loaded 25 documents



## Initialize Text Splitter


In [4]:

# Splitter shared by the chunking cell. Separators are listed from coarsest
# (blank line) to finest (empty string); length_function=len means the size
# and overlap limits are measured in characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

print("‚úÖ Text splitter initialized")
print(
    f"   Chunk size: {CHUNK_SIZE} characters",
    f"   Overlap: {CHUNK_OVERLAP} characters",
    sep="\n",
)

‚úÖ Text splitter initialized
   Chunk size: 1000 characters
   Overlap: 200 characters



## Create Chunks with Metadata

In [5]:
def create_chunks(extracted_data):
    """
    Split every extracted report into chunks wrapped as LangChain Documents.

    Each entry of ``extracted_data`` is read through the keys "company",
    "year", "text", "source_file" and "page_count". The text is split with
    the module-level ``text_splitter`` and a short progress report is printed
    for every entry.

    Returns:
        (all_chunks, stats): the list of Document objects in input order, and
        a dict with "total_documents", "total_chunks" and "chunks_by_company"
        (company name -> number of chunks).
    """
    document_total = len(extracted_data)
    all_chunks = []
    per_company = {}

    rule = "=" * 60
    print("\n" + rule)
    print("CHUNKING DOCUMENTS")
    print(rule)

    for record in extracted_data:
        company, year, text = record["company"], record["year"], record["text"]

        pieces = text_splitter.split_text(text)
        piece_total = len(pieces)

        print(f"\nüìÑ {company} {year}:")
        print(f"   Original length: {len(text):,} characters")
        print(f"   Created: {piece_total} chunks")

        # One Document per piece; the metadata carries enough context to
        # trace a retrieved chunk back to its report and position.
        all_chunks.extend(
            Document(
                page_content=piece,
                metadata={
                    "company": company,
                    "year": year,
                    "source_file": record["source_file"],
                    "chunk_index": position,
                    "total_chunks": piece_total,
                    "page_count": record["page_count"],
                    "chunk_id": f"{company}_{year}_chunk_{position}",
                },
            )
            for position, piece in enumerate(pieces)
        )

        # Several reports can belong to the same company, so accumulate.
        per_company[company] = per_company.get(company, 0) + piece_total

    stats = {
        "total_documents": document_total,
        "total_chunks": len(all_chunks),
        "chunks_by_company": per_company,
    }
    return all_chunks, stats


chunks, chunking_stats = create_chunks(extracted_data)


CHUNKING DOCUMENTS

üìÑ Danone 2019:
   Original length: 75,916 characters
   Created: 101 chunks

üìÑ Danone 2020:
   Original length: 75,916 characters
   Created: 101 chunks

üìÑ Danone 2021:
   Original length: 75,717 characters
   Created: 104 chunks

üìÑ Danone 2022:
   Original length: 75,717 characters
   Created: 104 chunks

üìÑ Danone 2024:
   Original length: 135,522 characters
   Created: 176 chunks

üìÑ Indofood 2020:
   Original length: 107,239 characters
   Created: 137 chunks

üìÑ Indofood 2021:
   Original length: 215,448 characters
   Created: 294 chunks

üìÑ Indofood 2022:
   Original length: 272,688 characters
   Created: 358 chunks

üìÑ Indofood 2023:
   Original length: 334,431 characters
   Created: 436 chunks

üìÑ Indofood 2024:
   Original length: 363,712 characters
   Created: 480 chunks

üìÑ Mayora 2019:
   Original length: 13,627 characters
   Created: 18 chunks

üìÑ Mayora 2020:
   Original length: 82,823 characters
   Created: 115 chunks

üìÑ


## Display Chunking Statistics

In [6]:
# Summarise the chunking run: overall totals first, then a per-company breakdown.
print("\n" + "=" * 60)
print("CHUNKING STATISTICS")
print("=" * 60)

total_documents = chunking_stats['total_documents']
total_chunks = chunking_stats['total_chunks']
# Guard the average: with an empty input file total_documents is 0 and the
# plain division would raise ZeroDivisionError.
average_chunks = total_chunks / total_documents if total_documents else 0.0

print(f"Total documents: {total_documents}")
print(f"Total chunks created: {total_chunks}")
print(f"Average chunks per document: {average_chunks:.1f}")

# Sorted by company name so the listing is stable between runs.
print("\nüìä Chunks by Company:")
for company, count in sorted(chunking_stats['chunks_by_company'].items()):
    print(f"   {company}: {count} chunks")


CHUNKING STATISTICS
Total documents: 25
Total chunks created: 5155
Average chunks per document: 206.2

üìä Chunks by Company:
   Danone: 586 chunks
   Indofood: 1705 chunks
   Mayora: 525 chunks
   Ultra_jaya: 103 chunks
   Unilever: 2236 chunks



## Preview Sample Chunks

In [7]:
# Show the first three chunks as a quick sanity check of content and metadata.
print("\n" + "=" * 60, "SAMPLE CHUNKS", "=" * 60, sep="\n")

for position, sample in enumerate(chunks[:3], start=1):
    info = sample.metadata
    body = sample.page_content

    print(f"\n--- Chunk {position} ---")
    print(f"Company: {info['company']}")
    print(f"Year: {info['year']}")
    print(f"Chunk ID: {info['chunk_id']}")
    print(f"Length: {len(body)} characters")
    print("\nContent preview (first 200 chars):")
    print(body[:200] + "...")




SAMPLE CHUNKS

--- Chunk 1 ---
Company: Danone
Year: 2019
Chunk ID: Danone_2019_chunk_0
Length: 904 characters

Content preview (first 200 chars):
--- Page 1 --- Melestarikan Kebaikan Lingkungan Laporan Keberlanjutan 2020 PT Tirta Investama (Danone-AQUA) 28 --- Page 2 --- Komitmen Danone-AQUA terhadap pelestarian lingkungan tercermin dalam setia...

--- Chunk 2 ---
Company: Danone
Year: 2019
Chunk ID: Danone_2019_chunk_1
Length: 790 characters

Content preview (first 200 chars):
. Beranjak dari pemahaman ini, maka kami menaruh perhatian besar terhadap pemantauan kinerja serta upaya Perusahaan dalam memitigasi dan mengatasi dampak lingkungan yang disebabkan oleh operasi kami. ...

--- Chunk 3 ---
Company: Danone
Year: 2019
Chunk ID: Danone_2019_chunk_2
Length: 875 characters

Content preview (first 200 chars):
. Merupakan upaya Danone-AQUA dalam menciptakan siklus hidup baru untuk seluruh kemasan plastik yang ada di pasaran dengan mengoptimalkan pengumpulan sampah secara bertanggung ja


## Initialize Embedding Model


In [8]:
print("\n" + "=" * 60, "LOADING EMBEDDING MODEL", "=" * 60, sep="\n")
print("This may take a few minutes on first run...")
print("The model will be downloaded and cached locally.")

# Embeddings run on the CPU and are normalized at encode time.
# NOTE(review): the recorded output shows a warning pointing at this call;
# its text is truncated there -- confirm what it reports before upgrading.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

print("‚úÖ Embedding model loaded!")
print(f"   Model: {EMBEDDING_MODEL}")

# Embed a throwaway query to confirm the model works and report its dimension.
test_embedding = embeddings.embed_query("test")
print(f"   Embedding dimension: {len(test_embedding)}")



LOADING EMBEDDING MODEL
This may take a few minutes on first run...
The model will be downloaded and cached locally.


  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Embedding model loaded!
   Model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
   Embedding dimension: 768



## Create FAISS Vector Database

In [9]:
print("\n" + "=" * 60, "CREATING FAISS VECTOR DATABASE", "=" * 60, sep="\n")
print("This will take several minutes...")
print(f"Creating embeddings for {len(chunks)} chunks...\n")

# Embed every chunk and build the FAISS vector store in a single call.
vectorstore = FAISS.from_documents(embedding=embeddings, documents=chunks)

print("\n‚úÖ Vector database created successfully!")
print(f"   Total vectors: {len(chunks)}")

# Persist the store so later notebooks can reload it without re-embedding.
vectorstore.save_local(VECTOR_DB_PATH)
print(f"   Saved to: {VECTOR_DB_PATH}")


CREATING FAISS VECTOR DATABASE
This will take several minutes...
Creating embeddings for 5155 chunks...


‚úÖ Vector database created successfully!
   Total vectors: 5155
   Saved to: faiss_index



## Test Retrieval


In [10]:
print("\n" + "=" * 60, "TESTING RETRIEVAL", "=" * 60, sep="\n")

# Smoke-test queries: English, Indonesian, and a bare keyword phrase.
test_queries = [
    "What is Unilever's water conservation program?",
    "Apa program CSR Indofood?",
    "energy efficiency initiatives"
]

for query in test_queries:
    print(f"\nüîç Query: '{query}'")
    print("-" * 60)

    results = vectorstore.similarity_search(query, k=3)

    for rank, hit in enumerate(results, start=1):
        meta = hit.metadata
        print(f"\nResult {rank}:")
        print(f"  Company: {meta['company']}")
        print(f"  Year: {meta['year']}")
        # chunk_index is zero-based; show it one-based next to the total.
        print(f"  Chunk: {meta['chunk_index'] + 1}/{meta['total_chunks']}")
        print(f"  Preview: {hit.page_content[:150]}...")


TESTING RETRIEVAL

üîç Query: 'What is Unilever's water conservation program?'
------------------------------------------------------------

Result 1:
  Company: Unilever
  Year: 2024
  Chunk: 86/507
  Preview: . Sesuai komitmen keberlanjutan dalam GAP, Unilever secara global menargetkan penerapan program pengelolaan air (water stewardship) di 100 area rawan ...

Result 2:
  Company: Unilever
  Year: 2022
  Chunk: 168/472
  Preview: . Globally, Unilever is part of Water Resources Group (WRG) 2030, working towards promoting water management resilience and transformative change in c...

Result 3:
  Company: Unilever
  Year: 2020
  Chunk: 301/331
  Preview: . Unilever mengambil langkah nyata dalam menjaga ketersedian air melalui efisiensi penggunaan air dan pengurangan pencemaran terhadap air. Dalam efisi...

üîç Query: 'Apa program CSR Indofood?'
------------------------------------------------------------

Result 1:
  Company: Indofood
  Year: 2020
  Chunk: 135/137
  Preview: . Selur


## Verify Database

In [11]:
print("\n" + "=" * 60)
print("DATABASE VERIFICATION")
print("=" * 60)

# Reload the index that was just written to confirm it round-trips from disk.
# SECURITY: allow_dangerous_deserialization=True lets FAISS.load_local
# unpickle the stored data. That is acceptable here only because this notebook
# wrote the files itself; never point VECTOR_DB_PATH at an untrusted index.
test_vectorstore = FAISS.load_local(
    VECTOR_DB_PATH,
    embeddings,
    allow_dangerous_deserialization=True
)

# A load that does not raise is not proof of integrity: make sure the
# reloaded index holds exactly one vector per chunk that was embedded.
loaded_vectors = test_vectorstore.index.ntotal
if loaded_vectors != len(chunks):
    raise RuntimeError(
        f"FAISS index at {VECTOR_DB_PATH!r} holds {loaded_vectors} vectors, "
        f"expected {len(chunks)}"
    )

print(f"‚úÖ Database verified!")
print(f"   Successfully loaded from disk")

# Calculate approximate size: sum of the regular files directly inside the
# index directory (`os` is already imported in the notebook's import cell).
total_size = sum(
    os.path.getsize(os.path.join(VECTOR_DB_PATH, f))
    for f in os.listdir(VECTOR_DB_PATH)
    if os.path.isfile(os.path.join(VECTOR_DB_PATH, f))
)
print(f"   Database size: {total_size/1024/1024:.2f} MB")


DATABASE VERIFICATION
‚úÖ Database verified!
   Successfully loaded from disk
   Database size: 20.13 MB




## Save Metadata Summary


In [12]:
# Years actually present in the index, taken from the chunk metadata rather
# than a hard-coded list (the data includes 2024 reports, which the previous
# fixed list omitted). key=str keeps the sort well-defined whether the years
# were stored as ints or as strings.
indexed_years = sorted(
    {chunk_doc.metadata["year"] for chunk_doc in chunks},
    key=str,
)

summary = {
    # Real creation timestamp (UTC, ISO 8601). This key previously held the
    # chunking statistics dict by mistake; those now live under their own key.
    "created_at": datetime.now(timezone.utc).isoformat(),
    "chunking_stats": chunking_stats,
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "embedding_model": EMBEDDING_MODEL,
    "total_chunks": len(chunks),
    "companies": list(chunking_stats['chunks_by_company'].keys()),
    "years": indexed_years,
    "vector_db_type": "FAISS"
}

# Explicit encoding so the file is written the same way on every platform,
# matching how the input JSON is read.
with open("vector_db_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print("\nüíæ Summary saved to: vector_db_summary.json")


üíæ Summary saved to: vector_db_summary.json



## Next Steps

In [13]:
# Completion banner plus a pointer to the next notebook in the series.
print("\n" + "=" * 60, "‚úÖ STEP 2 COMPLETE!", "=" * 60, sep="\n")
print("\nNext: Open 03_rag_chatbot_FAISS.ipynb")


‚úÖ STEP 2 COMPLETE!

Next: Open 03_rag_chatbot_FAISS.ipynb
