## Test Data Generation - Advanced RAG

In [None]:
import glob
import os
import pandas as pd
import initials
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from ragas.testset.generator import TestsetGenerator

# Build the ragas test-set generator. The same model from `initials` is passed
# as both the generator LLM and the critic LLM; embeddings also come from `initials`.
generator = TestsetGenerator.from_langchain(
    generator_llm=initials.model,
    critic_llm=initials.model,
    embeddings=initials.embedding,
)

# --- Chunking strategies ---
# Four alternative splitters are configured below. Only `text_splitter_semantic`
# is referenced by the generation step in this cell; the others are presumably
# kept for alternative experiments (verify before removing).

# Character splitting with an empty separator.
text_splitter = CharacterTextSplitter(separator="", chunk_size=500, chunk_overlap=25)

# Character splitting on blank lines ("\n\n").
text_splitter_separator = CharacterTextSplitter(separator="\n\n", chunk_size=250, chunk_overlap=100)

# Recursive character splitting.
text_splitter_recursive = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

# Semantic splitting, driven by the embedding model from `initials`.
text_splitter_semantic = SemanticChunker(initials.embedding)

# Number of test samples requested from the generator.
TEST_SIZE = 10

# Directory that holds the source .txt documents; the generated CSV is
# written into this same directory.
test_directory = "/Users/taha/Desktop/rag/test_data_naive"

# Destination CSV for the generated test set.
output_file_path = os.path.join(
    test_directory,
    "_testset_hyde_semantic.csv",
)

# Generate the test set only once: when the output CSV is already on disk,
# the whole load / split / generate pipeline is skipped.
if os.path.exists(output_file_path):
    print("[INFO] CSV already exists, skipping...")
else:
    print("\n[INFO] Processing test directory")

    # Collect all .txt files in the test directory. glob makes no ordering
    # guarantee (it depends on the file system), so sort the paths to feed
    # the documents to the splitter in a reproducible order.
    txt_files = sorted(glob.glob(os.path.join(test_directory, "*.txt")))

    print(f"[INFO] Found {len(txt_files)} files in test directory")

    # Fail early with a clear message instead of handing an empty corpus to
    # the splitter and the generator.
    if not txt_files:
        raise FileNotFoundError(f"No .txt files found in {test_directory}")

    # Load the full text of every file; one string per document.
    documents = []
    for file_path in txt_files:
        with open(file_path, "r", encoding="utf-8") as f:
            documents.append(f.read())

    # Apply splitting
    chunks = text_splitter_semantic.create_documents(documents)

    print(f"[INFO] Splitting complete, {len(chunks)} chunks created.")

    # Create a test set of TEST_SIZE items from the chunks
    testset = generator.generate_with_langchain_docs(chunks, test_size=TEST_SIZE)
    testset_df = testset.to_pandas()
    # Report the number of rows actually produced rather than the requested
    # size, so a short test set is visible in the log.
    print(f"[INFO] Testset generated with size {len(testset_df)}")

    # Save DataFrame as CSV
    testset_df.to_csv(output_file_path, index=False)
    print(f"[INFO] Results saved to {output_file_path}")

    print("[INFO] Completed processing test directory")


[INFO] Processing test directory
[INFO] Found 190 files in test directory
[INFO] Splitting complete, 552 chunks created.


Filename and doc_id are the same for all nodes.                     
Generating: 100%|██████████| 10/10 [00:26<00:00,  2.66s/it]


[INFO] Testset generated with size 10
[INFO] Results saved to /Users/taha/Desktop/rag/test_data_naive/_testset_character_chunksize500_overlap25.csv
[INFO] Completed processing test directory
