### Creating testdata folder

###### IMPORTANT NOTE: The test data was created with RAGAS v0.1.21.
###### Versions 0.2.x use a significantly different test data generation structure, which is why v0.1.21 was preferred for this notebook.
###### The existing test data CSV files are already located in the corresponding data folder.
###### If you want to create new test data with this notebook, please use a separate environment and install the pinned version with `pip install ragas==0.1.21`.
###### Evaluation with the RAGAS metrics uses the latest version instead,
###### which is the one specified in the requirements.txt file: `pip install git+https://github.com/explodinggradients/ragas.git`

### Create test_data folder with 8 categories

In [1]:
import os
import random
import shutil

# Root folder of the original data (its subfolders are sampled individually)
source_dir = "/Users/taha/Desktop/rag/data"
# Folder that receives the sampled test data, mirroring the subfolder layout
test_dir = "/Users/taha/Desktop/rag/test_data_routing"

# Upper bound on the number of files taken from each subfolder
file_count = 200

# Function to create test data by copying files from the source directory to the destination directory
def create_test_data(source, destination, count, seed=None):
    """Copy a random sample of text files from every subfolder of ``source``.

    For each direct subdirectory of ``source``, a subdirectory with the same
    name is created under ``destination`` and up to ``count`` of its ``.txt``
    files are copied into it. Files ending in ``_summary.txt`` are never
    selected. Entries of ``source`` that are not directories are ignored.

    Args:
        source: Directory whose direct subdirectories hold the original files.
        destination: Directory receiving one subfolder per source subfolder;
            created if it does not exist.
        count: Maximum number of files to copy per subfolder. A warning is
            printed for subfolders that contain fewer eligible files.
        seed: Optional seed for the random selection. With a seed, the same
            folder contents always produce the same selection. With ``None``
            (the default) the module-level ``random`` state is used.
    """
    # Create the main test directory if it doesn't already exist
    os.makedirs(destination, exist_ok=True)

    # A private generator keeps a seeded run from touching the global random state
    rng = random if seed is None else random.Random(seed)

    # os.listdir() returns entries in arbitrary order; sorting makes a seeded
    # selection independent of the filesystem's listing order.
    for folder in sorted(os.listdir(source)):
        source_folder_path = os.path.join(source, folder)  # Path to the source subfolder
        dest_folder_path = os.path.join(destination, folder)  # Path to the destination subfolder

        # If the item is not a directory, skip it
        if not os.path.isdir(source_folder_path):
            continue

        # Create the corresponding destination subfolder
        os.makedirs(dest_folder_path, exist_ok=True)

        # All .txt files in the current subfolder, excluding files that end with '_summary.txt'
        files = sorted(
            f for f in os.listdir(source_folder_path)
            if f.endswith('.txt') and not f.endswith('_summary.txt')
        )

        # Check if the subfolder has fewer files than the specified count
        if len(files) < count:
            print(f"Warning: The folder '{folder}' has fewer than {count} files.")

        # Randomly select up to 'count' files from the available files
        selected_files = rng.sample(files, min(count, len(files)))

        # Copy each selected file from the source to the destination
        for file_name in selected_files:
            src_file_path = os.path.join(source_folder_path, file_name)
            dest_file_path = os.path.join(dest_folder_path, file_name)
            shutil.copy(src_file_path, dest_file_path)
            print(f"Copied file: {file_name} --> {dest_folder_path}")

# Build the per-category test data set from the configured folders
create_test_data(source=source_dir, destination=test_dir, count=file_count)

Copied file: telekomhilft.telekom.de_t5_Festnetz-Internet_pausieren-eines-Vertrages-wegen-laengerer-Abwesenheit_td-p_6973396_jump-to_first-unread-message.txt --> /Users/taha/Desktop/rag/test_data_routing/MagentaEINS
Copied file: telekomhilft.telekom.de_t5_Festnetz-Internet_Wow-Verguenstigung-entfaellt-bei-Tarifwechsel_td-p_6948541_jump-to_first-unread-message.txt --> /Users/taha/Desktop/rag/test_data_routing/MagentaEINS
Copied file: telekomhilft.telekom.de_t5_Mobilfunk_Ich-moechte-meine-Hauptkarte-Mobilfunk-inkl-Magenta1-Vorteil-und_td-p_6942473_jump-to_first-unread-message.txt --> /Users/taha/Desktop/rag/test_data_routing/MagentaEINS
Copied file: telekomhilft.telekom.de_t5_Mobilfunk_Family-Card-Start-1-Gen-soll-Magenta-Mobil-S-DTAG-werden_td-p_6954353_jump-to_first-unread-message.txt --> /Users/taha/Desktop/rag/test_data_routing/MagentaEINS
Copied file: telekomhilft.telekom.de_t5_Festnetz-Internet_Umstellung-MagentaZuhause-Regio-quot-manuell-quot_td-p_6969278_jump-to_first-unread-mess

### Create test_data folder without categories

In [None]:
import os
import random
import shutil

# Root folder of the original data (all of its subfolders are pooled)
source_dir = "/Users/taha/Desktop/rag/data"
# Single flat folder that receives every selected file
test_dir = "/Users/taha/Desktop/rag/test_data"

# Total number of files to select across all subfolders
file_count = 200

# Function to create test data by selecting random files from all folders and copying them to a single directory
def create_naive_test_data(source, destination, count, seed=None):
    """Copy a random sample of text files from all subfolders into one flat folder.

    Every direct subdirectory of ``source`` is scanned for ``.txt`` files
    (files ending in ``_summary.txt`` are excluded). Up to ``count`` of them
    are chosen at random and copied directly into ``destination``.

    Because all files land in a single folder, candidates are deduplicated by
    file name before sampling. Otherwise two selected files with the same name
    from different subfolders would overwrite each other and the destination
    would end up with fewer than ``count`` files.
    NOTE(review): this treats same-named files in different subfolders as the
    same document and keeps the first one in sorted order; confirm.

    Args:
        source: Directory whose direct subdirectories hold the original files.
        destination: Flat directory receiving the selected files; created if
            it does not exist.
        count: Maximum total number of files to copy. A warning is printed
            when fewer uniquely named files are available.
        seed: Optional seed for the random selection. With a seed, the same
            folder contents always produce the same selection. With ``None``
            (the default) the module-level ``random`` state is used.
    """
    # Create the main destination directory if it doesn't already exist
    os.makedirs(destination, exist_ok=True)

    # Map file name -> source path so that every destination name is unique.
    # Listings are sorted because os.listdir() order is arbitrary.
    candidates = {}
    for folder in sorted(os.listdir(source)):
        source_folder_path = os.path.join(source, folder)

        # Skip if the item is not a directory
        if not os.path.isdir(source_folder_path):
            continue

        for f in sorted(os.listdir(source_folder_path)):
            if f.endswith('.txt') and not f.endswith('_summary.txt'):
                # First occurrence of a name wins; later duplicates are ignored
                candidates.setdefault(f, os.path.join(source_folder_path, f))

    all_files = list(candidates.values())

    # Check if there are fewer files than the desired count
    if len(all_files) < count:
        print(f"Warning: Only {len(all_files)} files found, which is less than the specified {count}.")

    # A private generator keeps a seeded run from touching the global random state
    rng = random if seed is None else random.Random(seed)

    # Randomly select up to 'count' files from the combined list of all .txt files
    selected_files = rng.sample(all_files, min(count, len(all_files)))

    # Copy each selected file into the single destination folder
    for file_path in selected_files:
        file_name = os.path.basename(file_path)
        dest_file_path = os.path.join(destination, file_name)
        shutil.copy(file_path, dest_file_path)
        print(f"Copied file: {file_name} --> {destination}")

# Build the flat (category-free) test data set from the configured folders
create_naive_test_data(source=source_dir, destination=test_dir, count=file_count)

### Naive RAG - Semantic Search - Character Splitting
#### Creating test data with character splitting (the cell below uses the recursive character splitter)

In [None]:
import glob
import os
import pandas as pd
import initials
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from ragas.testset.generator import TestsetGenerator

# Test set generator: the model from `initials` is used both as generator and
# as critic, together with the embedding from `initials`
generator = TestsetGenerator.from_langchain(
    generator_llm=initials.model,
    critic_llm=initials.model,
    embeddings=initials.embedding,
)

# Plain character splitter (empty separator)
text_splitter = CharacterTextSplitter(separator='', chunk_size=250, chunk_overlap=25)

# Character splitter using "\n\n" as separator
text_splitter_separator = CharacterTextSplitter(separator="\n\n", chunk_size=250, chunk_overlap=100)

# Recursive character splitter
text_splitter_recursive = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=25,
)

# Semantic splitter driven by the embedding from `initials`
text_splitter_semantic = SemanticChunker(
    embeddings=initials.embedding,
    breakpoint_threshold_type="gradient",
)

# Folder holding the test data files
test_directory = "/Users/taha/Desktop/rag/test_data"

# Number of items to generate for the test set
TEST_SIZE = 10

# CSV file the generated test set is written to
output_file_path = os.path.join(test_directory, "_testset_recursive_chunksize250_overlap25.csv")

# Generate the test set once: the CSV acts as a cache, so an existing file is
# never regenerated or overwritten.
if os.path.exists(output_file_path):
    print("[INFO] CSV already exists, skipping...")
else:
    print("\n[INFO] Processing test directory")

    # Collect all .txt files in the test directory (summary files excluded).
    # Sorted, because glob returns matches in arbitrary order and the order
    # determines the order of the documents handed to the splitter.
    txt_files = sorted(
        path for path in glob.glob(os.path.join(test_directory, "*.txt"))
        if not path.endswith("_summary.txt")
    )

    print(f"[INFO] Found {len(txt_files)} files in test directory")

    # Fail early with a clear message instead of handing an empty document
    # list to the generator.
    if not txt_files:
        raise FileNotFoundError(
            f"No .txt files found in {test_directory}; create the test data folder first."
        )

    # Load all files
    documents = []
    for file_path in txt_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            documents.append(f.read())

    # Apply splitting. Every chunk carries its source file as `filename`
    # metadata (one metadata dict per input text, same order as `documents`).
    # NOTE(review): the recorded run printed "Filename and doc_id are the same
    # for all nodes.", which RAGAS 0.1.x appears to emit when documents lack
    # `filename` metadata; confirm against the pinned version.
    chunks = text_splitter_recursive.create_documents(
        documents,
        metadatas=[{"filename": file_path} for file_path in txt_files],
    )

    print(f"[INFO] Splitting complete, {len(chunks)} chunks created.")

    # Create a test set of TEST_SIZE items
    testset = generator.generate_with_langchain_docs(chunks, test_size=TEST_SIZE)
    testset_df = testset.to_pandas()
    print(f"[INFO] Testset generated with size {TEST_SIZE}")

    # Save DataFrame as CSV
    testset_df.to_csv(output_file_path, index=False)
    print(f"[INFO] Results saved to {output_file_path}")

    print("[INFO] Completed processing test directory")

  from .autonotebook import tqdm as notebook_tqdm



[INFO] Processing test directory
[INFO] Found 190 files in test directory
[INFO] Splitting complete, 1380 chunks created.


Filename and doc_id are the same for all nodes.                     
Generating: 100%|██████████| 10/10 [00:40<00:00,  4.03s/it]


[INFO] Testset generated with size 10
[INFO] Results saved to /Users/taha/Desktop/rag/test_data_naive/_testset_recursive_chunksize250_overlap25.csv
[INFO] Completed processing test directory
