<a href="https://colab.research.google.com/github/Dhanya-Zac/Multilingual-LLM-hallucination-test/blob/main/Friday_October_English_data_knowledge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
Simple Usage Example for CSV to JSON Converter
"""

import pandas as pd
import json

# Simple, direct approach
def simple_csv_to_json(csv_file, json_file, max_rows=15000):
    """
    Simplest possible implementation
    """
    # Read CSV
    df = pd.read_csv(csv_file)

    # Keep first 15000 rows (delete everything from row 15001 onwards)
    df = df.iloc[:max_rows]

    # Convert to JSON (array of objects format)
    df.to_json(json_file, orient='records', indent=2)

    print(f"Converted {len(df)} rows from {csv_file} to {json_file}")


# One-liner version (for small files)
def oneliner_csv_to_json(csv_file, json_file):
    """Ultra-compact version"""
    pd.read_csv(csv_file).iloc[:15000].to_json(json_file, orient='records', indent=2)


# Usage examples
if __name__ == "__main__":
    # Example 1: Simple usage
    simple_csv_to_json('English.csv', 'English_dataset.json')

    # Example 2: One-liner
    # oneliner_csv_to_json('data.csv', 'output.json')

    # Example 3: With custom row limit
    # simple_csv_to_json('data.csv', 'output.json', max_rows=10000)

Converted 15000 rows from English.csv to English_dataset.json


In [None]:
# --- Imports ---
import json
import random
import time
import tiktoken
from openai import OpenAI
from typing import List, Tuple, Dict
import numpy as np
from tqdm import tqdm
import os

# --- Colab-specific imports ---
from google.colab import userdata, files
import io

class KnowledgeDatasetGPT:
    def __init__(
        self,
        path_to_knowledge_dataset: str = "datasets/",
        dataset_name: str = "English",
        model_name: str = "gpt-4o-mini"
    ):
        """
        Initialize the knowledge dataset creator for GPT-4o-mini

        Args:
            path_to_knowledge_dataset: Path to save the datasets
            dataset_name: Name of the dataset (default: "English")
            model_name: OpenAI model name
        """
        # --- Get API key securely from Colab ---
        api_key = userdata.get('gptapi')
        if not api_key:
            raise ValueError("API key not found in Colab userdata. Please set it with userdata.set('gptapi', 'your_key').")

        # Set seeds for reproducibility
        random.seed(42)
        np.random.seed(42)

        # Initialize OpenAI client
        self.client = OpenAI(api_key=api_key)
        self.model_name = model_name

        # Initialize tiktoken tokenizer
        self.tokenizer = tiktoken.encoding_for_model("gpt-4")

        self.dataset_name = dataset_name

        # Create directory if it doesn't exist
        os.makedirs(path_to_knowledge_dataset, exist_ok=True)

        # Load initial dataset
        initial_dataset = self.load_manual_dataset()

        # Create knowledge dataset
        self.create_knowledge_dataset(initial_dataset, path_to_knowledge_dataset)


    def load_manual_dataset(self) -> List[Tuple]:
        """
        Allow manual upload of a dataset file (English.json or .csv)
        Each row should contain at least 'question' and 'answer'.
        """
        print("Please upload your dataset file (JSON or CSV)...")
        uploaded = files.upload()

        if not uploaded:
            raise ValueError("No file uploaded. Please upload your English dataset.")

        file_name = list(uploaded.keys())[0]
        print(f"Uploaded file: {file_name}")

        # --- Parse JSON ---
        if file_name.endswith(".json"):
            data = json.load(io.BytesIO(uploaded[file_name]))
        # --- Parse CSV ---
        elif file_name.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(io.BytesIO(uploaded[file_name]))
            data = df.to_dict(orient="records")
        else:
            raise ValueError("Unsupported file format. Please upload a JSON or CSV file.")

        print(f"Loaded {len(data)} records from {file_name}")

        dataset = []
        for i, row in enumerate(data):
            if "question" not in row or "answer" not in row:
                continue

            prompt = f"question: {row['question']}\nanswer:"
            answer = str(row["answer"]).strip()
            answer_tokens = self.tokenize(answer)
            dataset.append([prompt, answer, answer_tokens])

            if i < 5:
                print(f"Example {i}: {prompt[:50]}... -> {answer}")

        return dataset


    def tokenize(self, text: str) -> List[int]:
        """Tokenize text using tiktoken"""
        return self.tokenizer.encode(text)


    def generate_with_temperature(self, prompt: str, temperature: float = 0.5, n: int = 5) -> List[str]:
        """Generate n completions with specified temperature using OpenAI API"""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=20,
                temperature=temperature,
                n=n,
                stop=["\n", ".", "?"]
            )
            return [choice.message.content.strip() for choice in response.choices]
        except Exception as e:
            print(f"Error during generation: {e}")
            time.sleep(2)
            return [""] * n


    def generate_greedy(self, prompt: str) -> str:
        """Generate with greedy decoding (temperature=0)"""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=20,
                temperature=0,
                n=1,
                stop=["\n", ".", "?"]
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error during greedy generation: {e}")
            time.sleep(2)
            return ""


    def create_knowledge_dataset(self, initial_dataset: List[Tuple], path_to_save: str):
        """Create knowledge, non-knowledge, and else datasets based on model responses"""
        knowledge_dataset, non_knowledge_dataset, else_dataset = [], [], []

        few_shot_examples = [
            "question: What is the capital of France?\nanswer: Paris\n",
            "question: Who wrote 'Romeo and Juliet'?\nanswer: William Shakespeare\n",
            "question: What is the square root of 64?\nanswer: 8\n",
            "question: Which element has the chemical symbol 'H'?\nanswer: Hydrogen\n",
            "question: What is the currency of Japan?\nanswer: Japanese Yen\n"
        ]

        for idx, point in enumerate(tqdm(initial_dataset, desc="Processing examples")):
            prompt, target_answer, answer_tokens = point

            # Save progress periodically
            if idx % 1000 == 0 and idx > 0:
                print(f"\nProgress: {idx}/{len(initial_dataset)}")
                self._save_datasets(knowledge_dataset, non_knowledge_dataset, else_dataset, path_to_save)

            few_shot_prompt = "".join(random.sample(few_shot_examples, 3))
            full_prompt = few_shot_prompt + prompt

            temp_generations = self.generate_with_temperature(full_prompt, temperature=0.5, n=5)
            greedy_generation = self.generate_greedy(full_prompt)

            all_generations = temp_generations + [greedy_generation]
            count_correct = sum(target_answer.lower() in gen.lower() for gen in all_generations)

            if count_correct == 6:
                knowledge_dataset.append([prompt, target_answer, answer_tokens, count_correct])
            elif count_correct == 0:
                non_knowledge_dataset.append([prompt, target_answer, answer_tokens, count_correct])
            else:
                else_dataset.append([prompt, target_answer, answer_tokens, count_correct])

            if idx < 3:
                print(f"\nExample {idx}: {prompt[:50]}...")
                print(f"Target: {target_answer}")
                print(f"Correct count: {count_correct}/6")
                print(f"Greedy: {greedy_generation}")

            time.sleep(0.1)

        print(f"\nFinal Results ‚Äî Knowledge: {len(knowledge_dataset)}, Non-knowledge: {len(non_knowledge_dataset)}, Else: {len(else_dataset)}")
        self._save_datasets(knowledge_dataset, non_knowledge_dataset, else_dataset, path_to_save)


    def _save_datasets(self, knowledge_dataset, non_knowledge_dataset, else_dataset, path_to_save):
        """Helper function to save datasets"""
        model_name_safe = self.model_name.replace("/", "_")
        dataset_name_safe = self.dataset_name.replace(" ", "_")

        for name, data in {
            "knowledge": knowledge_dataset,
            "non_knowledge": non_knowledge_dataset,
            "else": else_dataset
        }.items():
            path = os.path.join(path_to_save, f"{model_name_safe}_{dataset_name_safe}_{name}_dataset.json")
            with open(path, "w") as f:
                json.dump(data, f, indent=2)
            print(f"Saved {name} dataset to {path}")

if __name__ == "__main__":
    dataset_creator = KnowledgeDatasetGPT(
        path_to_knowledge_dataset="datasets/gpt4o_mini/",
        dataset_name="English",
        model_name="gpt-4o-mini"
    )


Please upload your dataset file (JSON or CSV)...


Saving English_dataset.json to English_dataset (1).json
Uploaded file: English_dataset (1).json
Loaded 15000 records from English_dataset (1).json
Example 0: question:  In which U.S. state were Henry Ford, Ma... -> Michigan
Example 1: question:  The ‚ÄòRing of Fire‚Äô is in which ocean?

... -> Pacific Ocean
Example 2: question:  What is the main constituent of natural... -> Methane
Example 3: question:  "Who wrote the 12 volume novel sequence... -> Anthony Powell
Example 4: question:  Conceived in 1999 in Melbourne, Austral... -> Moustaches


Processing examples:   0%|          | 1/15000 [00:03<14:28:19,  3.47s/it]


Example 0: question:  In which U.S. state were Henry Ford, Ma...
Target: Michigan
Correct count: 6/6
Greedy: Michigan


Processing examples:   0%|          | 2/15000 [00:07<14:52:57,  3.57s/it]


Example 1: question:  The ‚ÄòRing of Fire‚Äô is in which ocean?

...
Target: Pacific Ocean
Correct count: 6/6
Greedy: The ‚ÄòRing of Fire‚Äô is in the Pacific Ocean


Processing examples:   0%|          | 3/15000 [00:09<11:51:18,  2.85s/it]


Example 2: question:  What is the main constituent of natural...
Target: Methane
Correct count: 6/6
Greedy: Methane


Processing examples:   0%|          | 70/15000 [02:13<7:53:29,  1.90s/it]


KeyboardInterrupt: 

In [None]:
# --- Imports ---
import json
import random
import time
import tiktoken
from openai import OpenAI
from typing import List, Tuple, Dict, Optional
import numpy as np
from tqdm import tqdm
import os

# --- Colab-specific imports ---
from google.colab import userdata, files
import io

class KnowledgeDatasetGPT:
    def __init__(
        self,
        path_to_knowledge_dataset: str = "datasets/",
        dataset_name: str = "English",
        model_name: str = "gpt-4o-mini",
        batch_size: int = 2000
    ):
        """
        Initialize the knowledge dataset creator for GPT-4o-mini with batch processing

        Args:
            path_to_knowledge_dataset: Path to save the datasets
            dataset_name: Name of the dataset (default: "English")
            model_name: OpenAI model name
            batch_size: Number of examples to process before saving a batch (default: 2000)
        """
        # --- Get API key securely from Colab ---
        api_key = userdata.get('gptapi')
        if not api_key:
            raise ValueError("API key not found in Colab userdata. Please set it with userdata.set('gptapi', 'your_key').")

        # Set seeds for reproducibility
        random.seed(42)
        np.random.seed(42)

        # Initialize OpenAI client
        self.client = OpenAI(api_key=api_key)
        self.model_name = model_name
        self.batch_size = batch_size

        # Initialize tiktoken tokenizer
        self.tokenizer = tiktoken.encoding_for_model("gpt-4")

        self.dataset_name = dataset_name

        # Create directory if it doesn't exist
        os.makedirs(path_to_knowledge_dataset, exist_ok=True)

        # Create batch directory
        self.batch_dir = os.path.join(path_to_knowledge_dataset, "batches")
        os.makedirs(self.batch_dir, exist_ok=True)

        # Load initial dataset
        initial_dataset = self.load_manual_dataset()

        # Create knowledge dataset with batch processing
        self.create_knowledge_dataset(initial_dataset, path_to_knowledge_dataset)


    def load_manual_dataset(self) -> List[Tuple]:
        """
        Allow manual upload of a dataset file (English.json or .csv)
        Each row should contain at least 'question' and 'answer'.
        """
        print("Please upload your dataset file (JSON or CSV)...")
        uploaded = files.upload()

        if not uploaded:
            raise ValueError("No file uploaded. Please upload your English dataset.")

        file_name = list(uploaded.keys())[0]
        print(f"Uploaded file: {file_name}")

        # --- Parse JSON ---
        if file_name.endswith(".json"):
            data = json.load(io.BytesIO(uploaded[file_name]))
        # --- Parse CSV ---
        elif file_name.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(io.BytesIO(uploaded[file_name]))
            data = df.to_dict(orient="records")
        else:
            raise ValueError("Unsupported file format. Please upload a JSON or CSV file.")

        print(f"Loaded {len(data)} records from {file_name}")

        dataset = []
        for i, row in enumerate(data):
            if "question" not in row or "answer" not in row:
                continue

            prompt = f"question: {row['question']}\nanswer:"
            answer = str(row["answer"]).strip()
            answer_tokens = self.tokenize(answer)
            dataset.append([prompt, answer, answer_tokens])

            if i < 5:
                print(f"Example {i}: {prompt[:50]}... -> {answer}")

        return dataset


    def tokenize(self, text: str) -> List[int]:
        """Tokenize text using tiktoken"""
        return self.tokenizer.encode(text)


    def generate_with_temperature(self, prompt: str, temperature: float = 0.5, n: int = 5) -> List[str]:
        """Generate n completions with specified temperature using OpenAI API"""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=20,
                temperature=temperature,
                n=n,
                stop=["\n", ".", "?"]
            )
            return [choice.message.content.strip() for choice in response.choices]
        except Exception as e:
            print(f"Error during generation: {e}")
            time.sleep(2)
            return [""] * n


    def generate_greedy(self, prompt: str) -> str:
        """Generate with greedy decoding (temperature=0)"""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=20,
                temperature=0,
                n=1,
                stop=["\n", ".", "?"]
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error during greedy generation: {e}")
            time.sleep(2)
            return ""


    def create_knowledge_dataset(self, initial_dataset: List[Tuple], path_to_save: str):
        """Create knowledge, non-knowledge, and else datasets with batch processing"""

        # Batch tracking
        batch_num = 0
        examples_in_batch = 0

        # Current batch data
        knowledge_dataset = []
        non_knowledge_dataset = []
        else_dataset = []

        # Overall statistics
        total_knowledge = 0
        total_non_knowledge = 0
        total_else = 0

        # Metadata for tracking batches
        batch_metadata = {
            "model": self.model_name,
            "dataset_name": self.dataset_name,
            "batch_size": self.batch_size,
            "batches": []
        }

        few_shot_examples = [
            "question: What is the capital of France?\nanswer: Paris\n",
            "question: Who wrote 'Romeo and Juliet'?\nanswer: William Shakespeare\n",
            "question: What is the square root of 64?\nanswer: 8\n",
            "question: Which element has the chemical symbol 'H'?\nanswer: Hydrogen\n",
            "question: What is the currency of Japan?\nanswer: Japanese Yen\n"
        ]

        print(f"Processing {len(initial_dataset)} examples in batches of {self.batch_size}...")
        start_batch = 5  # ‚Üê change this number to resume from any batch you want
        start_index = (start_batch - 1) * self.batch_size
        initial_dataset = initial_dataset[start_index:]
        batch_num = start_batch - 1
        for idx, point in enumerate(tqdm(initial_dataset, desc="Processing examples")):
            prompt, target_answer, answer_tokens = point

            # Generate few-shot prompt
            few_shot_prompt = "".join(random.sample(few_shot_examples, 3))
            full_prompt = few_shot_prompt + prompt

            # Generate completions
            temp_generations = self.generate_with_temperature(full_prompt, temperature=0.5, n=5)
            greedy_generation = self.generate_greedy(full_prompt)

            # Count correct answers
            all_generations = temp_generations + [greedy_generation]
            count_correct = sum(target_answer.lower() in gen.lower() for gen in all_generations)

            # Classify and add to appropriate dataset
            if count_correct == 6:
                knowledge_dataset.append([prompt, target_answer, answer_tokens, count_correct])
            elif count_correct == 0:
                non_knowledge_dataset.append([prompt, target_answer, answer_tokens, count_correct])
            else:
                else_dataset.append([prompt, target_answer, answer_tokens, count_correct])

            examples_in_batch += 1

            # Print first few examples for debugging
            if idx < 3:
                print(f"\nExample {idx}: {prompt[:50]}...")
                print(f"Target: {target_answer}")
                print(f"Correct count: {count_correct}/6")
                print(f"Greedy: {greedy_generation}")

            # Check if we should save a batch
            if examples_in_batch >= self.batch_size or idx == len(initial_dataset) - 1:
                batch_num += 1

                # Save current batch
                batch_info = self._save_batch(
                    knowledge_dataset,
                    non_knowledge_dataset,
                    else_dataset,
                    batch_num,
                    self.batch_dir
                )

                # Update totals
                total_knowledge += len(knowledge_dataset)
                total_non_knowledge += len(non_knowledge_dataset)
                total_else += len(else_dataset)

                # Add batch info to metadata
                batch_metadata["batches"].append({
                    "batch_number": batch_num,
                    "examples_processed": examples_in_batch,
                    "knowledge_count": len(knowledge_dataset),
                    "non_knowledge_count": len(non_knowledge_dataset),
                    "else_count": len(else_dataset),
                    "total_processed": idx + 1
                })

                print(f"\n--- Batch {batch_num} Summary ---")
                print(f"Examples in batch: {examples_in_batch}")
                print(f"Knowledge: {len(knowledge_dataset)}")
                print(f"Non-knowledge: {len(non_knowledge_dataset)}")
                print(f"Else: {len(else_dataset)}")
                print(f"Total processed so far: {idx + 1}/{len(initial_dataset)}")
                print(f"Running totals - K: {total_knowledge}, NK: {total_non_knowledge}, E: {total_else}\n")

                # Clear current batch data to free memory
                knowledge_dataset = []
                non_knowledge_dataset = []
                else_dataset = []
                examples_in_batch = 0

            time.sleep(0.1)  # Rate limiting

        # Save metadata
        metadata_path = os.path.join(path_to_save, f"{self.model_name.replace('/', '_')}_{self.dataset_name}_metadata.json")
        with open(metadata_path, "w") as f:
            json.dump(batch_metadata, f, indent=2)
        print(f"\nSaved batch metadata to {metadata_path}")

        # Final summary
        print(f"\n=== Final Results ===")
        print(f"Total batches created: {batch_num}")
        print(f"Total Knowledge: {total_knowledge}")
        print(f"Total Non-knowledge: {total_non_knowledge}")
        print(f"Total Else: {total_else}")
        print(f"Total examples processed: {total_knowledge + total_non_knowledge + total_else}")

        # Ask if user wants to consolidate batches
        consolidate = input("\nDo you want to consolidate all batches into single files? (y/n): ")
        if consolidate.lower() == 'y':
            self._consolidate_batches(batch_num, self.batch_dir, path_to_save)


    def _save_batch(self, knowledge_dataset, non_knowledge_dataset, else_dataset, batch_num, batch_dir):
        """Save a single batch of datasets"""
        model_name_safe = self.model_name.replace("/", "_")
        dataset_name_safe = self.dataset_name.replace(" ", "_")

        batch_info = {}

        for name, data in {
            "knowledge": knowledge_dataset,
            "non_knowledge": non_knowledge_dataset,
            "else": else_dataset
        }.items():
            filename = f"{model_name_safe}_{dataset_name_safe}_{name}_batch_{batch_num:04d}.json"
            path = os.path.join(batch_dir, filename)

            with open(path, "w") as f:
                json.dump(data, f, indent=2)

            batch_info[name] = {
                "filename": filename,
                "count": len(data),
                "path": path
            }

            print(f"Saved {name} batch {batch_num} to {path} ({len(data)} examples)")

        return batch_info


    def _consolidate_batches(self, total_batches: int, batch_dir: str, final_dir: str):
        """Consolidate all batch files into single files for each category"""
        print("\nConsolidating batches...")

        model_name_safe = self.model_name.replace("/", "_")
        dataset_name_safe = self.dataset_name.replace(" ", "_")

        for category in ["knowledge", "non_knowledge", "else"]:
            consolidated_data = []

            # Read all batch files for this category
            for batch_num in range(1, total_batches + 1):
                batch_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_batch_{batch_num:04d}.json"
                batch_path = os.path.join(batch_dir, batch_filename)

                if os.path.exists(batch_path):
                    with open(batch_path, "r") as f:
                        batch_data = json.load(f)
                        consolidated_data.extend(batch_data)
                    print(f"Loaded {len(batch_data)} examples from batch {batch_num} for {category}")

            # Save consolidated file
            final_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_dataset_consolidated.json"
            final_path = os.path.join(final_dir, final_filename)

            with open(final_path, "w") as f:
                json.dump(consolidated_data, f, indent=2)

            print(f"Saved consolidated {category} dataset to {final_path} ({len(consolidated_data)} total examples)")

        print("\nConsolidation complete!")

        # Ask if user wants to delete batch files
        delete_batches = input("\nDo you want to delete the individual batch files? (y/n): ")
        if delete_batches.lower() == 'y':
            import shutil
            shutil.rmtree(batch_dir)
            print("Batch files deleted.")


if __name__ == "__main__":
    # Initialize with batch_size parameter
    dataset_creator = KnowledgeDatasetGPT(
        path_to_knowledge_dataset="datasets/gpt4o_mini/",
        dataset_name="English",
        model_name="gpt-4o-mini",
        batch_size=2000  # Save every 2000 examples
    )

Please upload your dataset file (JSON or CSV)...


Saving English_dataset.json to English_dataset (1).json
Uploaded file: English_dataset (1).json
Loaded 15000 records from English_dataset (1).json
Example 0: question:  In which U.S. state were Henry Ford, Ma... -> Michigan
Example 1: question:  The ‚ÄòRing of Fire‚Äô is in which ocean?

... -> Pacific Ocean
Example 2: question:  What is the main constituent of natural... -> Methane
Example 3: question:  "Who wrote the 12 volume novel sequence... -> Anthony Powell
Example 4: question:  Conceived in 1999 in Melbourne, Austral... -> Moustaches
Processing 15000 examples in batches of 2000...


Processing examples:   0%|          | 1/7000 [00:03<7:12:14,  3.71s/it]


Example 0: question:  "In which Sunday newspaper would you ge...
Target: Mail on sunday
Correct count: 1/6
Greedy: The "You Magazine" supplement is found in the Sunday Times


Processing examples:   0%|          | 2/7000 [00:07<6:52:07,  3.53s/it]


Example 1: question:  Famous for its military marches, in whi...
Target: Wiltshire
Correct count: 6/6
Greedy: Wootton Bassett is located in Wiltshire, England


Processing examples:   0%|          | 3/7000 [00:08<5:13:31,  2.69s/it]


Example 2: question:  Also the name of a popular TV character...
Target: Zebedee
Correct count: 6/6
Greedy: Zebedee


Processing examples:  29%|‚ñà‚ñà‚ñä       | 2000/7000 [1:01:37<2:12:49,  1.59s/it]

Saved knowledge batch 5 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_knowledge_batch_0005.json (1381 examples)
Saved non_knowledge batch 5 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_non_knowledge_batch_0005.json (454 examples)
Saved else batch 5 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_else_batch_0005.json (165 examples)

--- Batch 5 Summary ---
Examples in batch: 2000
Knowledge: 1381
Non-knowledge: 454
Else: 165
Total processed so far: 2000/7000
Running totals - K: 1381, NK: 454, E: 165



Processing examples:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 4000/7000 [2:03:58<1:36:35,  1.93s/it]

Saved knowledge batch 6 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_knowledge_batch_0006.json (1338 examples)
Saved non_knowledge batch 6 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_non_knowledge_batch_0006.json (471 examples)
Saved else batch 6 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_else_batch_0006.json (191 examples)

--- Batch 6 Summary ---
Examples in batch: 2000
Knowledge: 1338
Non-knowledge: 471
Else: 191
Total processed so far: 4000/7000
Running totals - K: 2719, NK: 925, E: 356



Processing examples:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 6000/7000 [3:09:48<26:33,  1.59s/it]

Saved knowledge batch 7 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_knowledge_batch_0007.json (1334 examples)
Saved non_knowledge batch 7 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_non_knowledge_batch_0007.json (476 examples)
Saved else batch 7 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_else_batch_0007.json (190 examples)

--- Batch 7 Summary ---
Examples in batch: 2000
Knowledge: 1334
Non-knowledge: 476
Else: 190
Total processed so far: 6000/7000
Running totals - K: 4053, NK: 1401, E: 546



Processing examples: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7000/7000 [3:44:19<00:00,  1.92s/it]


Saved knowledge batch 8 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_knowledge_batch_0008.json (627 examples)
Saved non_knowledge batch 8 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_non_knowledge_batch_0008.json (266 examples)
Saved else batch 8 to datasets/gpt4o_mini/batches/gpt-4o-mini_English_else_batch_0008.json (107 examples)

--- Batch 8 Summary ---
Examples in batch: 1000
Knowledge: 627
Non-knowledge: 266
Else: 107
Total processed so far: 7000/7000
Running totals - K: 4680, NK: 1667, E: 653


Saved batch metadata to datasets/gpt4o_mini/gpt-4o-mini_English_metadata.json

=== Final Results ===
Total batches created: 8
Total Knowledge: 4680
Total Non-knowledge: 1667
Total Else: 653
Total examples processed: 7000

Consolidating batches...
Loaded 1381 examples from batch 5 for knowledge
Loaded 1338 examples from batch 6 for knowledge
Loaded 1334 examples from batch 7 for knowledge
Loaded 627 examples from batch 8 for knowledge
Saved consolidated knowledge dataset to data

In [1]:
import os
import json
import glob

def consolidate_all_batches(batch_dir="/content", final_dir="/content/final_consolidated"):
    """
    Combine all batches (1‚Äì4) and the already consolidated datasets (5‚Äì8)
    into one final consolidated file per category.
    """
    print("\nüîÑ Creating FINAL consolidated datasets...")

    model_name_safe = "gpt-4o-mini"
    dataset_name_safe = "English"

    os.makedirs(final_dir, exist_ok=True)

    for category in ["knowledge", "non_knowledge", "else"]:
        # Collect batch files (1‚Äì4)
        batch_pattern = os.path.join(batch_dir, f"{model_name_safe}_{dataset_name_safe}_{category}_batch_*.json")
        batch_files = sorted(glob.glob(batch_pattern))

        # Add the already consolidated file (from batches 5‚Äì8)
        old_consolidated = os.path.join(batch_dir, f"{model_name_safe}_{dataset_name_safe}_{category}_dataset_consolidated.json")

        # Combine both
        all_files = batch_files.copy()
        if os.path.exists(old_consolidated):
            all_files.append(old_consolidated)

        if not all_files:
            print(f"‚ö†Ô∏è No files found for {category}. Skipping...")
            continue

        consolidated_data = []
        for path in all_files:
            with open(path, "r") as f:
                data = json.load(f)
                consolidated_data.extend(data)
            print(f"‚úÖ Loaded {len(data)} examples from {os.path.basename(path)}")

        # Save final file
        final_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_dataset_final_consolidated.json"
        final_path = os.path.join(final_dir, final_filename)

        with open(final_path, "w") as f:
            json.dump(consolidated_data, f, indent=2)

        print(f"üì¶ Saved FINAL {category} dataset to {final_path} ({len(consolidated_data)} total examples)")

    print("\nüéØ Final consolidation complete!")

    # Optional: create one combined file with all categories
    combined_data = []
    for category in ["knowledge", "non_knowledge", "else"]:
        final_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_dataset_final_consolidated.json"
        final_path = os.path.join(final_dir, final_filename)
        if os.path.exists(final_path):
            with open(final_path, "r") as f:
                combined_data.extend(json.load(f))

    combined_path = os.path.join(final_dir, f"{model_name_safe}_{dataset_name_safe}_all_categories_final_consolidated.json")
    with open(combined_path, "w") as f:
        json.dump(combined_data, f, indent=2)

    print(f"\nüß© Combined all categories into {combined_path} ({len(combined_data)} total examples)")

if __name__ == "__main__":
    consolidate_all_batches()



üîÑ Creating FINAL consolidated datasets...
‚úÖ Loaded 1319 examples from gpt-4o-mini_English_knowledge_batch_0001.json
‚úÖ Loaded 1296 examples from gpt-4o-mini_English_knowledge_batch_0002.json
‚úÖ Loaded 1351 examples from gpt-4o-mini_English_knowledge_batch_0003.json
‚úÖ Loaded 1333 examples from gpt-4o-mini_English_knowledge_batch_0004.json
‚úÖ Loaded 4680 examples from gpt-4o-mini_English_knowledge_dataset_consolidated.json
üì¶ Saved FINAL knowledge dataset to /content/final_consolidated/gpt-4o-mini_English_knowledge_dataset_final_consolidated.json (9979 total examples)
‚úÖ Loaded 476 examples from gpt-4o-mini_English_non_knowledge_batch_0001.json
‚úÖ Loaded 487 examples from gpt-4o-mini_English_non_knowledge_batch_0002.json
‚úÖ Loaded 446 examples from gpt-4o-mini_English_non_knowledge_batch_0003.json
‚úÖ Loaded 469 examples from gpt-4o-mini_English_non_knowledge_batch_0004.json
‚úÖ Loaded 1667 examples from gpt-4o-mini_English_non_knowledge_dataset_consolidated.json
üì¶ Sav