In [None]:
#!/usr/bin/env python3
"""
Simple Usage Example for CSV to JSON Converter
"""

import pandas as pd
import json

# Simple, direct approach
def simple_csv_to_json(csv_file, json_file, max_rows=15000):
    """
    Convert the leading rows of a CSV file into a JSON array of objects.

    Args:
        csv_file: Path of the CSV file to read.
        json_file: Path of the JSON file to write.
        max_rows: Number of leading rows to keep; later rows are dropped.
    """
    # Load the whole CSV, then cut it down to the first `max_rows` rows.
    frame = pd.read_csv(csv_file)
    kept = frame.iloc[slice(max_rows)]

    # One JSON object per row ("records" orientation), pretty-printed.
    kept.to_json(json_file, orient='records', indent=2)

    row_count = len(kept)
    print(f"Converted {row_count} rows from {csv_file} to {json_file}")



def oneliner_csv_to_json(csv_file, json_file, max_rows=15000):
    """Ultra-compact version: write the first `max_rows` CSV rows as a JSON array.

    Args:
        csv_file: Path of the CSV file to read.
        json_file: Path of the JSON file to write.
        max_rows: Number of leading rows to keep (defaults to 15000, the
            limit that used to be hard-coded here).
    """
    pd.read_csv(csv_file).iloc[:max_rows].to_json(json_file, orient='records', indent=2)

if __name__ == "__main__":
    # Example 1: convert the sample CSV using the default row limit.
    simple_csv_to_json(csv_file='turkish.csv', json_file='turkish_dataset.json')


Converted 15000 rows from turkish.csv to turkish_dataset.json


In [None]:
# --- Imports ---
import json
import random
import time
import tiktoken
from openai import OpenAI
from typing import List, Tuple, Dict, Optional, Set
import numpy as np
from tqdm import tqdm
import os
import re
import unicodedata
from difflib import SequenceMatcher
from collections import Counter

# --- Colab-specific imports ---
from google.colab import userdata, files
import io

class TurkishTextMatcher:
    """Multi-tier, Turkish-aware comparison of a reference answer with generated text.

    Scores are floats in [0, 1]. The tiers, tried in order by
    `calculate_match_score`, are: exact/containment match (1.0 / 0.95),
    token overlap (up to 0.9), fuzzy similarity (up to 0.6) and partial
    credit (up to 0.4).
    """

    def __init__(self):
        # Turkish-specific character mappings. str.lower() alone maps 'I' to
        # 'i' and 'İ' to 'i' + combining dot, both wrong for Turkish, so these
        # pairs are applied before the generic lowercase step.
        self.turkish_lower_map = {
            'İ': 'i', 'I': 'ı', 'Ğ': 'ğ', 'Ü': 'ü',
            'Ş': 'ş', 'Ö': 'ö', 'Ç': 'ç'
        }

        # Common stop words to ignore in token matching (Turkish and English)
        self.stop_words = {
            # Turkish
            've', 'veya', 'ile', 'için', 'bir', 'bu', 'da', 'de',
            'mi', 'mu', 'mı', 'mü', 'ki', 'ne', 'ya', 'ama', 'fakat',
            'çünkü', 'gibi', 'kadar', 'hem', 'daha', 'çok', 'en',
            # English
            'the', 'of', 'and', 'a', 'an', 'in', 'on', 'at', 'to',
            'for', 'with', 'by', 'from', 'as', 'or', 'but', 'is', 'was'
        }

        # Common abbreviations and variations (keys are already-normalized words)
        self.abbreviations = {
            'dr': 'doktor',
            'prof': 'profesör',
            'st': 'saint',
            'abd': 'amerika birleşik devletleri',
            'usa': 'amerika birleşik devletleri',
            'uk': 'birleşik krallık',
            'eu': 'avrupa birliği'
        }

    def turkish_lower(self, text: str) -> str:
        """Lowercase `text` using Turkish casing rules for the dotted/dotless I."""
        # Apply the Turkish-specific pairs in a single pass, then the
        # standard lowercase for every remaining character.
        return text.translate(str.maketrans(self.turkish_lower_map)).lower()

    def normalize_text(self, text: str) -> str:
        """Normalize text for comparison.

        Steps: compose Unicode (NFC), Turkish-aware lowercase, replace
        punctuation with spaces, collapse whitespace, expand known
        abbreviations word by word.
        """
        # Compose decomposed characters first (e.g. 'u' + combining diaeresis
        # -> 'ü'); otherwise the punctuation filter below would treat the
        # combining mark as a non-word character and split the word.
        text = unicodedata.normalize('NFC', text)

        text = self.turkish_lower(text)

        # Remove punctuation but keep spaces
        text = re.sub(r'[^\w\s]', ' ', text, flags=re.UNICODE)

        # split() collapses runs of whitespace; expand abbreviations per word
        words = [self.abbreviations.get(w, w) for w in text.split()]
        return ' '.join(words)

    def extract_tokens(self, text: str, remove_stopwords: bool = True) -> List[str]:
        """Return the normalized words of `text`, optionally without stop words."""
        tokens = self.normalize_text(text).split()

        if remove_stopwords:
            tokens = [t for t in tokens if t not in self.stop_words]

        return tokens

    def exact_match(self, answer: str, generation: str) -> float:
        """Score 1.0 for equality after normalization, 0.95 for containment, else 0.0.

        Containment is a plain substring test on the normalized strings.
        """
        norm_answer = self.normalize_text(answer)
        norm_generation = self.normalize_text(generation)

        # An empty reference is a substring of every string; without this
        # guard a blank or punctuation-only answer would "match" anything.
        if not norm_answer:
            return 0.0

        if norm_answer == norm_generation:
            return 1.0
        if norm_answer in norm_generation:
            # Penalize slightly if answer is just contained (not exact)
            return 0.95

        return 0.0

    def token_overlap_match(self, answer: str, generation: str) -> float:
        """Score how many non-stop-word answer tokens occur in the generation.

        Returns 0.9 when every answer token is present and their first
        occurrences keep the answer's order, 0.8 when all are present in a
        different order, and `coverage * 0.8` for partial coverage.
        """
        answer_list = self.extract_tokens(answer, remove_stopwords=True)
        gen_list = self.extract_tokens(generation, remove_stopwords=False)

        answer_tokens = set(answer_list)
        if not answer_tokens:
            return 0.0

        overlap = answer_tokens.intersection(gen_list)
        coverage = len(overlap) / len(answer_tokens)

        if coverage == 1.0:
            # Every answer token is in gen_list here, so index() cannot fail.
            indices = [gen_list.index(token) for token in answer_list]
            if indices == sorted(indices):
                return 0.9  # Full overlap with correct order
            return 0.8  # Full overlap but different order

        return coverage * 0.8  # Partial overlap

    def fuzzy_match(self, answer: str, generation: str, threshold: float = 0.85) -> float:
        """Fuzzy string matching based on difflib.SequenceMatcher similarity.

        Only answers of at most 20 normalized characters are considered.
        The answer is compared with the whole generation and then with
        every window of 1-5 consecutive generation words; the first
        similarity reaching `threshold` is returned scaled by 0.6.
        """
        norm_answer = self.normalize_text(answer)
        norm_generation = self.normalize_text(generation)

        # Two empty strings have similarity 1.0; never reward an empty side.
        if not norm_answer or not norm_generation:
            return 0.0

        if len(norm_answer) > 20:
            return 0.0

        similarity = SequenceMatcher(None, norm_answer, norm_generation).ratio()
        if similarity >= threshold:
            return similarity * 0.6  # Scale to max 0.6 for fuzzy matches

        # Also check if answer appears as substring with minor variations
        words_in_gen = norm_generation.split()
        for i in range(len(words_in_gen)):
            for j in range(i + 1, min(i + 6, len(words_in_gen) + 1)):
                window = ' '.join(words_in_gen[i:j])
                ratio = SequenceMatcher(None, norm_answer, window).ratio()
                if ratio >= threshold:
                    return ratio * 0.6

        return 0.0

    def partial_credit_match(self, answer: str, generation: str) -> float:
        """Give partial credit (max 0.4) when key parts of a multi-word answer appear.

        Key parts are the capitalized words of the original answer; when
        there are none, all non-stop-word tokens are used. Each part is
        looked up as a substring of the normalized generation.
        """
        answer_tokens = self.extract_tokens(answer, remove_stopwords=False)

        # Single-word answers get no partial credit
        if len(answer_tokens) < 2:
            return 0.0

        # Extract likely important parts (names, places, etc.)
        important_parts = []
        for word in answer.split():
            # Ignore leading quotes/brackets when testing for a capital letter
            core = re.sub(r'^\W+', '', word, flags=re.UNICODE)
            if core and core[0].isupper():
                # Normalize exactly like the generation so attached
                # punctuation ("Powell,") cannot block the lookup.
                part = self.normalize_text(core)
                if part:
                    important_parts.append(part)

        if not important_parts:
            # If no capitalized words, consider all non-stopwords important
            important_parts = [t for t in answer_tokens if t not in self.stop_words]

        if not important_parts:
            return 0.0

        norm_generation = self.normalize_text(generation)
        matches = sum(1 for part in important_parts if part in norm_generation)
        return (matches / len(important_parts)) * 0.4

    def calculate_match_score(self, answer: str, generation: str) -> Tuple[float, str]:
        """
        Calculate overall match score using multi-tier strategy.

        Returns: (score, match_type) where match_type is one of "exact",
        "token_overlap", "fuzzy", "partial", "token_overlap_low" or
        "no_match".
        """
        # Tier 1: Exact match
        exact_score = self.exact_match(answer, generation)
        if exact_score > 0:
            return exact_score, "exact"

        # Tier 2: Token overlap
        token_score = self.token_overlap_match(answer, generation)
        if token_score >= 0.8:
            return token_score, "token_overlap"

        # Tier 3: Fuzzy match
        fuzzy_score = self.fuzzy_match(answer, generation)
        if fuzzy_score > 0:
            return fuzzy_score, "fuzzy"

        # Tier 4: Partial credit
        partial_score = self.partial_credit_match(answer, generation)
        if partial_score > 0:
            return partial_score, "partial"

        # Fuzzy and partial are both zero here, so the only score that can
        # still be positive is a token overlap below the tier-2 cut-off.
        if token_score > 0:
            return token_score, "token_overlap_low"

        return 0.0, "no_match"


class KnowledgeDatasetGPT:
    """Probe an OpenAI chat model with Turkish QA pairs and bucket them by match score.

    Constructing an instance runs the whole pipeline: it asks for a dataset
    upload (Colab), queries the model six times per question (five sampled
    completions plus one greedy), scores every completion against the
    reference answer with `TurkishTextMatcher`, and writes the examples to
    "knowledge", "partial_knowledge" and "non_knowledge" batch files.
    """

    def __init__(
        self,
        path_to_knowledge_dataset: str = "datasets/",
        dataset_name: str = "turkish",
        model_name: str = "gpt-4o-mini",
        batch_size: int = 2000,
        start_batch: int = 1
    ):
        """
        Initialize the knowledge dataset creator for GPT-4o-mini with advanced Turkish matching

        Args:
            path_to_knowledge_dataset: Path to save the datasets
            dataset_name: Name of the dataset (default: "turkish")
            model_name: OpenAI model name
            batch_size: Number of examples to process before saving a batch (default: 2000)
            start_batch: 1-based batch number to start from; values above 1
                skip the examples of the earlier batches (resume support)

        Raises:
            ValueError: If the API key is missing, no file is uploaded, the
                file format is unsupported, or start_batch is below 1.
        """
        # --- Get API key securely from Colab ---
        api_key = userdata.get('gptapi')
        if not api_key:
            raise ValueError("API key not found in Colab userdata. Please set it with userdata.set('gptapi', 'your_key').")

        # Set seeds for reproducibility
        random.seed(42)
        np.random.seed(42)

        # Initialize OpenAI client
        self.client = OpenAI(api_key=api_key)
        self.model_name = model_name
        self.batch_size = batch_size

        # Use the tokenizer that belongs to the probed model so the stored
        # answer token ids are meaningful for it.
        # NOTE: batches written by earlier runs used the "gpt-4" encoding;
        # do not mix them with batches produced after this change.
        try:
            self.tokenizer = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Model unknown to the installed tiktoken: keep the old default.
            self.tokenizer = tiktoken.encoding_for_model("gpt-4")

        # Initialize Turkish text matcher
        self.matcher = TurkishTextMatcher()

        self.dataset_name = dataset_name

        # Create directory if it doesn't exist
        os.makedirs(path_to_knowledge_dataset, exist_ok=True)

        # Create batch directory
        self.batch_dir = os.path.join(path_to_knowledge_dataset, "batches")
        os.makedirs(self.batch_dir, exist_ok=True)

        # Load initial dataset
        initial_dataset = self.load_manual_dataset()

        # Create knowledge dataset with batch processing
        self.create_knowledge_dataset(initial_dataset, path_to_knowledge_dataset, start_batch=start_batch)


    @staticmethod
    def _clean_field(value) -> str:
        """Return `value` as stripped text, or "" when it is None or NaN."""
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
            return ""
        return str(value).strip()


    def load_manual_dataset(self) -> List[Tuple]:
        """
        Allow manual upload of a dataset file (turkish.json or .csv)
        Each row should contain at least 'soru' and 'cevap'.

        Rows that lack either key, or whose question/answer is null, NaN or
        blank, are skipped.

        Returns:
            A list of [prompt, answer, answer_token_ids] entries.

        Raises:
            ValueError: If nothing was uploaded or the extension is not
                .json / .csv.
        """
        print("Please upload your dataset file (JSON or CSV)...")
        uploaded = files.upload()

        if not uploaded:
            raise ValueError("No file uploaded. Please upload your turkish dataset.")

        file_name = list(uploaded.keys())[0]
        print(f"Uploaded file: {file_name}")

        # Compare the extension case-insensitively (".JSON", ".Csv", ...)
        lowered_name = file_name.lower()

        # --- Parse JSON ---
        if lowered_name.endswith(".json"):
            data = json.load(io.BytesIO(uploaded[file_name]))
        # --- Parse CSV ---
        elif lowered_name.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(io.BytesIO(uploaded[file_name]))
            data = df.to_dict(orient="records")
        else:
            raise ValueError("Unsupported file format. Please upload a JSON or CSV file.")

        print(f"Loaded {len(data)} records from {file_name}")

        dataset = []
        skipped = 0
        for i, row in enumerate(data):
            if not isinstance(row, dict) or "soru" not in row or "cevap" not in row:
                skipped += 1
                continue

            # Empty CSV cells arrive as NaN and JSON nulls as None; str()
            # would turn them into the bogus answers "nan" / "None".
            cevap = self._clean_field(row["cevap"])
            if not cevap or not self._clean_field(row["soru"]):
                skipped += 1
                continue

            prompt = f"soru: {row['soru']}\ncevap:"
            cevap_tokens = self.tokenize(cevap)
            dataset.append([prompt, cevap, cevap_tokens])

            if i < 5:
                print(f"Example {i}: {prompt[:50]}... -> {cevap}")

        if skipped:
            print(f"Skipped {skipped} records with a missing question or answer")

        return dataset


    def tokenize(self, text: str) -> List[int]:
        """Tokenize text using tiktoken"""
        return self.tokenizer.encode(text)


    def _request_completions(self, prompt: str, temperature: float, n: int) -> List[str]:
        """Send one chat-completion request and return the stripped text of each choice."""
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=20,
            temperature=temperature,
            n=n,
            stop=["\n", ".", "?"]
        )
        # `content` can be None for a choice; treat that as an empty answer
        # instead of letting one such choice discard the whole response.
        return [(choice.message.content or "").strip() for choice in response.choices]


    def generate_with_temperature(self, prompt: str, temperature: float = 0.5, n: int = 5) -> List[str]:
        """Generate n completions with specified temperature using OpenAI API.

        Best effort: on any API error the error is printed, the call sleeps
        for two seconds and n empty strings are returned.
        """
        try:
            return self._request_completions(prompt, temperature, n)
        except Exception as e:
            print(f"Error during generation: {e}")
            time.sleep(2)
            return [""] * n


    def generate_greedy(self, prompt: str) -> str:
        """Generate with greedy decoding (temperature=0).

        Best effort: on any API error the error is printed, the call sleeps
        for two seconds and an empty string is returned.
        """
        try:
            return self._request_completions(prompt, 0, 1)[0]
        except Exception as e:
            print(f"Error during greedy generation: {e}")
            time.sleep(2)
            return ""


    def create_knowledge_dataset(self, initial_dataset: List[Tuple], path_to_save: str, start_batch: int = 1):
        """Create knowledge, non-knowledge, and partial knowledge datasets with advanced matching.

        Args:
            initial_dataset: [prompt, answer, answer_token_ids] entries.
            path_to_save: Directory for the metadata and consolidated files.
            start_batch: 1-based batch to start from. The first
                (start_batch - 1) * batch_size examples are skipped. The
                metadata file is rewritten and then lists only the batches
                of the current run.

        Raises:
            ValueError: If start_batch is below 1.
        """
        if start_batch < 1:
            raise ValueError("start_batch must be >= 1")

        # Batch tracking
        examples_in_batch = 0

        # Current batch data - now with three categories
        knowledge_dataset = []
        partial_knowledge_dataset = []
        non_knowledge_dataset = []

        # Overall statistics
        total_knowledge = 0
        total_partial = 0
        total_non_knowledge = 0

        # Match type statistics (only completions with a positive score are counted)
        match_type_stats = Counter()

        # Metadata for tracking batches
        batch_metadata = {
            "model": self.model_name,
            "dataset_name": self.dataset_name,
            "batch_size": self.batch_size,
            "matching_strategy": "multi-tier Turkish-aware",
            "batches": []
        }

        few_shot_examples = [
            "soru: Fransa'nın başkenti neresidir?\ncevap: Paris\n",
            "soru: Romeo ve Juliet'i kim yazdı?\ncevap: William Shakespeare\n",
            "soru: 64'ün karekökü nedir?\ncevap: 8\n",
            "soru: Kimyasal sembolü H olan element hangisidir?\ncevap: Hidrojen\n",
            "soru: Japonya'nın para birimi nedir?\ncevap: Japon Yeni\n"
        ]

        print(f"Processing {len(initial_dataset)} examples in batches of {self.batch_size}...")
        print("Using advanced Turkish-aware string matching...")

        # Allow resuming from a specific batch
        start_index = (start_batch - 1) * self.batch_size
        initial_dataset = initial_dataset[start_index:]
        batch_num = start_batch - 1
        if start_batch > 1:
            print(f"Resuming at batch {start_batch} (skipping the first {start_index} examples)")

        for idx, point in enumerate(tqdm(initial_dataset, desc="Processing examples")):
            prompt, target_cevap, cevap_tokens = point

            # Generate few-shot prompt
            few_shot_prompt = "".join(random.sample(few_shot_examples, 3))
            full_prompt = few_shot_prompt + prompt

            # Generate completions
            temp_generations = self.generate_with_temperature(full_prompt, temperature=0.5, n=5)
            greedy_generation = self.generate_greedy(full_prompt)

            # Calculate match scores for all generations
            all_generations = temp_generations + [greedy_generation]
            scores = []
            match_types = []

            for gen in all_generations:
                score, match_type = self.matcher.calculate_match_score(target_cevap, gen)
                scores.append(score)
                match_types.append(match_type)
                if score > 0:
                    match_type_stats[match_type] += 1

            # Calculate average score
            avg_score = np.mean(scores)
            max_score = np.max(scores)
            num_matches = sum(1 for s in scores if s > 0)

            # Enhanced classification based on scores
            example_data = {
                "prompt": prompt,
                "target": target_cevap,
                "tokens": cevap_tokens,
                "avg_score": avg_score,
                "max_score": max_score,
                "num_matches": num_matches,
                "scores": scores,
                "match_types": match_types,
                "generations": all_generations[:3]  # Save first 3 generations for analysis
            }

            # Classify based on average score
            if avg_score >= 0.8:
                knowledge_dataset.append(example_data)
                category = "KNOWLEDGE"
            elif avg_score >= 0.3:
                partial_knowledge_dataset.append(example_data)
                category = "PARTIAL"
            else:
                non_knowledge_dataset.append(example_data)
                category = "NON-KNOWLEDGE"

            examples_in_batch += 1

            # Print detailed info for first few examples
            if idx < 5:
                print(f"\n{'='*60}")
                print(f"Example {idx}: {prompt[:60]}...")
                print(f"Target answer: '{target_cevap}'")
                print(f"Greedy generation: '{greedy_generation}'")
                print(f"Match scores: {[f'{s:.2f}' for s in scores]}")
                print(f"Match types: {match_types[:2]}...")
                print(f"Average score: {avg_score:.3f}")
                print(f"Category: {category}")
                print(f"{'='*60}")

            # Check if we should save a batch (batch full, or last example)
            if examples_in_batch >= self.batch_size or idx == len(initial_dataset) - 1:
                batch_num += 1

                # Save current batch
                self._save_batch(
                    knowledge_dataset,
                    partial_knowledge_dataset,
                    non_knowledge_dataset,
                    batch_num,
                    self.batch_dir
                )

                # Update totals
                total_knowledge += len(knowledge_dataset)
                total_partial += len(partial_knowledge_dataset)
                total_non_knowledge += len(non_knowledge_dataset)

                # Add batch info to metadata
                batch_metadata["batches"].append({
                    "batch_number": batch_num,
                    "examples_processed": examples_in_batch,
                    "knowledge_count": len(knowledge_dataset),
                    "partial_count": len(partial_knowledge_dataset),
                    "non_knowledge_count": len(non_knowledge_dataset),
                    "total_processed": idx + 1 + start_index
                })

                print(f"\n--- Batch {batch_num} Summary ---")
                print(f"Examples in batch: {examples_in_batch}")
                print(f"Knowledge (≥0.8): {len(knowledge_dataset)}")
                print(f"Partial (0.3-0.8): {len(partial_knowledge_dataset)}")
                print(f"Non-knowledge (<0.3): {len(non_knowledge_dataset)}")
                print(f"Total processed so far: {idx + 1 + start_index}/{len(initial_dataset) + start_index}")
                print(f"Running totals - K: {total_knowledge}, P: {total_partial}, NK: {total_non_knowledge}")

                # Clear current batch data to free memory
                knowledge_dataset = []
                partial_knowledge_dataset = []
                non_knowledge_dataset = []
                examples_in_batch = 0

            time.sleep(0.1)  # Rate limiting

        # Add match type statistics to metadata
        batch_metadata["match_type_statistics"] = dict(match_type_stats)

        # Save metadata
        metadata_path = os.path.join(path_to_save, f"{self.model_name.replace('/', '_')}_{self.dataset_name}_metadata.json")
        with open(metadata_path, "w", encoding='utf-8') as f:
            json.dump(batch_metadata, f, indent=2, ensure_ascii=False)
        print(f"\nSaved batch metadata to {metadata_path}")

        # Final summary
        print(f"\n{'='*60}")
        print(f"=== FINAL RESULTS ===")
        print(f"{'='*60}")
        print(f"Total batches created: {batch_num}")
        print(f"Total Knowledge (avg score ≥0.8): {total_knowledge}")
        print(f"Total Partial Knowledge (0.3-0.8): {total_partial}")
        print(f"Total Non-knowledge (score <0.3): {total_non_knowledge}")
        print(f"Total examples processed: {total_knowledge + total_partial + total_non_knowledge}")
        print(f"\n--- Match Type Distribution ---")
        for match_type, count in match_type_stats.most_common():
            print(f"{match_type}: {count}")
        print(f"{'='*60}")

        # Ask if user wants to consolidate batches
        consolidate = input("\nDo you want to consolidate all batches into single files? (y/n): ")
        if consolidate.lower() == 'y':
            self._consolidate_batches(batch_num, self.batch_dir, path_to_save)


    def _save_batch(self, knowledge_dataset, partial_dataset, non_knowledge_dataset, batch_num, batch_dir):
        """Save a single batch of datasets in simple format.

        One JSON file is written per category; each entry is
        [prompt, target, tokens, match_count].

        Returns:
            Dict mapping category name to {"filename", "count", "path"}.
        """
        model_name_safe = self.model_name.replace("/", "_")
        dataset_name_safe = self.dataset_name.replace(" ", "_")

        batch_info = {}

        for name, data in {
            "knowledge": knowledge_dataset,
            "partial_knowledge": partial_dataset,
            "non_knowledge": non_knowledge_dataset
        }.items():
            filename = f"{model_name_safe}_{dataset_name_safe}_{name}_batch_{batch_num:04d}.json"
            path = os.path.join(batch_dir, filename)

            # Convert to simple format: [prompt, target, tokens, match_count]
            simple_data = [
                [
                    item["prompt"],
                    item["target"],
                    item["tokens"],
                    item["num_matches"]  # Number of generations that matched (0-6)
                ]
                for item in data
            ]

            with open(path, "w", encoding='utf-8') as f:
                json.dump(simple_data, f, indent=2, ensure_ascii=False)

            batch_info[name] = {
                "filename": filename,
                "count": len(data),
                "path": path
            }

            print(f"Saved {name} batch {batch_num} to {path} ({len(data)} examples)")

        return batch_info


    @staticmethod
    def _category_stats(consolidated_data) -> Dict:
        """Summarize the match counts (4th element) of a non-empty list of entries.

        Values are converted to built-in int/float because json.dump cannot
        serialize NumPy integer scalars such as the result of np.min.
        """
        match_counts = [item[3] for item in consolidated_data]
        return {
            "count": len(consolidated_data),
            "avg_matches": float(np.mean(match_counts)),
            "min_matches": int(min(match_counts)),
            "max_matches": int(max(match_counts))
        }


    def _consolidate_batches(self, total_batches: int, batch_dir: str, final_dir: str):
        """Consolidate all batch files into single files for each category.

        Reads batches 1..total_batches from batch_dir (missing files are
        skipped), writes one consolidated JSON per category plus a stats
        file into final_dir, then offers to delete batch_dir.
        """
        print("\nConsolidating batches...")

        model_name_safe = self.model_name.replace("/", "_")
        dataset_name_safe = self.dataset_name.replace(" ", "_")

        consolidated_stats = {}

        for category in ["knowledge", "partial_knowledge", "non_knowledge"]:
            consolidated_data = []

            # Read all batch files for this category
            for batch_num in range(1, total_batches + 1):
                batch_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_batch_{batch_num:04d}.json"
                batch_path = os.path.join(batch_dir, batch_filename)

                if os.path.exists(batch_path):
                    with open(batch_path, "r", encoding='utf-8') as f:
                        batch_data = json.load(f)
                        consolidated_data.extend(batch_data)

                    print(f"Loaded {len(batch_data)} examples from batch {batch_num} for {category}")

            # Calculate statistics for this category
            if consolidated_data:
                consolidated_stats[category] = self._category_stats(consolidated_data)

            # Save consolidated file in simple format
            final_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_dataset_consolidated.json"
            final_path = os.path.join(final_dir, final_filename)

            with open(final_path, "w", encoding='utf-8') as f:
                json.dump(consolidated_data, f, indent=2, ensure_ascii=False)

            print(f"Saved consolidated {category} dataset to {final_path}")
            print(f"  Total examples: {len(consolidated_data)}")
            if category in consolidated_stats:
                stats = consolidated_stats[category]
                print(f"  Match stats - Avg: {stats['avg_matches']:.1f}/6, "
                      f"Min: {stats['min_matches']}, Max: {stats['max_matches']}")

        # Save consolidated statistics
        stats_path = os.path.join(final_dir, f"{model_name_safe}_{dataset_name_safe}_consolidated_stats.json")
        with open(stats_path, "w", encoding='utf-8') as f:
            json.dump(consolidated_stats, f, indent=2, ensure_ascii=False)
        print(f"\nSaved consolidated statistics to {stats_path}")

        print("\nConsolidation complete!")

        # Ask if user wants to delete batch files
        delete_batches = input("\nDo you want to delete the individual batch files? (y/n): ")
        if delete_batches.lower() == 'y':
            import shutil
            shutil.rmtree(batch_dir)
            print("Batch files deleted.")


    def analyze_results(self, consolidated_file: str):
        """Analyze a consolidated dataset file to understand matching patterns.

        Prints the distribution of match counts (0-6; other values are
        ignored) and the first five examples of the file.
        """
        print(f"\nAnalyzing {consolidated_file}...")

        with open(consolidated_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not data:
            print("No data found in file.")
            return

        print(f"Total examples: {len(data)}")

        # Analyze match counts distribution
        match_distribution = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

        for item in data:
            # item format: [prompt, target, tokens, match_count]
            match_count = item[3]
            if match_count in match_distribution:
                match_distribution[match_count] += 1

        print("\n--- Match Count Distribution ---")
        for count, freq in sorted(match_distribution.items()):
            percentage = (freq / len(data)) * 100
            print(f"{count}/6 matches: {freq} examples ({percentage:.1f}%)")

        # Show some examples
        print("\n--- Sample Examples ---")
        for i, item in enumerate(data[:5]):
            print(f"\nExample {i+1}:")
            print(f"  Question: {item[0][:80]}...")
            print(f"  Answer: {item[1]}")
            print(f"  Matches: {item[3]}/6")


if __name__ == "__main__":
    # Constructing the object runs the whole pipeline (upload, generation,
    # scoring, batch saving), so the settings are collected first.
    run_config = {
        "path_to_knowledge_dataset": "datasets/gpt4o_mini/",
        "dataset_name": "turkish",
        "model_name": "gpt-4o-mini",
        "batch_size": 2000,  # Save every 2000 examples
    }
    dataset_creator = KnowledgeDatasetGPT(**run_config)

    # Optional: Analyze results after processing
    # dataset_creator.analyze_results("datasets/gpt4o_mini/gpt-4o-mini_turkish_knowledge_dataset_consolidated.json")

Please upload your dataset file (JSON or CSV)...


Saving turkish_dataset.json to turkish_dataset (1).json
Uploaded file: turkish_dataset (1).json
Loaded 15000 records from turkish_dataset (1).json
Example 0: soru: Henry Ford, Magic Johnson ve Berry Gordy han... -> Michigan
Example 1: soru: Ring of Fire' hangi okyanusta bulunuyor?
cev... -> Pasifik Okyanusu
Example 2: soru: Doğal gazın ana bileşeni nedir?
cevap:... -> Metan
Example 3: soru: "Zamanın Müziğine Dans" adlı 12 ciltlik roma... -> Anthony Powell
Example 4: soru: 1999 yılında Avustralya’nın Melbourne kentin... -> Bıyıklar
Processing 15000 examples in batches of 2000...
Using advanced Turkish-aware string matching...


Processing examples:   0%|          | 1/15000 [00:01<6:47:44,  1.63s/it]


Example 0: soru: Henry Ford, Magic Johnson ve Berry Gordy hangi ABD eya...
Target answer: 'Michigan'
Greedy generation: 'Michigan'
Match scores: ['0.95', '0.95', '1.00', '0.95', '0.95', '1.00']
Match types: ['exact', 'exact']...
Average score: 0.967
Category: KNOWLEDGE


Processing examples:   0%|          | 2/15000 [00:03<6:39:34,  1.60s/it]


Example 1: soru: Ring of Fire' hangi okyanusta bulunuyor?
cevap:...
Target answer: 'Pasifik Okyanusu'
Greedy generation: 'cevap: Pasifik Okyanusu'
Match scores: ['0.95', '0.95', '0.95', '0.95', '0.95', '0.95']
Match types: ['exact', 'exact']...
Average score: 0.950
Category: KNOWLEDGE


Processing examples:   0%|          | 3/15000 [00:04<6:39:07,  1.60s/it]


Example 2: soru: Doğal gazın ana bileşeni nedir?
cevap:...
Target answer: 'Metan'
Greedy generation: 'cevap: Metan (CH₄)'
Match scores: ['0.95', '0.95', '0.95', '0.95', '0.95', '0.95']
Match types: ['exact', 'exact']...
Average score: 0.950
Category: KNOWLEDGE


Processing examples:   0%|          | 4/15000 [00:07<7:42:20,  1.85s/it]


Example 3: soru: "Zamanın Müziğine Dans" adlı 12 ciltlik roman serisini...
Target answer: 'Anthony Powell'
Greedy generation: 'cevap: "Zamanın Müziğine Dans" adlı roman serisini yazan yazar'
Match scores: ['0.00', '0.00', '0.00', '0.00', '0.00', '0.00']
Match types: ['no_match', 'no_match']...
Average score: 0.000
Category: NON-KNOWLEDGE


Processing examples:   0%|          | 5/15000 [00:08<7:01:05,  1.68s/it]


Example 4: soru: 1999 yılında Avustralya’nın Melbourne kentinde ortaya ...
Target answer: 'Bıyıklar'
Greedy generation: 'Movember kampanyası, erkeklerin bıyık bırakmalarını içerir'
Match scores: ['0.00', '0.00', '0.00', '0.00', '0.00', '0.00']
Match types: ['no_match', 'no_match']...
Average score: 0.000
Category: NON-KNOWLEDGE


Processing examples:  13%|█▎        | 2000/15000 [50:07<4:46:52,  1.32s/it]

Saved knowledge batch 1 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0001.json (1044 examples)
Saved partial_knowledge batch 1 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0001.json (285 examples)
Saved non_knowledge batch 1 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0001.json (671 examples)

--- Batch 1 Summary ---
Examples in batch: 2000
Knowledge (≥0.8): 1044
Partial (0.3-0.8): 285
Non-knowledge (<0.3): 671
Total processed so far: 2000/15000
Running totals - K: 1044, P: 285, NK: 671


Processing examples:  27%|██▋       | 4000/15000 [1:42:08<4:55:16,  1.61s/it]

Saved knowledge batch 2 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0002.json (1066 examples)
Saved partial_knowledge batch 2 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0002.json (269 examples)
Saved non_knowledge batch 2 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0002.json (665 examples)

--- Batch 2 Summary ---
Examples in batch: 2000
Knowledge (≥0.8): 1066
Partial (0.3-0.8): 269
Non-knowledge (<0.3): 665
Total processed so far: 4000/15000
Running totals - K: 2110, P: 554, NK: 1336


Processing examples:  40%|████      | 6000/15000 [2:45:19<4:07:40,  1.65s/it]

Saved knowledge batch 3 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0003.json (1098 examples)
Saved partial_knowledge batch 3 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0003.json (267 examples)
Saved non_knowledge batch 3 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0003.json (635 examples)

--- Batch 3 Summary ---
Examples in batch: 2000
Knowledge (≥0.8): 1098
Partial (0.3-0.8): 267
Non-knowledge (<0.3): 635
Total processed so far: 6000/15000
Running totals - K: 3208, P: 821, NK: 1971


Processing examples:  53%|█████▎    | 8000/15000 [3:46:04<3:11:34,  1.64s/it]

Saved knowledge batch 4 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0004.json (1022 examples)
Saved partial_knowledge batch 4 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0004.json (283 examples)
Saved non_knowledge batch 4 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0004.json (695 examples)

--- Batch 4 Summary ---
Examples in batch: 2000
Knowledge (≥0.8): 1022
Partial (0.3-0.8): 283
Non-knowledge (<0.3): 695
Total processed so far: 8000/15000
Running totals - K: 4230, P: 1104, NK: 2666


Processing examples:  67%|██████▋   | 10000/15000 [4:47:32<2:11:50,  1.58s/it]

Saved knowledge batch 5 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0005.json (1048 examples)
Saved partial_knowledge batch 5 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0005.json (276 examples)
Saved non_knowledge batch 5 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0005.json (676 examples)

--- Batch 5 Summary ---
Examples in batch: 2000
Knowledge (≥0.8): 1048
Partial (0.3-0.8): 276
Non-knowledge (<0.3): 676
Total processed so far: 10000/15000
Running totals - K: 5278, P: 1380, NK: 3342


Processing examples:  80%|████████  | 12000/15000 [5:47:58<1:30:16,  1.81s/it]

Saved knowledge batch 6 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0006.json (1076 examples)
Saved partial_knowledge batch 6 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0006.json (260 examples)
Saved non_knowledge batch 6 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0006.json (664 examples)

--- Batch 6 Summary ---
Examples in batch: 2000
Knowledge (≥0.8): 1076
Partial (0.3-0.8): 260
Non-knowledge (<0.3): 664
Total processed so far: 12000/15000
Running totals - K: 6354, P: 1640, NK: 4006


Processing examples:  93%|█████████▎| 14000/15000 [6:46:42<25:46,  1.55s/it]

Saved knowledge batch 7 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0007.json (1089 examples)
Saved partial_knowledge batch 7 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0007.json (278 examples)
Saved non_knowledge batch 7 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0007.json (633 examples)

--- Batch 7 Summary ---
Examples in batch: 2000
Knowledge (≥0.8): 1089
Partial (0.3-0.8): 278
Non-knowledge (<0.3): 633
Total processed so far: 14000/15000
Running totals - K: 7443, P: 1918, NK: 4639


Processing examples: 100%|██████████| 15000/15000 [7:12:33<00:00,  1.73s/it]


Saved knowledge batch 8 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_knowledge_batch_0008.json (513 examples)
Saved partial_knowledge batch 8 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_partial_knowledge_batch_0008.json (138 examples)
Saved non_knowledge batch 8 to datasets/gpt4o_mini/batches/gpt-4o-mini_turkish_non_knowledge_batch_0008.json (349 examples)

--- Batch 8 Summary ---
Examples in batch: 1000
Knowledge (≥0.8): 513
Partial (0.3-0.8): 138
Non-knowledge (<0.3): 349
Total processed so far: 15000/15000
Running totals - K: 7956, P: 2056, NK: 4988

Saved batch metadata to datasets/gpt4o_mini/gpt-4o-mini_turkish_metadata.json

=== FINAL RESULTS ===
Total batches created: 8
Total Knowledge (avg score ≥0.8): 7956
Total Partial Knowledge (0.3-0.8): 2056
Total Non-knowledge (score <0.3): 4988
Total examples processed: 15000

--- Match Type Distribution ---
exact: 52603
partial: 3846
fuzzy: 2170
token_overlap_low: 793
token_overlap: 649

Do you want to consolidate all b

TypeError: Object of type int64 is not JSON serializable

In [None]:
#!/usr/bin/env python3
"""
Script to combine multiple JSON batch files into a single JSON file.
Combines files: gpt-4o-mini_turkish_knowledge_batch_0001.json through batch_0008.json
"""

import json
import os
from pathlib import Path
from typing import List, Dict, Any, Union
import logging

# Configure logging: INFO level, one "timestamp - level - message" line per record.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers (often the case inside a notebook kernel), so these settings
# may be ignored there — confirm, and consider force=True if they must apply.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by combine_json_batch_files below.
logger = logging.getLogger(__name__)


def _extract_items(data: Any) -> List[Any]:
    """Return the records contained in one parsed batch payload.

    A top-level list is returned unchanged. A dict that wraps a list under
    the ``"data"`` key (checked first) or the ``"items"`` key yields that
    list. Any other value is treated as a single record.
    """
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        for key in ('data', 'items'):
            if isinstance(data.get(key), list):
                return data[key]
    return [data]


def combine_json_batch_files(
    input_directory: str,
    output_file: str,
    file_prefix: str = "gpt-4o-mini_turkish_knowledge_batch_",
    start_num: int = 1,
    end_num: int = 8
) -> Dict[str, Any]:
    """
    Combine multiple JSON batch files into a single JSON file.

    Batch files are expected to be named ``{file_prefix}{NNNN}.json`` with a
    zero-padded four-digit batch number. Missing or unreadable batch files
    are recorded in the returned statistics and skipped; they do not abort
    the run. The combined list is always written, even when it is empty.

    Args:
        input_directory: Directory containing the batch files
        output_file: Path for the combined output file
        file_prefix: Common prefix for batch files
        start_num: Starting batch number (inclusive)
        end_num: Ending batch number (inclusive)

    Returns:
        Dictionary with statistics about the combination process:
        ``files_processed``, ``total_items``, ``errors``,
        ``missing_files`` and ``output_file``.

    Raises:
        FileNotFoundError: If ``input_directory`` does not exist.
        NotADirectoryError: If ``input_directory`` exists but is not a
            directory.
        Exception: Any error raised while writing ``output_file`` is
            logged and re-raised.
    """
    stats = {
        'files_processed': 0,
        'total_items': 0,
        'errors': [],
        'missing_files': [],
        'output_file': output_file
    }
    combined_data = []
    input_path = Path(input_directory)

    if not input_path.exists():
        error_msg = f"Input directory does not exist: {input_directory}"
        logger.error(error_msg)
        raise FileNotFoundError(error_msg)

    # A regular file passes exists(); without this guard every batch would be
    # reported as "missing" and an empty output file silently written.
    if not input_path.is_dir():
        error_msg = f"Input path is not a directory: {input_directory}"
        logger.error(error_msg)
        raise NotADirectoryError(error_msg)

    for batch_num in range(start_num, end_num + 1):
        filename = f"{file_prefix}{batch_num:04d}.json"
        file_path = input_path / filename

        logger.info("Processing file %s/%s: %s", batch_num, end_num, filename)

        if not file_path.exists():
            logger.warning("File not found: %s", file_path)
            stats['missing_files'].append(str(file_path))
            continue

        # Only reading/parsing can fail; keep the try body to just that.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            error_msg = f"JSON decode error in {file_path}: {e}"
            logger.error(error_msg)
            stats['errors'].append(error_msg)
            continue
        except Exception as e:
            # Best effort by design: one unreadable batch must not stop the
            # remaining batches from being combined.
            error_msg = f"Unexpected error reading {file_path}: {e}"
            logger.error(error_msg)
            stats['errors'].append(error_msg)
            continue

        items = _extract_items(data)
        combined_data.extend(items)

        stats['files_processed'] += 1
        stats['total_items'] += len(items)
        logger.info("  Added %s items from %s", len(items), filename)

    # Write combined data to output file
    try:
        output_path = Path(output_file)

        # Create output directory if it doesn't exist
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # ensure_ascii=False keeps non-ASCII text readable in the output
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(combined_data, f, indent=2, ensure_ascii=False)

        logger.info("Successfully wrote combined data to %s", output_file)

    except Exception as e:
        error_msg = f"Error writing output file: {e}"
        logger.error(error_msg)
        stats['errors'].append(error_msg)
        raise

    # Log summary statistics
    expected_files = end_num - start_num + 1
    separator = "=" * 50
    logger.info(separator)
    logger.info("COMBINATION SUMMARY:")
    logger.info("  Files processed: %s/%s", stats['files_processed'], expected_files)
    logger.info("  Total items combined: %s", stats['total_items'])
    logger.info("  Output file: %s", stats['output_file'])

    if stats['missing_files']:
        logger.warning("  Missing files: %s", len(stats['missing_files']))
        for missing in stats['missing_files']:
            logger.warning("    - %s", missing)

    if stats['errors']:
        logger.error("  Errors encountered: %s", len(stats['errors']))
        for error in stats['errors']:
            logger.error("    - %s", error)

    logger.info(separator)

    return stats


def main() -> int:
    """Combine the knowledge batch files and print a short report.

    Reads batches 1-8 with the ``gpt-4o-mini_turkish_knowledge_batch_``
    prefix from the hard-coded input directory and writes one combined file.

    Returns:
        0 when the combined file was written, 1 when combining raised.
    """

    # Configuration
    INPUT_DIR = "/content/datasets/gpt4o_mini/batches"
    OUTPUT_FILE = "/content/datasets/gpt4o_mini/combined_turkish_knowledge.json"

    try:
        # Run the combination process
        stats = combine_json_batch_files(
            input_directory=INPUT_DIR,
            output_file=OUTPUT_FILE,
            file_prefix="gpt-4o-mini_turkish_knowledge_batch_",
            start_num=1,
            end_num=8
        )

    except Exception as e:
        # Top-level boundary: report any failure and signal it via the
        # return code instead of a traceback.
        print(f"\n❌ Failed to combine files: {e}")
        return 1

    else:
        # Status emoji were stored as mis-decoded UTF-8 bytes; restored here.
        print(f"\n✅ Successfully combined {stats['files_processed']} files")
        print(f"📊 Total items in combined file: {stats['total_items']}")
        print(f"📁 Output saved to: {stats['output_file']}")
        return 0


# Script entry point. `exit` is an interactive helper injected by the `site`
# module and is not guaranteed to exist; raising SystemExit with main()'s
# return code needs no extra import.
if __name__ == "__main__":
    raise SystemExit(main())


✅ Successfully combined 8 files
📊 Total items in combined file: 7956
📁 Output saved to: /content/datasets/gpt4o_mini/combined_turkish_knowledge.json


In [None]:
#!/usr/bin/env python3
"""
Script to combine multiple JSON batch files into a single JSON file.
Combines files: gpt-4o-mini_turkish_knowledge_batch_0001.json through batch_0008.json
"""

import json
import os
from pathlib import Path
from typing import List, Dict, Any, Union
import logging

# Configure logging: INFO level, one "timestamp - level - message" line per record.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers (often the case inside a notebook kernel), so these settings
# may be ignored there — confirm, and consider force=True if they must apply.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by combine_json_batch_files below.
logger = logging.getLogger(__name__)


def _extract_items(data: Any) -> List[Any]:
    """Return the records contained in one parsed batch payload.

    A top-level list is returned unchanged. A dict that wraps a list under
    the ``"data"`` key (checked first) or the ``"items"`` key yields that
    list. Any other value is treated as a single record.
    """
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        for key in ('data', 'items'):
            if isinstance(data.get(key), list):
                return data[key]
    return [data]


def combine_json_batch_files(
    input_directory: str,
    output_file: str,
    file_prefix: str = "gpt-4o-mini_turkish_non_knowledge_batch_",
    start_num: int = 1,
    end_num: int = 8
) -> Dict[str, Any]:
    """
    Combine multiple JSON batch files into a single JSON file.

    Batch files are expected to be named ``{file_prefix}{NNNN}.json`` with a
    zero-padded four-digit batch number. Missing or unreadable batch files
    are recorded in the returned statistics and skipped; they do not abort
    the run. The combined list is always written, even when it is empty.

    Args:
        input_directory: Directory containing the batch files
        output_file: Path for the combined output file
        file_prefix: Common prefix for batch files
        start_num: Starting batch number (inclusive)
        end_num: Ending batch number (inclusive)

    Returns:
        Dictionary with statistics about the combination process:
        ``files_processed``, ``total_items``, ``errors``,
        ``missing_files`` and ``output_file``.

    Raises:
        FileNotFoundError: If ``input_directory`` does not exist.
        NotADirectoryError: If ``input_directory`` exists but is not a
            directory.
        Exception: Any error raised while writing ``output_file`` is
            logged and re-raised.
    """
    stats = {
        'files_processed': 0,
        'total_items': 0,
        'errors': [],
        'missing_files': [],
        'output_file': output_file
    }
    combined_data = []
    input_path = Path(input_directory)

    if not input_path.exists():
        error_msg = f"Input directory does not exist: {input_directory}"
        logger.error(error_msg)
        raise FileNotFoundError(error_msg)

    # A regular file passes exists(); without this guard every batch would be
    # reported as "missing" and an empty output file silently written.
    if not input_path.is_dir():
        error_msg = f"Input path is not a directory: {input_directory}"
        logger.error(error_msg)
        raise NotADirectoryError(error_msg)

    for batch_num in range(start_num, end_num + 1):
        filename = f"{file_prefix}{batch_num:04d}.json"
        file_path = input_path / filename

        logger.info("Processing file %s/%s: %s", batch_num, end_num, filename)

        if not file_path.exists():
            logger.warning("File not found: %s", file_path)
            stats['missing_files'].append(str(file_path))
            continue

        # Only reading/parsing can fail; keep the try body to just that.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            error_msg = f"JSON decode error in {file_path}: {e}"
            logger.error(error_msg)
            stats['errors'].append(error_msg)
            continue
        except Exception as e:
            # Best effort by design: one unreadable batch must not stop the
            # remaining batches from being combined.
            error_msg = f"Unexpected error reading {file_path}: {e}"
            logger.error(error_msg)
            stats['errors'].append(error_msg)
            continue

        items = _extract_items(data)
        combined_data.extend(items)

        stats['files_processed'] += 1
        stats['total_items'] += len(items)
        logger.info("  Added %s items from %s", len(items), filename)

    # Write combined data to output file
    try:
        output_path = Path(output_file)

        # Create output directory if it doesn't exist
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # ensure_ascii=False keeps non-ASCII text readable in the output
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(combined_data, f, indent=2, ensure_ascii=False)

        logger.info("Successfully wrote combined data to %s", output_file)

    except Exception as e:
        error_msg = f"Error writing output file: {e}"
        logger.error(error_msg)
        stats['errors'].append(error_msg)
        raise

    # Log summary statistics
    expected_files = end_num - start_num + 1
    separator = "=" * 50
    logger.info(separator)
    logger.info("COMBINATION SUMMARY:")
    logger.info("  Files processed: %s/%s", stats['files_processed'], expected_files)
    logger.info("  Total items combined: %s", stats['total_items'])
    logger.info("  Output file: %s", stats['output_file'])

    if stats['missing_files']:
        logger.warning("  Missing files: %s", len(stats['missing_files']))
        for missing in stats['missing_files']:
            logger.warning("    - %s", missing)

    if stats['errors']:
        logger.error("  Errors encountered: %s", len(stats['errors']))
        for error in stats['errors']:
            logger.error("    - %s", error)

    logger.info(separator)

    return stats


def main() -> int:
    """Combine the non-knowledge batch files and print a short report.

    Reads batches 1-8 with the ``gpt-4o-mini_turkish_non_knowledge_batch_``
    prefix from the hard-coded input directory and writes one combined file.

    Returns:
        0 when the combined file was written, 1 when combining raised.
    """

    # Configuration
    INPUT_DIR = "/content/datasets/gpt4o_mini/batches"
    OUTPUT_FILE = "/content/datasets/gpt4o_mini/combined_turkish_non_knowledge.json"

    try:
        # Run the combination process
        stats = combine_json_batch_files(
            input_directory=INPUT_DIR,
            output_file=OUTPUT_FILE,
            file_prefix="gpt-4o-mini_turkish_non_knowledge_batch_",
            start_num=1,
            end_num=8
        )

    except Exception as e:
        # Top-level boundary: report any failure and signal it via the
        # return code instead of a traceback.
        print(f"\n❌ Failed to combine files: {e}")
        return 1

    else:
        # Status emoji were stored as mis-decoded UTF-8 bytes; restored here.
        print(f"\n✅ Successfully combined {stats['files_processed']} files")
        print(f"📊 Total items in combined file: {stats['total_items']}")
        print(f"📁 Output saved to: {stats['output_file']}")
        return 0


# Script entry point. `exit` is an interactive helper injected by the `site`
# module and is not guaranteed to exist; raising SystemExit with main()'s
# return code needs no extra import.
if __name__ == "__main__":
    raise SystemExit(main())


✅ Successfully combined 8 files
📊 Total items in combined file: 4988
📁 Output saved to: /content/datasets/gpt4o_mini/combined_turkish_non_knowledge.json
