In [None]:
#!/usr/bin/env python3
"""
Simple Usage Example for CSV to JSON Converter
"""

import pandas as pd
import json

# Simple, direct approach
def simple_csv_to_json(csv_file, json_file, max_rows=15000):
    """
    Convert the leading rows of a CSV file into a JSON array of objects.

    The whole CSV is read with pandas, only the first ``max_rows`` rows are
    kept, and the result is written to ``json_file`` in ``records``
    orientation with a two-space indent. A one-line summary is printed.
    """
    # Everything from row ``max_rows`` onwards is dropped.
    leading_rows = slice(None, max_rows)
    frame = pd.read_csv(csv_file).iloc[leading_rows]

    # One JSON object per CSV row.
    frame.to_json(json_file, orient='records', indent=2)

    row_count = len(frame)
    print(f"Converted {row_count} rows from {csv_file} to {json_file}")



def oneliner_csv_to_json(csv_file, json_file, max_rows=15000):
    """Ultra-compact CSV-to-JSON conversion.

    Reads ``csv_file``, keeps the first ``max_rows`` rows (default 15000,
    the value that used to be hard-coded here) and writes them to
    ``json_file`` as a JSON array of objects with a two-space indent.
    """
    pd.read_csv(csv_file).iloc[:max_rows].to_json(json_file, orient='records', indent=2)

if __name__ == "__main__":
    # Example 1: Simple usage - convert turkish.csv into turkish_dataset.json.
    # No max_rows is passed, so simple_csv_to_json applies its default row cap.
    simple_csv_to_json('turkish.csv', 'turkish_dataset.json')


Converted 15000 rows from turkish.csv to turkish_dataset.json


In [None]:
# --- Imports ---
import json
import random
import time
from openai import OpenAI  # Use OpenAI SDK for DeepSeek
from typing import List, Tuple, Dict, Optional, Set
import numpy as np
from tqdm import tqdm
import os
import re
import unicodedata
from difflib import SequenceMatcher
from collections import Counter
import sys

# --- Transformers import for DeepSeek tokenizer ---
try:
    from transformers import AutoTokenizer
except ImportError:
    print("Error: transformers library not installed.")
    print("Please install it using: pip install transformers")
    sys.exit(1)

# --- Colab-specific imports ---
try:
    from google.colab import userdata, files
    import io
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
    print("Note: Not running in Google Colab. File upload functionality will be limited.")


class TurkishTextMatcher:
    """Advanced Turkish text matching with a multi-tier strategy.

    Tiers, from strongest to weakest: exact/containment match (1.0 / 0.95),
    token overlap (up to 0.9), fuzzy similarity (up to 0.6) and partial
    credit (up to 0.4). Every comparison runs on text produced by
    ``normalize_text`` so that case, punctuation and a few abbreviations do
    not affect the result.
    """

    def __init__(self):
        # Turkish-specific upper -> lower pairs. These must be applied before
        # str.lower(), which maps 'I' to 'i' and 'İ' to 'i' + combining dot.
        self.turkish_lower_map = {
            'İ': 'i', 'I': 'ı', 'Ğ': 'ğ', 'Ü': 'ü',
            'Ş': 'ş', 'Ö': 'ö', 'Ç': 'ç'
        }

        # Common stop words to ignore in token matching (Turkish and English)
        self.stop_words = {
            # Turkish
            've', 'veya', 'ile', 'için', 'bir', 'bu', 'da', 'de',
            'mi', 'mu', 'mı', 'mü', 'ki', 'ne', 'ya', 'ama', 'fakat',
            'çünkü', 'gibi', 'kadar', 'hem', 'daha', 'çok', 'en',
            # English
            'the', 'of', 'and', 'a', 'an', 'in', 'on', 'at', 'to',
            'for', 'with', 'by', 'from', 'as', 'or', 'but', 'is', 'was'
        }

        # Common abbreviations and the expansion they are normalized to
        self.abbreviations = {
            'dr': 'doktor',
            'prof': 'profesör',
            'st': 'saint',
            'abd': 'amerika birleşik devletleri',
            'usa': 'amerika birleşik devletleri',
            'uk': 'birleşik krallık',
            'eu': 'avrupa birliği'
        }

    def turkish_lower(self, text: str) -> str:
        """Lowercase ``text`` using Turkish casing rules (I -> ı, İ -> i)."""
        # First apply Turkish-specific mappings
        for upper, lower in self.turkish_lower_map.items():
            text = text.replace(upper, lower)
        # Then apply standard lowercase for everything else
        return text.lower()

    def normalize_text(self, text: str) -> str:
        """Normalize text for comparison.

        Lowercases with Turkish rules, replaces punctuation with spaces,
        collapses whitespace and expands the known abbreviations.
        """
        text = self.turkish_lower(text)

        # Punctuation becomes a space so "Paris'tir" splits into two words
        text = re.sub(r'[^\w\s]', ' ', text, flags=re.UNICODE)

        # split() also collapses runs of whitespace
        words = [self.abbreviations.get(w, w) for w in text.split()]
        return ' '.join(words).strip()

    def extract_tokens(self, text: str, remove_stopwords: bool = True) -> List[str]:
        """Return the normalized words of ``text``, optionally without stop words."""
        tokens = self.normalize_text(text).split()

        if remove_stopwords:
            tokens = [t for t in tokens if t not in self.stop_words]

        return tokens

    def exact_match(self, answer: str, generation: str) -> float:
        """Return 1.0 for an exact normalized match, 0.95 for containment, else 0.0."""
        norm_answer = self.normalize_text(answer)
        norm_generation = self.normalize_text(generation)

        # An empty answer is a substring of every string; without this guard
        # it would score 0.95 against any generation.
        if not norm_answer:
            return 0.0

        if norm_answer == norm_generation:
            return 1.0
        if norm_answer in norm_generation:
            # Penalize slightly if answer is just contained (not exact)
            return 0.95

        return 0.0

    def token_overlap_match(self, answer: str, generation: str) -> float:
        """Score the share of answer tokens that occur in the generation.

        Stop words are removed from the answer only. Full coverage scores
        0.9 when the tokens appear in the same order and 0.8 otherwise;
        partial coverage scores ``coverage * 0.8``.
        """
        answer_list = self.extract_tokens(answer, remove_stopwords=True)
        gen_list = self.extract_tokens(generation, remove_stopwords=False)

        answer_tokens = set(answer_list)
        if not answer_tokens:
            return 0.0

        coverage = len(answer_tokens & set(gen_list)) / len(answer_tokens)

        if coverage == 1.0:
            # Full coverage guarantees every answer token is in gen_list,
            # so index() cannot raise here.
            indices = [gen_list.index(token) for token in answer_list]
            return 0.9 if indices == sorted(indices) else 0.8

        return coverage * 0.8  # Partial overlap

    def fuzzy_match(self, answer: str, generation: str, threshold: float = 0.85) -> float:
        """Fuzzy match for short answers using ``SequenceMatcher`` similarity.

        Only answers of at most 20 normalized characters are considered.
        Returns ``similarity * 0.6`` when the whole generation, or any window
        of one to five consecutive words, reaches ``threshold``; else 0.0.
        """
        norm_answer = self.normalize_text(answer)
        norm_generation = self.normalize_text(generation)

        # Empty answers would be "identical" to an empty generation.
        if not norm_answer or len(norm_answer) > 20:
            return 0.0

        similarity = SequenceMatcher(None, norm_answer, norm_generation).ratio()
        if similarity >= threshold:
            return similarity * 0.6  # Scale to max 0.6 for fuzzy matches

        # Also check whether the answer appears as a near-match of a short
        # run of words inside the generation.
        words_in_gen = norm_generation.split()
        for i in range(len(words_in_gen)):
            for j in range(i + 1, min(i + 6, len(words_in_gen) + 1)):
                window = ' '.join(words_in_gen[i:j])
                ratio = SequenceMatcher(None, norm_answer, window).ratio()
                if ratio >= threshold:
                    return ratio * 0.6

        return 0.0

    def partial_credit_match(self, answer: str, generation: str) -> float:
        """Give up to 0.4 credit when important parts of the answer appear.

        Applies to answers of two or more words. Important parts are the
        words capitalized in the original answer, or, if there are none, all
        non-stop-word tokens. Parts are normalized the same way as the
        generation so attached punctuation does not prevent a match.
        """
        answer_tokens = self.extract_tokens(answer, remove_stopwords=False)

        # Single-word answers get no partial credit
        if len(answer_tokens) < 2:
            return 0.0

        # Capitalized words are treated as likely names/places
        important_parts = []
        for word in answer.split():
            if word[0].isupper():
                part = self.normalize_text(word)
                if part:
                    important_parts.append(part)

        if not important_parts:
            # If no capitalized words, consider all non-stopwords important
            important_parts = [t for t in answer_tokens if t not in self.stop_words]

        if not important_parts:
            return 0.0

        norm_generation = self.normalize_text(generation)
        matches = sum(1 for part in important_parts if part in norm_generation)
        return (matches / len(important_parts)) * 0.4

    def calculate_match_score(self, answer: str, generation: str) -> Tuple[float, str]:
        """
        Calculate overall match score using multi-tier strategy.

        The first tier that fires wins, in this order: exact, token overlap
        (score >= 0.8), fuzzy, partial, then any remaining weak token overlap.

        Returns: (score, match_type)
        """
        # Tier 1: Exact match
        exact_score = self.exact_match(answer, generation)
        if exact_score > 0:
            return exact_score, "exact"

        # Tier 2: Token overlap
        token_score = self.token_overlap_match(answer, generation)
        if token_score >= 0.8:
            return token_score, "token_overlap"

        # Tier 3: Fuzzy match
        fuzzy_score = self.fuzzy_match(answer, generation)
        if fuzzy_score > 0:
            return fuzzy_score, "fuzzy"

        # Tier 4: Partial credit
        partial_score = self.partial_credit_match(answer, generation)
        if partial_score > 0:
            return partial_score, "partial"

        # Fuzzy and partial are both zero here, so a weak token overlap is
        # the only score that can still be non-zero.
        if token_score > 0:
            return token_score, "token_overlap_low"

        return 0.0, "no_match"


class KnowledgeDatasetDeepSeek:
    def __init__(
        self,
        path_to_knowledge_dataset: str = "datasets/",
        dataset_name: str = "turkish",
        model_name: str = "deepseek-chat",  # DeepSeek model name for API
        batch_size: int = 2000,
        tokenizer_model: str = "deepseek-ai/DeepSeek-V3"  # DeepSeek V3 tokenizer
    ):
        """
        Initialize the knowledge dataset creator for DeepSeek V3 with advanced Turkish matching.

        Construction runs the whole pipeline: it reads the API key, builds the
        API client, loads the tokenizer, loads the input dataset via
        ``load_manual_dataset`` and then calls ``create_knowledge_dataset``.

        Args:
            path_to_knowledge_dataset: Path to save the datasets
            dataset_name: Name of the dataset (default: "turkish")
            model_name: DeepSeek model name for API (deepseek-chat or deepseek-reasoner)
            batch_size: Number of examples to process before saving a batch (default: 2000)
            tokenizer_model: HuggingFace model ID for tokenizer (default: DeepSeek V3)

        Raises:
            ValueError: If no API key is found (Colab userdata key 'deepseek',
                or the DEEPSEEK_API_KEY environment variable elsewhere).
            Exception: Any error raised while loading the tokenizer is
                re-raised after troubleshooting hints are printed.
        """
        print("Initializing Turkish Knowledge Dataset Creator with DeepSeek V3...")
        print(f"Model: {model_name}")
        print(f"Tokenizer: {tokenizer_model}")
        print(f"Batch size: {batch_size}")

        # --- Get API key securely ---
        if IN_COLAB:
            api_key = userdata.get('deepseek')
            if not api_key:
                raise ValueError("API key not found in Colab userdata. Please set it with userdata.set('deepseek', 'your_key').")
        else:
            # For non-Colab environments, try environment variable
            api_key = os.environ.get('DEEPSEEK_API_KEY')
            if not api_key:
                raise ValueError("DEEPSEEK_API_KEY environment variable not set.")

        # Set seeds for reproducibility (affects the global random / numpy state)
        random.seed(42)
        np.random.seed(42)

        # Initialize OpenAI client with DeepSeek base URL
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.deepseek.com"  # DeepSeek API endpoint
        )
        self.model_name = model_name
        self.batch_size = batch_size

        # Initialize DeepSeek V3 tokenizer
        print(f"\nLoading DeepSeek V3 tokenizer from {tokenizer_model}...")
        print("This may take a moment on first run as it downloads the tokenizer files...")

        try:
            # NOTE(review): trust_remote_code=True executes Python code shipped
            # with the tokenizer repository; only use with trusted model IDs.
            self.tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_model,
                trust_remote_code=True,  # DeepSeek may use custom tokenizer code
                cache_dir=os.path.join(path_to_knowledge_dataset, ".tokenizer_cache")  # Cache tokenizer files
            )
            print("✓ Tokenizer loaded successfully!")
            print(f"  Vocabulary size: {self.tokenizer.vocab_size}")
            print(f"  Model max length: {self.tokenizer.model_max_length}")

            # Smoke-test the tokenizer on text containing Turkish-specific letters
            test_text = "Merhaba dünya! İstanbul'dan selamlar."
            test_tokens = self.tokenizer.encode(test_text, add_special_tokens=False)
            print(f"  Turkish test: '{test_text}' -> {len(test_tokens)} tokens")

        except Exception as e:
            print(f"\n❌ Error loading DeepSeek V3 tokenizer: {e}")
            print("\nTroubleshooting:")
            print("1. Ensure you have internet connection for first-time download")
            print("2. Try installing/updating transformers: pip install --upgrade transformers")
            print("3. If behind proxy, configure proxy settings")
            print(f"4. Check if model '{tokenizer_model}' exists on HuggingFace")
            raise

        # Initialize Turkish text matcher
        self.matcher = TurkishTextMatcher()
        print("\n✓ Turkish text matcher initialized with multi-tier matching strategy")

        self.dataset_name = dataset_name

        # Create directory if it doesn't exist
        os.makedirs(path_to_knowledge_dataset, exist_ok=True)

        # Create batch directory
        self.batch_dir = os.path.join(path_to_knowledge_dataset, "batches")
        os.makedirs(self.batch_dir, exist_ok=True)

        # Load initial dataset
        initial_dataset = self.load_manual_dataset()

        # Create knowledge dataset with batch processing
        self.create_knowledge_dataset(initial_dataset, path_to_knowledge_dataset)

    def load_manual_dataset(self) -> List[Tuple]:
        """
        Load the question/answer dataset from a JSON or CSV file.

        In Colab the user is prompted to upload a file. Otherwise the current
        directory is searched for a file whose name contains 'turkish',
        preferring JSON over CSV and picking the alphabetically first match
        so the choice is reproducible.

        Each row should contain at least 'soru' and 'cevap'. Rows that are
        not mappings, lack either field, or have a missing (None/NaN) or
        empty value are skipped with a message.

        Returns:
            A list of ``[prompt, cevap, cevap_tokens]`` entries, where
            ``prompt`` is ``"soru: <question>\\ncevap:"`` and ``cevap_tokens``
            is the result of ``self.tokenize(cevap)``.

        Raises:
            ValueError: If no file is uploaded/found, the format is
                unsupported, or the file does not hold a list of records.
        """
        if IN_COLAB:
            print("Please upload your Turkish dataset file (JSON or CSV)...")
            uploaded = files.upload()

            if not uploaded:
                raise ValueError("No file uploaded. Please upload your Turkish dataset.")

            file_name = next(iter(uploaded))
            print(f"Uploaded file: {file_name}")

            payload = io.BytesIO(uploaded[file_name])
            lowered_name = file_name.lower()
            # --- Parse JSON ---
            if lowered_name.endswith(".json"):
                data = json.load(payload)
            # --- Parse CSV ---
            elif lowered_name.endswith(".csv"):
                import pandas as pd
                data = pd.read_csv(payload).to_dict(orient="records")
            else:
                raise ValueError("Unsupported file format. Please upload a JSON or CSV file.")
        else:
            # Non-Colab: Look for local file
            print("Looking for Turkish dataset file in current directory...")
            # os.listdir order is arbitrary; sort so the same file is chosen every run
            candidates = sorted(f for f in os.listdir('.') if 'turkish' in f.lower())
            json_files = [f for f in candidates if f.lower().endswith('.json')]
            csv_files = [f for f in candidates if f.lower().endswith('.csv')]

            if json_files:
                file_name = json_files[0]
                print(f"Found JSON file: {file_name}")
                with open(file_name, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            elif csv_files:
                file_name = csv_files[0]
                print(f"Found CSV file: {file_name}")
                import pandas as pd
                data = pd.read_csv(file_name, encoding='utf-8').to_dict(orient="records")
            else:
                raise ValueError("No Turkish dataset file found. Please place a .json or .csv file in the current directory.")

        if not isinstance(data, list):
            raise ValueError(f"Expected a list of records in {file_name}, got {type(data).__name__}.")

        print(f"Loaded {len(data)} records from {file_name}")

        def is_missing(value) -> bool:
            # None (JSON null) or NaN (empty CSV cell; NaN != NaN)
            return value is None or (isinstance(value, float) and value != value)

        dataset = []
        token_stats = []

        for i, row in enumerate(data):
            if not isinstance(row, dict) or "soru" not in row or "cevap" not in row:
                print(f"Skipping row {i}: missing 'soru' or 'cevap' field")
                continue

            if is_missing(row["soru"]) or is_missing(row["cevap"]):
                # str(nan) / str(None) would otherwise become the literal answer
                print(f"Skipping row {i}: empty 'soru' or 'cevap' value")
                continue

            cevap = str(row["cevap"]).strip()
            if not cevap:
                print(f"Skipping row {i}: empty 'soru' or 'cevap' value")
                continue

            prompt = f"soru: {row['soru']}\ncevap:"
            cevap_tokens = self.tokenize(cevap)
            dataset.append([prompt, cevap, cevap_tokens])
            token_stats.append(len(cevap_tokens))

            if i < 5:
                print(f"Example {i}: {prompt[:50]}... -> {cevap} ({len(cevap_tokens)} tokens)")

        # Print tokenization statistics
        if token_stats:
            print("\nTokenization Statistics (DeepSeek V3):")
            print(f"  Average tokens per answer: {np.mean(token_stats):.2f}")
            print(f"  Min tokens: {np.min(token_stats)}")
            print(f"  Max tokens: {np.max(token_stats)}")
            print(f"  Median tokens: {np.median(token_stats):.2f}")

        return dataset

    def tokenize(self, text: str) -> List[int]:
        """
        Tokenize text using DeepSeek V3's tokenizer

        Args:
            text: Input text to tokenize (supports Turkish)

        Returns:
            List of token IDs
        """
        try:
            # Use DeepSeek's tokenizer without special tokens for accurate count
            tokens = self.tokenizer.encode(
                text,
                add_special_tokens=False,
                truncation=False,  # Don't truncate to get full token count
                return_tensors=None  # Return as list
            )
            return tokens
        except Exception as e:
            print(f"Warning: Tokenization failed for text: {text[:50]}... Error: {e}")
            # Return empty list if tokenization fails
            return []

    def generate_with_temperature(self, prompt: str, temperature: float = 0.5, n: int = 5) -> List[str]:
        """Generate n completions with specified temperature using DeepSeek API"""
        completions = []

        # DeepSeek may not support n parameter directly, so we make multiple calls
        for attempt in range(n):
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=20,
                    temperature=temperature,
                    stop=["\n", ".", "?"]
                )
                # Extract just the answer part
                full_response = response.choices[0].message.content.strip()
                if "cevap:" in full_response:
                    answer = full_response.split("cevap:")[-1].strip()
                else:
                    answer = full_response
                completions.append(answer)
            except Exception as e:
                print(f"Error during generation (attempt {attempt + 1}/{n}): {e}")
                time.sleep(2)
                completions.append("")

        return completions

    def generate_greedy(self, prompt: str) -> str:
        """Generate with greedy decoding (temperature=0)"""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=20,
                temperature=0,
                stop=["\n", ".", "?"]
            )
            # Extract just the answer part
            full_response = response.choices[0].message.content.strip()
            if "cevap:" in full_response:
                return full_response.split("cevap:")[-1].strip()
            return full_response
        except Exception as e:
            print(f"Error during greedy generation: {e}")
            time.sleep(2)
            return ""

    def create_knowledge_dataset(self, initial_dataset: List[Tuple], path_to_save: str, start_batch: int = 8):
        """Create knowledge, non-knowledge, and partial knowledge datasets with advanced Turkish matching.

        For every example a few-shot prompt is built from three randomly
        chosen demonstrations, five sampled completions plus one greedy
        completion are generated, and each is scored against the target with
        ``self.matcher``. The average score decides the category:
        knowledge (>= 0.8), partial (>= 0.3) or non-knowledge. Examples are
        written to disk in batches of ``self.batch_size`` via ``_save_batch``
        and a metadata JSON file is rewritten after every batch.

        Args:
            initial_dataset: ``[prompt, answer, answer_tokens]`` entries.
            path_to_save: Directory for the metadata file and, after
                consolidation, the final files.
            start_batch: 1-based batch number to resume from; the first
                ``(start_batch - 1) * self.batch_size`` examples are skipped.
                The default of 8 preserves the value that was previously
                hard-coded here; pass 1 to process the whole dataset.

        Raises:
            ValueError: If ``start_batch`` is smaller than 1.
        """
        if start_batch < 1:
            raise ValueError("start_batch must be >= 1")

        def pct(part: int, whole: int) -> str:
            # One-decimal percentage; 0 when nothing has been processed
            return '%.1f' % (100 * part / whole if whole > 0 else 0)

        # Batch tracking
        batch_num = 0
        examples_in_batch = 0

        # Current batch data - three categories
        knowledge_dataset = []
        partial_knowledge_dataset = []
        non_knowledge_dataset = []

        # Overall statistics
        total_knowledge = 0
        total_partial = 0
        total_non_knowledge = 0

        # Match type statistics
        match_type_stats = Counter()

        # Metadata for tracking batches
        batch_metadata = {
            "model": self.model_name,
            "tokenizer": "DeepSeek-V3",
            "dataset_name": self.dataset_name,
            "batch_size": self.batch_size,
            "matching_strategy": "multi-tier Turkish-aware",
            "tokenizer_vocab_size": self.tokenizer.vocab_size,
            "batches": []
        }
        metadata_path = os.path.join(path_to_save, f"{self.model_name.replace('/', '_')}_{self.dataset_name}_metadata.json")

        # Demonstrations prepended to each prompt (three are sampled per example)
        few_shot_examples = [
            "soru: Fransa'nın başkenti neresidir?\ncevap: Paris\n",
            "soru: Romeo ve Juliet'i kim yazdı?\ncevap: William Shakespeare\n",
            "soru: 64'ün karekökü nedir?\ncevap: 8\n",
            "soru: Kimyasal sembolü H olan element hangisidir?\ncevap: Hidrojen\n",
            "soru: Japonya'nın para birimi nedir?\ncevap: Japon Yeni\n"
        ]

        print(f"\n{'='*60}")
        print(f"Processing {len(initial_dataset)} examples in batches of {self.batch_size}...")
        print("Using advanced Turkish-aware string matching...")
        print(f"Using DeepSeek model: {self.model_name}")
        print(f"Using tokenizer: DeepSeek V3 (vocab size: {self.tokenizer.vocab_size})")
        print(f"{'='*60}\n")

        # Allow resuming from a specific batch
        start_index = (start_batch - 1) * self.batch_size

        if start_batch > 1:
            print(f"⚠️  Resuming from batch {start_batch} (index {start_index})")
            initial_dataset = initial_dataset[start_index:]
            batch_num = start_batch - 1

        for idx, point in enumerate(tqdm(initial_dataset, desc="Processing examples")):
            prompt, target_cevap, cevap_tokens = point

            # Generate few-shot prompt
            few_shot_prompt = "".join(random.sample(few_shot_examples, 3))
            full_prompt = few_shot_prompt + prompt

            # Generate completions
            temp_generations = self.generate_with_temperature(full_prompt, temperature=0.5, n=5)
            greedy_generation = self.generate_greedy(full_prompt)

            # Calculate match scores for all generations (greedy is last)
            all_generations = temp_generations + [greedy_generation]
            scores = []
            match_types = []

            for gen in all_generations:
                score, match_type = self.matcher.calculate_match_score(target_cevap, gen)
                scores.append(score)
                match_types.append(match_type)
                if score > 0:
                    match_type_stats[match_type] += 1

            avg_score = np.mean(scores)
            max_score = np.max(scores)
            num_matches = sum(1 for s in scores if s > 0)

            example_data = {
                "prompt": prompt,
                "target": target_cevap,
                "tokens": cevap_tokens,
                "avg_score": avg_score,
                "max_score": max_score,
                "num_matches": num_matches,
                "scores": scores,
                "match_types": match_types,
                "generations": all_generations[:3]  # Save first 3 generations for analysis
            }

            # Classify based on average score
            if avg_score >= 0.8:
                knowledge_dataset.append(example_data)
                category = "KNOWLEDGE"
            elif avg_score >= 0.3:
                partial_knowledge_dataset.append(example_data)
                category = "PARTIAL"
            else:
                non_knowledge_dataset.append(example_data)
                category = "NON-KNOWLEDGE"

            examples_in_batch += 1

            # Print detailed info for first few examples
            if idx < 5:
                print(f"\n{'='*60}")
                print(f"Example {idx}: {prompt[:60]}...")
                print(f"Target answer: '{target_cevap}' ({len(cevap_tokens)} tokens)")
                print(f"Greedy generation: '{greedy_generation}'")
                print(f"Match scores: {[f'{s:.2f}' for s in scores]}")
                print(f"Match types: {match_types[:2]}...")
                print(f"Average score: {avg_score:.3f}")
                print(f"Category: {category}")
                print(f"{'='*60}")

            # Save when the batch is full or the last example has been processed
            if examples_in_batch >= self.batch_size or idx == len(initial_dataset) - 1:
                batch_num += 1

                # Save current batch
                self._save_batch(
                    knowledge_dataset,
                    partial_knowledge_dataset,
                    non_knowledge_dataset,
                    batch_num,
                    self.batch_dir
                )

                # Update totals
                total_knowledge += len(knowledge_dataset)
                total_partial += len(partial_knowledge_dataset)
                total_non_knowledge += len(non_knowledge_dataset)

                # Add batch info to metadata
                batch_metadata["batches"].append({
                    "batch_number": batch_num,
                    "examples_processed": examples_in_batch,
                    "knowledge_count": len(knowledge_dataset),
                    "partial_count": len(partial_knowledge_dataset),
                    "non_knowledge_count": len(non_knowledge_dataset),
                    "total_processed": idx + 1 + start_index,
                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
                })

                print(f"\n{'─'*50}")
                print(f"📦 Batch {batch_num} Summary")
                print(f"{'─'*50}")
                print(f"Examples in batch: {examples_in_batch}")
                print(f"Knowledge (≥0.8): {len(knowledge_dataset)} ({pct(len(knowledge_dataset), examples_in_batch)}%)")
                print(f"Partial (0.3-0.8): {len(partial_knowledge_dataset)} ({pct(len(partial_knowledge_dataset), examples_in_batch)}%)")
                print(f"Non-knowledge (<0.3): {len(non_knowledge_dataset)} ({pct(len(non_knowledge_dataset), examples_in_batch)}%)")
                print(f"Total processed: {idx + 1 + start_index}/{len(initial_dataset) + start_index}")
                print(f"Running totals - K: {total_knowledge}, P: {total_partial}, NK: {total_non_knowledge}")
                print(f"{'─'*50}\n")

                # Clear current batch data to free memory
                knowledge_dataset = []
                partial_knowledge_dataset = []
                non_knowledge_dataset = []
                examples_in_batch = 0

                # Save metadata after each batch so an interrupted run keeps its record
                with open(metadata_path, "w", encoding='utf-8') as f:
                    json.dump(batch_metadata, f, indent=2, ensure_ascii=False)

            # Rate limiting to avoid API throttling
            time.sleep(0.1)

        # Add match type statistics to metadata
        batch_metadata["match_type_statistics"] = dict(match_type_stats)

        # Final save of metadata
        with open(metadata_path, "w", encoding='utf-8') as f:
            json.dump(batch_metadata, f, indent=2, ensure_ascii=False)
        print(f"\n✓ Saved batch metadata to {metadata_path}")

        # Totals cover only the examples processed in this run
        total_processed = total_knowledge + total_partial + total_non_knowledge

        # Final summary
        print(f"\n{'='*60}")
        print("🎉 FINAL RESULTS")
        print(f"{'='*60}")
        print(f"Total batches created: {batch_num}")
        print(f"Total Knowledge (avg score ≥0.8): {total_knowledge} ({pct(total_knowledge, total_processed)}%)")
        print(f"Total Partial Knowledge (0.3-0.8): {total_partial} ({pct(total_partial, total_processed)}%)")
        print(f"Total Non-knowledge (score <0.3): {total_non_knowledge} ({pct(total_non_knowledge, total_processed)}%)")
        print(f"Total examples processed: {total_processed}")
        print("\n--- Match Type Distribution ---")
        for match_type, count in match_type_stats.most_common():
            print(f"  {match_type}: {count}")
        print(f"{'='*60}\n")

        # Ask if user wants to consolidate batches
        if IN_COLAB:
            consolidate = input("\n📂 Do you want to consolidate all batches into single files? (y/n): ")
        else:
            # Auto-consolidate in non-interactive environments
            consolidate = 'y'
            print("\n📂 Auto-consolidating batches...")

        if consolidate.lower() == 'y':
            self._consolidate_batches(batch_num, self.batch_dir, path_to_save)

    def _save_batch(self, knowledge_dataset, partial_dataset, non_knowledge_dataset, batch_num, batch_dir):
        """Save a single batch of datasets in simple format"""
        model_name_safe = self.model_name.replace("/", "_")
        dataset_name_safe = self.dataset_name.replace(" ", "_")

        batch_info = {}

        for name, data in {
            "knowledge": knowledge_dataset,
            "partial_knowledge": partial_dataset,
            "non_knowledge": non_knowledge_dataset
        }.items():
            filename = f"{model_name_safe}_{dataset_name_safe}_{name}_batch_{batch_num:04d}.json"
            path = os.path.join(batch_dir, filename)

            # Convert to simple format: [prompt, target, tokens, match_count]
            simple_data = []
            for item in data:
                # Create simple format matching original structure
                simple_item = [
                    item["prompt"],
                    item["target"],
                    item["tokens"],
                    item["num_matches"]  # Number of generations that matched (0-6)
                ]
                simple_data.append(simple_item)

            # Save with metadata header for better tracking
            batch_data = {
                "batch_number": batch_num,
                "category": name,
                "count": len(simple_data),
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "tokenizer": "DeepSeek-V3",
                "data": simple_data
            }

            with open(path, "w", encoding='utf-8') as f:
                json.dump(batch_data, f, indent=2, ensure_ascii=False)

            batch_info[name] = {
                "filename": filename,
                "count": len(data),
                "path": path
            }

            print(f"  ‚úì Saved {name} batch {batch_num} to {path} ({len(data)} examples)")

        return batch_info

    def _consolidate_batches(self, total_batches: int, batch_dir: str, final_dir: str):
        """Merge the per-batch files of each category into one consolidated file.

        For every category ("knowledge", "partial_knowledge", "non_knowledge")
        the batch files numbered 1..total_batches are read from ``batch_dir``
        (missing files are skipped), their items are concatenated, match-count
        and token-count statistics are computed, and a consolidated JSON file
        with a metadata header is written to ``final_dir``. A separate JSON
        file holding the statistics of all categories is written as well.
        Finally the user is asked whether the batch files should be deleted.

        Args:
            total_batches: Highest batch number to look for (inclusive).
            batch_dir: Directory containing the per-batch JSON files.
            final_dir: Directory receiving the consolidated files; it is
                created if it does not exist yet.

        Returns:
            None. Results are written to disk and progress is printed.
        """
        print("\nüìä Consolidating batches...")
        print(f"{'‚îÄ'*50}")

        model_name_safe = self.model_name.replace("/", "_")
        dataset_name_safe = self.dataset_name.replace(" ", "_")

        # Make sure the output directory exists so the writes below cannot
        # fail with FileNotFoundError. An empty path means "current directory".
        if final_dir:
            os.makedirs(final_dir, exist_ok=True)

        consolidated_stats = {}

        for category in ["knowledge", "partial_knowledge", "non_knowledge"]:
            consolidated_data = []
            batch_count = 0
            token_counts = []

            # Read all batch files for this category
            for batch_num in range(1, total_batches + 1):
                batch_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_batch_{batch_num:04d}.json"
                batch_path = os.path.join(batch_dir, batch_filename)

                if not os.path.exists(batch_path):
                    continue

                with open(batch_path, "r", encoding='utf-8') as f:
                    batch_content = json.load(f)

                # New batch files wrap the items in a dict under "data";
                # anything else is treated as the old bare-list format.
                if isinstance(batch_content, dict) and "data" in batch_content:
                    batch_data = batch_content["data"]
                else:
                    batch_data = batch_content

                consolidated_data.extend(batch_data)
                batch_count += 1

                # Item layout is [prompt, target, tokens, match_count]; only
                # items with a non-empty tokens field contribute a token count.
                token_counts.extend(
                    len(item[2]) for item in batch_data if len(item) > 2 and item[2]
                )

                print(f"  ‚Ä¢ Loaded {len(batch_data)} examples from batch {batch_num} for {category}")

            # Calculate statistics for this category
            if consolidated_data:
                # 4th element is the match count
                match_counts = [item[3] for item in consolidated_data if len(item) > 3]

                consolidated_stats[category] = {
                    "count": len(consolidated_data),
                    "avg_matches": float(np.mean(match_counts)) if match_counts else 0,
                    "min_matches": int(np.min(match_counts)) if match_counts else 0,
                    "max_matches": int(np.max(match_counts)) if match_counts else 0,
                    "token_stats": {
                        "avg_tokens": float(np.mean(token_counts)) if token_counts else 0,
                        "min_tokens": int(np.min(token_counts)) if token_counts else 0,
                        "max_tokens": int(np.max(token_counts)) if token_counts else 0,
                        "median_tokens": float(np.median(token_counts)) if token_counts else 0
                    }
                }

            # Save consolidated file with metadata (written even when empty)
            final_filename = f"{model_name_safe}_{dataset_name_safe}_{category}_dataset_consolidated.json"
            final_path = os.path.join(final_dir, final_filename)

            consolidated_file = {
                "metadata": {
                    "category": category,
                    "total_examples": len(consolidated_data),
                    "total_batches": batch_count,
                    "tokenizer": "DeepSeek-V3",
                    "consolidation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                    "statistics": consolidated_stats.get(category, {})
                },
                "data": consolidated_data
            }

            with open(final_path, "w", encoding='utf-8') as f:
                json.dump(consolidated_file, f, indent=2, ensure_ascii=False)

            print(f"\n‚úì Saved consolidated {category} dataset:")
            print(f"  Path: {final_path}")
            print(f"  Examples: {len(consolidated_data)}")
            if category in consolidated_stats:
                stats = consolidated_stats[category]
                print(f"  Match stats - Avg: {stats['avg_matches']:.1f}/6, Min: {stats['min_matches']}, Max: {stats['max_matches']}")
                if stats['token_stats']['avg_tokens'] > 0:
                    print(f"  Token stats - Avg: {stats['token_stats']['avg_tokens']:.1f}, Median: {stats['token_stats']['median_tokens']:.1f}")

        # Save consolidated statistics separately
        stats_path = os.path.join(final_dir, f"{model_name_safe}_{dataset_name_safe}_consolidated_stats.json")
        with open(stats_path, "w", encoding='utf-8') as f:
            json.dump(consolidated_stats, f, indent=2, ensure_ascii=False)
        print(f"\n‚úì Saved consolidated statistics to {stats_path}")

        print(f"\n{'‚îÄ'*50}")
        print("‚úÖ Consolidation complete!")
        print(f"{'‚îÄ'*50}")
        for category, stats in consolidated_stats.items():
            print(f"  {category.replace('_', ' ').title()}: {stats['count']} examples")

        # Ask if user wants to delete batch files
        if IN_COLAB:
            prompt = "\nüóëÔ∏è  Do you want to delete the individual batch files to save space? (y/n): "
        else:
            prompt = "\nüóëÔ∏è  Delete individual batch files? (y/n): "

        try:
            delete_batches = input(prompt)
        except EOFError:
            # No interactive stdin (e.g. a scripted run): keep the batch files
            # instead of crashing after all consolidated files were written.
            delete_batches = "n"

        # Tolerate surrounding whitespace in the answer (" y", "y ").
        if delete_batches.strip().lower() == 'y':
            import shutil
            try:
                shutil.rmtree(batch_dir)
                os.makedirs(batch_dir, exist_ok=True)  # Recreate empty batch directory
                print("‚úì Batch files deleted successfully.")
            except OSError as e:
                print(f"‚ö†Ô∏è  Error deleting batch files: {e}")
        else:
            print("‚ÑπÔ∏è  Batch files retained in:", batch_dir)

    def analyze_results(self, consolidated_file: str):
        """Print a summary of a consolidated dataset file.

        Two layouts are accepted: a dict with "data" (and optionally
        "metadata") keys, or a bare list of items. Each item is expected to
        look like ``[prompt, target, tokens, match_count]``. Items that lack
        the match count or the tokens are still listed in the sample section;
        the corresponding lines are simply left out.

        Args:
            consolidated_file: Path of the JSON file to analyze.

        Returns:
            None. Everything is reported on stdout.
        """
        print(f"\nAnalyzing {consolidated_file}...")

        with open(consolidated_file, 'r', encoding='utf-8') as f:
            file_content = json.load(f)

        # Handle new consolidated format with metadata
        if isinstance(file_content, dict) and "data" in file_content:
            data = file_content["data"]
            metadata = file_content.get("metadata", {})

            print("\n--- File Metadata ---")
            print(f"Category: {metadata.get('category', 'Unknown')}")
            print(f"Total examples: {metadata.get('total_examples', len(data))}")
            print(f"Tokenizer: {metadata.get('tokenizer', 'Unknown')}")

            stats = metadata.get("statistics")
            if stats:
                print("\n--- Match Statistics ---")
                print(f"Average matches: {stats.get('avg_matches', 0):.1f}/6")
                print(f"Min matches: {stats.get('min_matches', 0)}")
                print(f"Max matches: {stats.get('max_matches', 0)}")

                if "token_stats" in stats:
                    token_stats = stats["token_stats"]
                    print("\n--- Token Statistics (DeepSeek V3) ---")
                    print(f"Average tokens: {token_stats.get('avg_tokens', 0):.1f}")
                    print(f"Median tokens: {token_stats.get('median_tokens', 0):.1f}")
                    print(f"Min tokens: {token_stats.get('min_tokens', 0)}")
                    print(f"Max tokens: {token_stats.get('max_tokens', 0)}")
        else:
            # Handle old format
            data = file_content

        if not data:
            print("No data found in file.")
            return

        total = len(data)  # non-zero from here on, so the division below is safe
        print(f"\nTotal examples in dataset: {total}")

        # Analyze match counts distribution; counts outside 0..6 are ignored
        match_distribution = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

        for item in data:
            # item format: [prompt, target, tokens, match_count]
            if len(item) > 3:
                match_count = item[3]
                if match_count in match_distribution:
                    match_distribution[match_count] += 1

        print("\n--- Match Count Distribution ---")
        for count, freq in sorted(match_distribution.items()):
            percentage = (freq / total) * 100
            bar = "‚ñà" * int(percentage / 2)  # one bar segment per 2 percent
            print(f"{count}/6 matches: {freq:5d} ({percentage:5.1f}%) {bar}")

        # Show some examples with Turkish text
        print("\n--- Sample Examples (Turkish Dataset) ---")
        for i, item in enumerate(data[:5]):
            print(f"\n√ñrnek {i+1}:")
            print(f"  Soru: {item[0][5:80]}...")  # Skip "soru:" prefix
            print(f"  Cevap: {item[1]}")
            # Guard the optional fields the same way the distribution loop
            # does, so items without a match count do not raise IndexError.
            if len(item) > 3:
                print(f"  E≈üle≈ümeler: {item[3]}/6")
            if len(item) > 2 and item[2]:
                print(f"  Token sayƒ±sƒ±: {len(item[2])}")

    def get_token_info(self, text: str) -> Dict:
        """
        Summarize how a piece of (Turkish) text is tokenized.

        Args:
            text: Text to run through ``self.tokenize``.

        Returns:
            Dictionary with a preview of the text (first 100 characters plus
            "..." when longer), the token count, a preview of the tokens
            (first 10 plus "..." when there are more), the average number of
            characters per token (0 when there are no tokens), and how many
            characters of the text belong to the Turkish character set below.
        """
        token_list = self.tokenize(text)
        token_total = len(token_list)

        # Count every character that is part of the Turkish-specific set
        turkish_set = 'ƒüƒûƒ±ƒ∞√∂√ñ≈ü≈û√º√ú√ß√á'
        turkish_total = 0
        for ch in text:
            if ch in turkish_set:
                turkish_total += 1

        # Shorten long text for display
        if len(text) > 100:
            preview = text[:100] + "..."
        else:
            preview = text

        # Shorten long token sequences for display
        if token_total > 10:
            shown_tokens = token_list[:10] + ["..."]
        else:
            shown_tokens = token_list

        # Avoid dividing by zero when nothing was tokenized
        if token_list:
            chars_per_token = len(text) / token_total
        else:
            chars_per_token = 0

        return {
            "text": preview,
            "num_tokens": token_total,
            "token_ids": shown_tokens,
            "avg_chars_per_token": chars_per_token,
            "contains_turkish_chars": turkish_total > 0,
            "turkish_char_count": turkish_total
        }


def main():
    """Print the run configuration and build the Turkish knowledge dataset."""
    banner = "=" * 60
    print(banner)
    print("Turkish Knowledge Dataset Creator with DeepSeek V3")
    print(banner)

    # Settings passed as keyword arguments to KnowledgeDatasetDeepSeek
    config = dict(
        path_to_knowledge_dataset="datasets/deepseek/turkish/",
        dataset_name="turkish",
        model_name="deepseek-chat",  # "deepseek-reasoner" selects reasoning mode
        batch_size=2000,  # results are saved every 2000 examples
        tokenizer_model="deepseek-ai/DeepSeek-V3",  # DeepSeek V3 tokenizer
    )

    print("\nConfiguration:")
    for setting in config:
        print(f"  {setting}: {config[setting]}")
    print()

    # The success message is printed right after construction, so building the
    # object appears to run the whole pipeline. Failures are reported and then
    # re-raised so the caller still sees the original traceback.
    try:
        dataset_creator = KnowledgeDatasetDeepSeek(**config)
        print("\n‚úÖ Turkish dataset creation completed successfully!")

        # Optional follow-up: inspect a consolidated file once processing is done.
        # dataset_creator.analyze_results("datasets/deepseek/turkish/deepseek-chat_turkish_knowledge_dataset_consolidated.json")

    except Exception as err:
        print(f"\n‚ùå Error during dataset creation: {err}")
        raise


# Start the pipeline only when this file is executed directly (not when it is
# imported as a module).
if __name__ == "__main__":
    main()

Turkish Knowledge Dataset Creator with DeepSeek V3

Configuration:
  path_to_knowledge_dataset: datasets/deepseek/turkish/
  dataset_name: turkish
  model_name: deepseek-chat
  batch_size: 2000
  tokenizer_model: deepseek-ai/DeepSeek-V3

Initializing Turkish Knowledge Dataset Creator with DeepSeek V3...
Model: deepseek-chat
Tokenizer: deepseek-ai/DeepSeek-V3
Batch size: 2000

Loading DeepSeek V3 tokenizer from deepseek-ai/DeepSeek-V3...
This may take a moment on first run as it downloads the tokenizer files...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

‚úì Tokenizer loaded successfully!
  Vocabulary size: 128000
  Model max length: 131072
  Turkish test: 'Merhaba d√ºnya! ƒ∞stanbul'dan selamlar.' -> 15 tokens

‚úì Turkish text matcher initialized with multi-tier matching strategy
Please upload your Turkish dataset file (JSON or CSV)...


Saving turkish_dataset.json to turkish_dataset.json
Uploaded file: turkish_dataset.json
Loaded 15000 records from turkish_dataset.json
Example 0: soru: Henry Ford, Magic Johnson ve Berry Gordy han... -> Michigan (2 tokens)
Example 1: soru: Ring of Fire' hangi okyanusta bulunuyor?
cev... -> Pasifik Okyanusu (6 tokens)
Example 2: soru: Doƒüal gazƒ±n ana bile≈üeni nedir?
cevap:... -> Metan (2 tokens)
Example 3: soru: "Zamanƒ±n M√ºziƒüine Dans" adlƒ± 12 ciltlik roma... -> Anthony Powell (2 tokens)
Example 4: soru: 1999 yƒ±lƒ±nda Avustralya‚Äônƒ±n Melbourne kentin... -> Bƒ±yƒ±klar (5 tokens)

Tokenization Statistics (DeepSeek V3):
  Average tokens per answer: 3.24
  Min tokens: 1
  Max tokens: 20
  Median tokens: 3.00

Processing 15000 examples in batches of 2000...
Using advanced Turkish-aware string matching...
Using DeepSeek model: deepseek-chat
Using tokenizer: DeepSeek V3 (vocab size: 128000)

‚ö†Ô∏è  Resuming from batch 8 (index 14000)


Processing examples:   0%|          | 1/1000 [00:09<2:34:32,  9.28s/it]


Example 0: soru: Sherlock Holmes hangi enstr√ºmanƒ± √ßalardƒ±?
cevap:...
Target answer: 'Keman' (2 tokens)
Greedy generation: 'Keman'
Match scores: ['1.00', '1.00', '1.00', '1.00', '1.00', '1.00']
Match types: ['exact', 'exact']...
Average score: 1.000
Category: KNOWLEDGE


Processing examples:   0%|          | 2/1000 [00:17<2:26:28,  8.81s/it]


Example 1: soru: Gulliver‚Äôin ilk adƒ± nedir?
cevap:...
Target answer: 'Lemuel' (2 tokens)
Greedy generation: 'Lemuel Gulliver'
Match scores: ['1.00', '0.95', '1.00', '0.95', '0.95', '0.95']
Match types: ['exact', 'exact']...
Average score: 0.967
Category: KNOWLEDGE


Processing examples:   0%|          | 3/1000 [00:29<2:51:53, 10.34s/it]


Example 2: soru: 1913-1938 yƒ±llarƒ± arasƒ±nda bir nikelin arka y√ºz√ºnde ha...
Target answer: 'Buffalo' (2 tokens)
Greedy generation: 'Bu sorunun cevabƒ± "buffalo" (bufalo) olmalƒ±'
Match scores: ['0.95', '0.00', '0.95', '0.95', '0.95', '0.95']
Match types: ['exact', 'no_match']...
Average score: 0.792
Category: PARTIAL


Processing examples:   0%|          | 4/1000 [00:41<3:00:44, 10.89s/it]


Example 3: soru: Albert Finney, 1983 yapƒ±mƒ± hangi filmde ‚ÄòSir‚Äô karakter...
Target answer: 'The Dresser' (3 tokens)
Greedy generation: 'Albert Finney, 1983 yapƒ±mƒ± "The Dresser" (T√ºrk'
Match scores: ['0.95', '0.95', '0.95', '0.95', '0.95', '0.95']
Match types: ['exact', 'exact']...
Average score: 0.950
Category: KNOWLEDGE


Processing examples:   0%|          | 5/1000 [00:48<2:38:56,  9.58s/it]


Example 4: soru: 1909 yƒ±lƒ±nda yayƒ±mlanan ve d√∂nemin √ßaƒüda≈ü siyasi sorun...
Target answer: 'H G Wells' (3 tokens)
Greedy generation: 'H'
Match scores: ['0.13', '0.13', '0.13', '0.13', '0.13', '0.13']
Match types: ['partial', 'partial']...
Average score: 0.133
Category: NON-KNOWLEDGE


Processing examples: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [2:50:45<00:00, 10.25s/it]


  ‚úì Saved knowledge batch 8 to datasets/deepseek/turkish/batches/deepseek-chat_turkish_knowledge_batch_0008.json (575 examples)
  ‚úì Saved partial_knowledge batch 8 to datasets/deepseek/turkish/batches/deepseek-chat_turkish_partial_knowledge_batch_0008.json (116 examples)
  ‚úì Saved non_knowledge batch 8 to datasets/deepseek/turkish/batches/deepseek-chat_turkish_non_knowledge_batch_0008.json (309 examples)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üì¶ Batch 8 Summary
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Examples in batch: 1000
Knowledge (‚â•0.8): 575 (57.5%)
Partial (0.3-0.8): 116 (11.6%)
Non-knowledge (<0.3): 309 (30.9%)
Total processed: 15000/15000
Running totals - K: 575, P: 116, NK: 309
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î