# Hybrid Model Code Clone Dataset Generator

## Problem Statement
The original script used a small model (`codegemma:2b`) for all clone types. While fast, this model lacks the capability to handle complex Type-3 and Type-4 semantic transformations, resulting in mislabeled clones.

## Solution: Hybrid Approach
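- **Easy Clones (Type-1, Type-2)**: Use `codegemma:2b` for fast, reliable generation (configured via `EASY_MODEL`)
- **Hard Clones (Type-3, Type-4)**: Use `deepseek-coder:6.7b` for accurate semantic transformations (configured via `HARD_MODEL`)

> **Note:** The configuration cell below currently assigns `deepseek-coder:6.7b` to both `EASY_MODEL` and `HARD_MODEL`. Set `EASY_MODEL = "codegemma:2b"` to run the hybrid setup described here.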

This notebook implements the complete hybrid workflow in a step-by-step manner.

## 1. Setup and Configuration
Import all required libraries and set up the configuration for the hybrid generation process.

In [None]:
"""
Import all required libraries
"""
import uuid
import time
import subprocess
import tempfile
import requests
from pathlib import Path
import jsonlines
import pandas as pd
from colorama import Fore, Style, init
from tqdm import tqdm
import re

# Initialize colorama for Windows compatibility
init(autoreset=True)

print(f"{Fore.GREEN}✓ All libraries imported successfully")

✓ All libraries imported successfully


In [None]:
"""
Configuration - Edit these paths and parameters
"""

# Path to CodeNet root directory (machine-specific; edit before running)
CODENET_ROOT = Path(r"D:/Projects/SLIIT/Research/Datasets/Project_CodeNet")

# CodeNet subdirectories: per-problem submissions and per-problem metadata CSVs
RAW_CODENET_DIR = CODENET_ROOT / "data"
METADATA_DIR = CODENET_ROOT / "metadata"

# Output paths (all dataset files live under "dataset/")
EASY_OUTPUT_PATH = Path("dataset/java_clones_easy_types.jsonl")
HARD_OUTPUT_PATH = Path("dataset/java_clones_hard_types.jsonl")
COMBINED_OUTPUT_PATH = Path("dataset/java_clones_10k.jsonl")
# Non-clone output paths
EASY_NONCLONES_OUTPUT_PATH = Path("dataset/java_nonclones_easy_types.jsonl")
HARD_NONCLONES_OUTPUT_PATH = Path("dataset/java_nonclones_hard_types.jsonl")
FINAL_DATASET_PATH = Path("dataset/java_complete_dataset.jsonl")
GENERATED_DIR = Path("generated")
SEEDS_DIR = Path("seeds")

# Execution settings
TIMEOUT_SECONDS = 30  # Per-testcase run timeout passed to run_java_with_input
MAX_PROBLEMS = None  # None for all problems, or int to limit

# Clone generation settings
TARGET_CLONES_PER_TYPE = 1  # Target per type
# Non-clone generation settings
TARGET_NONCLONES_EASY = 1 # Target easy non-clones (simple algorithmic differences)
TARGET_NONCLONES_HARD = 1  # Target hard non-clones (different problem domains)
MAX_CLONES_PER_PROBLEM = 1
MAX_CYCLES = 50  # Upper bound on passes over the problem list (guards against an endless loop)

# Model settings - HYBRID APPROACH
# NOTE: both entries currently name the same model, so the run is not actually
# hybrid; point EASY_MODEL at a smaller model (e.g. "codegemma:2b") to change that.
EASY_MODEL = "deepseek-coder:6.7b"       # Model for Type-1, Type-2 (currently identical to HARD_MODEL)
HARD_MODEL = "deepseek-coder:6.7b"  # Capable model for Type-3, Type-4

# Logging
VERBOSE = True

# Create directories
# Only EASY_OUTPUT_PATH.parent is listed because every output path above shares
# the same "dataset/" parent directory.
for directory in [GENERATED_DIR, SEEDS_DIR, EASY_OUTPUT_PATH.parent]:
    directory.mkdir(exist_ok=True, parents=True)

print(f"{Fore.GREEN}✓ Configuration loaded")
print(f"{Fore.CYAN}Easy Model (Type-1/2): {EASY_MODEL}")
print(f"{Fore.CYAN}Hard Model (Type-3/4): {HARD_MODEL}")
print(f"{Fore.CYAN}Target per type: {TARGET_CLONES_PER_TYPE}")
print(f"{Fore.CYAN}CodeNet Root: {CODENET_ROOT}")

✓ Configuration loaded
Easy Model (Type-1/2): deepseek-coder:6.7b


Hard Model (Type-3/4): deepseek-coder:6.7b
Target per type: 1
CodeNet Root: D:\Projects\SLIIT\Research\Datasets\Project_CodeNet


In [None]:
def normalize_unicode_to_ascii(text):
    """Return *text* reduced to pure ASCII.

    Typographic quotes, dashes, special spaces and the ellipsis character are
    mapped to their ASCII look-alikes; every other non-ASCII character is
    replaced by a single space so that the text length stays roughly stable.
    """
    ascii_lookalikes = str.maketrans({
        '\u201c': '"', '\u201d': '"', '\u2018': "'", '\u2019': "'",
        '\u201b': "'", '\u2013': '-', '\u2014': '-', '\u2015': '-',
        '\u00a0': ' ', '\u2009': ' ', '\u200a': ' ', '\u2026': '...',
        '\u00b4': "'", '\u02bb': "'", '\u02bc': "'",
    })

    # One pass for the known substitutions ...
    translated = text.translate(ascii_lookalikes)

    # ... then blank out whatever is still outside the ASCII range.
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in translated)

def sanitize_code_from_model(raw_text):
    """Extract compilable-looking Java source from raw model output.

    Steps, in order:
      1. Strip and normalize the text to ASCII.
      2. If Markdown fences are present, keep the contents of the first fence
         tagged ``java``; otherwise keep the largest fenced block (falling
         back to the largest chunk of any kind when no fenced block is big
         enough).
      3. Repair ``System.out<artifact>println`` leaks, then remove known LLM
         special tokens.
      4. Require ``class Main``; if it is missing but a ``main`` method is
         present, wrap the text in ``public class Main { ... }``.

    Args:
        raw_text: The model response, or None.

    Returns:
        The cleaned Java source as a string, or None when the input is None
        or nothing resembling a Java program with a main method remains.
    """
    if raw_text is None:
        return None

    text = normalize_unicode_to_ascii(raw_text.strip())

    # Handle fenced code blocks
    if "```" in text:
        parts = text.split("```")
        # After splitting on the fence marker, odd-indexed parts are the fence
        # contents and even-indexed parts are the surrounding prose. Looking
        # only inside fences avoids mistaking prose such as "Java code:" for
        # a language tag.
        fenced = parts[1::2]
        for part in fenced:
            # \b keeps tags like "javascript" from being treated as "java".
            if re.match(r'java\b', part, flags=re.IGNORECASE):
                text = part[4:].lstrip()
                break
        else:
            # No java-tagged fence: prefer the largest fenced block, and only
            # fall back to arbitrary chunks (e.g. a stray closing fence after
            # bare code) when no fenced block is substantial.
            candidates = [p for p in fenced if len(p.strip()) > 20]
            if not candidates:
                candidates = [p for p in parts if len(p.strip()) > 20]
            if candidates:
                text = max(candidates, key=len)

    # Repair special tokens that leaked into a System.out call, e.g.
    # "System.out< begin of sentence >println". This must happen BEFORE the
    # generic artifact removal below; otherwise the token is already gone and
    # the call is left as the uncompilable "System.outprintln".
    text = re.sub(r'System\.out\s*<[^>]+>\s*\.?', 'System.out.', text)

    # Remove common LLM artifacts (matched case-insensitively)
    llm_artifacts = [
        r'< begin of sentence >', r'<begin of sentence>', r'< end of sentence >',
        r'<end of sentence>', r'<\|begin_of_text\|>', r'<\|end_of_text\|>',
        r'<\|startoftext\|>', r'<\|endoftext\|>',
        r'<\|file_separator\|>', r'<\|code_start\|>', r'<\|code_end\|>'
    ]
    for artifact in llm_artifacts:
        text = re.sub(artifact, '', text, flags=re.IGNORECASE)

    # The sentence markers <s> / </s> are matched case-sensitively so that a
    # Java generic type parameter such as <S> is not deleted from the code.
    for artifact in (r'<s>', r'</s>'):
        text = re.sub(artifact, '', text)

    # Basic validation
    if "class Main" not in text:
        # Try to wrap it if it looks like code but missing class
        if "public static void main" in text:
            text = "public class Main {\n" + text + "\n}"
        else:
            return None

    return text.strip()

print(f"{Fore.GREEN}✓ Helper functions defined (Updated with improved sanitization)")

✓ Helper functions defined (Updated with improved sanitization)


In [None]:
"""
Prompt templates for clone generation
"""
# Type-1 clone prompt: formatting-only changes (whitespace, layout, comments).
# <<<CODE_PLACEHOLDER>>> is substituted with the seed program via str.replace.
TYPE1_PROMPT_TEMPLATE = """You are a Java code formatter. Transform this Java code by ONLY changing formatting while preserving all semantics.

**CRITICAL:** Your output must be raw Java code ONLY. Do not include any markdown, explanations, or special tokens.

RULES:
1. MUST have class name as "Main" (CRITICAL for compilation)
2. MUST have public static void main(String[] args) method
3. ONLY change formatting: whitespace, indentation, line breaks, comments
4. MUST preserve all identifiers, literals, and code structure
5. DO NOT rename variables, methods, or classes
6. DO NOT change any literals or expressions
7. DO NOT add, remove, or modify any statements
8. DO NOT change control flow structure
9. Output raw Java code ONLY (no markdown, no explanation)

Original Code:
<<<CODE_PLACEHOLDER>>>

Formatted Code:"""

# Type-2 clone prompt: identifier renaming and literal rewriting with the
# statement structure left untouched.
# <<<CODE_PLACEHOLDER>>> is substituted with the seed program via str.replace.
TYPE2_PROMPT_TEMPLATE = """You are a Java refactoring assistant. Transform this code by renaming identifiers and changing literals while preserving exact behavior.

**CRITICAL:** Your output must be raw Java code ONLY. Do not include any markdown, explanations, or special tokens.

RULES:
1. MUST have class name as "Main" (CRITICAL for compilation)
2. MUST have public static void main(String[] args) method
3. CAN rename variables, parameters, and method names (EXCEPT main method)
4. CAN change literals (e.g., 10→0xA, true→(1==1), "test"→"TEST".toLowerCase())
5. MUST preserve exact control flow and structure
6. DO NOT add, remove, or reorder any statements
7. DO NOT change the algorithmic logic or approach
8. DO NOT modify control flow patterns (if/else, loops, etc.)
9. Structure and statement order MUST remain identical
10. Output raw Java code ONLY (no markdown, no explanation)

Original Code:
<<<CODE_PLACEHOLDER>>>

Refactored Code:"""

# Type-3 clone prompt: statement-level edits (equivalent statements, temporary
# variables, dead code, reordering) that keep the same algorithm.
# <<<CODE_PLACEHOLDER>>> is substituted with the seed program via str.replace.
TYPE3_PROMPT_TEMPLATE = """You are a Java code mutator. Transform this code with statement-level modifications while preserving exact program behavior.

**CRITICAL:** Your output must be raw Java code ONLY. Do not include any markdown, explanations, or special tokens.

RULES:
1. MUST have class name as "Main" (CRITICAL for compilation)
2. MUST have public static void main(String[] args) method
3. CAN replace statements with equivalent ones (e.g., for↔while loops)
4. CAN introduce or remove temporary variables
5. CAN refactor expressions (e.g., a+b+c → temp=a+b; result=temp+c)
6. CAN add minimal dead code (unused variables, unreachable code after return)
7. CAN reorder independent statements
8. DO NOT change the underlying algorithm or approach
9. DO NOT switch to library-based solutions (streams, collections APIs)
10. MUST preserve exact input/output behavior
11. Observable behavior MUST be identical
12. Output raw Java code ONLY (no markdown, no explanation)

Original Code:
<<<CODE_PLACEHOLDER>>>

Modified Code:"""

# Type-4 clone prompt: a semantically equivalent rewrite that may use a
# completely different algorithm but must keep identical input/output behavior.
# <<<CODE_PLACEHOLDER>>> is substituted with the seed program via str.replace.
TYPE4_PROMPT_TEMPLATE = """You are an expert Java programmer. Rewrite this code using a completely different algorithm while maintaining identical observable behavior.

**CRITICAL:** Your output must be raw Java code ONLY. Do not include any markdown, explanations, or special tokens.

RULES:
1. MUST have class name as "Main" (CRITICAL for compilation)
2. MUST have public static void main(String[] args) method
3. MUST preserve exact input format and parsing
4. MUST preserve exact output format and content
5. MUST have identical behavior for ALL possible inputs
6. CAN use completely different algorithms, data structures, approaches
7. CAN restructure the entire program logic
8. CAN use different computational strategies
9. Structure and implementation MAY be completely different
10. Observable input/output behavior MUST be identical
11. Output raw Java code ONLY (no markdown, no explanation)

Original Code:
<<<CODE_PLACEHOLDER>>>

Rewritten Code:"""

# Easy non-clone prompt: asks for an unrelated, simple program that shares
# neither names, control flow nor data structures with the reference code.
# <<<CODE_PLACEHOLDER>>> is substituted with the reference program via str.replace.
EASY_NONCLONE_PROMPT_TEMPLATE = """You are a Java programmer. Create a simple, different Java program that solves a basic algorithmic problem.

**CRITICAL:** Your output must be raw Java code ONLY. Do not include any markdown, explanations, or special tokens.

RULES:
1. MUST have class name as "Main" (CRITICAL for compilation)
2. MUST have public static void main(String[] args) method
3. Create a program for a COMPLETELY DIFFERENT problem domain
4. DO NOT reuse any variable names from the reference code
5. DO NOT use similar control-flow patterns
6. DO NOT use similar data structures
7. Must solve a clearly different algorithmic problem
8. Use basic concepts: simple loops, arrays, basic arithmetic
9. Must be functionally complete and compilable
10. Different problem goal and output meaning required
11. Output raw Java code ONLY (no markdown, no explanation)

Reference Code (CREATE SOMETHING COMPLETELY DIFFERENT):
<<<CODE_PLACEHOLDER>>>

New Different Program:"""

# Hard non-clone prompt: asks for a program that looks structurally similar to
# the reference code but solves a different problem (not behaviorally equivalent).
# <<<CODE_PLACEHOLDER>>> is substituted with the reference program via str.replace.
HARD_NONCLONE_PROMPT_TEMPLATE = """You are an expert Java programmer. Create a sophisticated Java program that has similar structure but different semantics from the reference code.

**CRITICAL:** Your output must be raw Java code ONLY. Do not include any markdown, explanations, or special tokens.

RULES:
1. MUST have class name as "Main" (CRITICAL for compilation)
2. MUST have public static void main(String[] args) method
3. MUST have similar control flow patterns (similar if/else, loop structures)
4. MUST have similar program skeleton and structure
5. MUST solve a DIFFERENT semantic problem with DIFFERENT output meaning
6. MUST NOT have behavioral equivalence with the reference code
7. Use advanced concepts: collections, recursion, object-oriented design
8. High structural similarity but different algorithmic goal required
9. Must be functionally complete and compilable
10. Different problem domain but similar complexity
11. Output raw Java code ONLY (no markdown, no explanation)

Reference Code (CREATE SIMILAR STRUCTURE, DIFFERENT SEMANTICS):
<<<CODE_PLACEHOLDER>>>

New Structurally Similar Program:"""


print(f"{Fore.GREEN}✓ Prompt templates defined (including non-clone templates)")
print(f"{Fore.CYAN}  Clone templates: Type-1, Type-2, Type-3, Type-4")
print(f"{Fore.CYAN}  Non-clone templates: Easy, Hard")

✓ Prompt templates defined (including non-clone templates)
  Clone templates: Type-1, Type-2, Type-3, Type-4
  Non-clone templates: Easy, Hard


In [None]:
"""
CodeNet data loading functions
"""

def list_problems():
    """Return the sorted names of all problem directories under RAW_CODENET_DIR.

    A problem directory is a directory whose name is the letter ``p``
    followed by five digits (six characters in total). An empty list is
    returned, and an error is logged, when RAW_CODENET_DIR does not exist.
    """
    if not RAW_CODENET_DIR.exists():
        log(f"CodeNet directory not found: {RAW_CODENET_DIR}", Fore.RED)
        return []

    def _is_problem_dir(entry):
        # Same checks, in the same short-circuit order, as a plain and-chain.
        name = entry.name
        return (
            entry.is_dir()
            and name.startswith('p')
            and len(name) == 6
            and name[1:].isdigit()
        )

    return sorted(
        entry.name for entry in RAW_CODENET_DIR.iterdir() if _is_problem_dir(entry)
    )

def load_submissions_csv(problem_id):
    """Read ``<METADATA_DIR>/<problem_id>.csv`` into a DataFrame.

    Returns None when the file does not exist or cannot be parsed; a parse
    failure is additionally logged in red.
    """
    csv_path = METADATA_DIR / f"{problem_id}.csv"

    if csv_path.exists():
        try:
            return pd.read_csv(csv_path)
        except Exception as e:
            log(f"Error reading CSV {csv_path}: {e}", Fore.RED)

    return None

def choose_seed(problem_id):
    """Pick the first usable accepted Java submission for *problem_id*.

    The problem's metadata CSV is filtered to rows whose ``language`` is
    ``Java`` and whose ``status`` is ``Accepted``. Rows are tried in order;
    a submission is skipped when its source file is missing, larger than
    10 KB, or unreadable as UTF-8.

    Returns:
        ``(code, submission_id)`` for the first usable submission, or
        ``(None, None)`` when there is none.
    """
    submissions = load_submissions_csv(problem_id)
    if submissions is None or submissions.empty:
        return None, None

    is_java = submissions['language'] == 'Java'
    is_accepted = submissions['status'] == 'Accepted'
    candidates = submissions[is_java & is_accepted]
    if candidates.empty:
        return None, None

    java_dir = RAW_CODENET_DIR / problem_id / "Java"
    for _, record in candidates.iterrows():
        submission_id = record['submission_id']
        source_file = java_dir / f"{submission_id}.java"

        # Skip submissions without a source file or above the 10KB limit.
        if not source_file.exists() or source_file.stat().st_size > 10240:
            continue

        try:
            return source_file.read_text(encoding='utf-8'), submission_id
        except Exception:
            # Unreadable file: move on to the next candidate.
            continue

    return None, None

def load_testcases(problem_id):
    """Return the test cases of *problem_id* as a list of (input, expected_output).

    The pair is read from ``input.txt`` / ``output.txt`` in
    ``<CODENET_ROOT>/derived/input_output/data/<problem_id>``, so the list
    holds at most one entry. An empty list is returned when either file is
    missing or cannot be read.
    """
    case_dir = CODENET_ROOT / "derived" / "input_output" / "data" / problem_id
    input_file = case_dir / "input.txt"
    output_file = case_dir / "output.txt"

    if not (input_file.exists() and output_file.exists()):
        return []

    try:
        stdin_text = input_file.read_text(encoding='utf-8')
        expected_text = output_file.read_text(encoding='utf-8')
    except Exception:
        return []

    return [(stdin_text, expected_text)]

print(f"{Fore.GREEN}✓ CodeNet loading functions defined")

✓ CodeNet loading functions defined


In [None]:
def log(message, color=Fore.WHITE):
    """Print *message* in *color*, falling back to a plain print on any error."""
    try:
        colored_line = f"{color}{message}{Style.RESET_ALL}"
        print(colored_line)
    except Exception:
        # Colored output failed for some reason; emit the bare message instead.
        print(message)

def ask_model_ollama(prompt, model_name, max_tokens=1500, temperature=0.1):
    """Request a non-streaming completion from the local Ollama server.

    Args:
        prompt: Full prompt text.
        model_name: Name of the Ollama model to use.
        max_tokens: Sent as the ``num_predict`` option.
        temperature: Sent as the ``temperature`` option.

    Returns:
        ``(generated_text, None)`` on HTTP 200, otherwise ``(None, message)``
        where *message* describes the failure. No exception is propagated.
    """
    request_body = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens
        }
    }

    try:
        response = requests.post(
            "http://localhost:11434/api/generate", json=request_body, timeout=120
        )

        if response.status_code != 200:
            return None, f"Ollama API error: HTTP {response.status_code}"

        return response.json().get("response", ""), None

    # ConnectionError is checked before Timeout on purpose: the order decides
    # which message an exception matching both handlers receives.
    except requests.exceptions.ConnectionError:
        return None, "Cannot connect to Ollama. Is it running? (ollama serve)"
    except requests.exceptions.Timeout:
        return None, "Ollama API timeout"
    except Exception as e:
        return None, f"Ollama API error: {str(e)}"

def quick_check_code_quality(code_str):
    """Run cheap, compiler-free sanity checks on generated Java source.

    Checks, in order: minimum length (50 characters), presence of
    ``class Main`` and ``main(``, absence of tell-tale LLM chatter or
    placeholder text, balanced braces and parentheses, and a plausible final
    character.

    The balance checks ignore comments and string/character literals, so a
    valid program such as ``System.out.println("(");`` is not rejected just
    because a delimiter appears inside a literal.

    Args:
        code_str: Candidate Java source (may be None or empty).

    Returns:
        ``(True, "OK")`` when every check passes, otherwise
        ``(False, reason)`` for the first failing check.
    """
    if not code_str or len(code_str) < 50:
        return False, "Code too short"

    if "class Main" not in code_str:
        return False, "Missing 'class Main'"

    if "main(" not in code_str:
        return False, "Missing main method"

    # Check for suspicious patterns (searched in the raw text on purpose:
    # several of them are comments the model should not have produced).
    suspicious = [
        "TODO:", "FIXME:", "[Your code here]", "// ... rest of",
        "// Original code", "// Explanation:", "Note that",
        "< begin of sentence >", "<begin of sentence>",
        "< end of sentence >", "<end of sentence>"
    ]

    for pattern in suspicious:
        if pattern in code_str:
            return False, f"Contains suspicious pattern: {pattern}"

    # Drop line comments, block comments, string literals and char literals in
    # a single left-to-right pass so that delimiters inside them are not
    # counted. An unterminated literal or comment simply is not matched and
    # its characters are counted as before.
    structural = re.sub(
        r'//[^\n]*'
        r'|/\*.*?\*/'
        r'|"(?:\\.|[^"\\\n])*"'
        r"|'(?:\\.|[^'\\\n])*'",
        '',
        code_str,
        flags=re.DOTALL,
    )

    # Check basic syntax balance (angle brackets are deliberately not checked
    # to avoid false positives on generics or comparison/shift operators).
    if structural.count('{') != structural.count('}'):
        return False, "Unbalanced braces"

    if structural.count('(') != structural.count(')'):
        return False, "Unbalanced parentheses"

    # Check for incomplete statements: complete code ends with a closing
    # brace, a semicolon, or the tail of a comment.
    trimmed = code_str.strip()
    if trimmed and trimmed[-1] not in ['}', ';', '*', '/']:
        return False, "Code appears incomplete"

    return True, "OK"

# [REMAINING VALIDATION FUNCTIONS KEPT AS IS]
def compile_java(temp_dir):
    """Compile ``Main.java`` located in *temp_dir* with ``javac``.

    Returns:
        ``(True, None)`` when javac exits with status 0, otherwise
        ``(False, message)``. The message covers a missing source file,
        compiler errors (first 500 characters of stderr), a 30 second
        timeout, a missing ``javac`` binary, and any other exception.
    """
    source_path = Path(temp_dir) / "Main.java"
    if not source_path.exists():
        return False, "Main.java not found"

    try:
        proc = subprocess.run(
            ["javac", str(source_path)],
            cwd=temp_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=30,
            check=False
        )
    except subprocess.TimeoutExpired:
        return False, "Compilation timeout"
    except FileNotFoundError:
        return False, "javac not found. Please install JDK."
    except Exception as e:
        return False, f"Compilation exception: {str(e)}"

    if proc.returncode == 0:
        return True, None

    diagnostics = proc.stderr.decode('utf-8', errors='ignore')
    return False, f"Compilation error: {diagnostics[:500]}"

def run_java_with_input(temp_dir, input_str, timeout=3):
    """Execute ``java Main`` in *temp_dir*, feeding *input_str* on stdin.

    Args:
        temp_dir: Directory containing the compiled ``Main`` class.
        input_str: Text written to the program's standard input (UTF-8).
        timeout: Maximum run time in seconds.

    Returns:
        ``(stdout_text, None)`` when the program exits with status 0,
        otherwise ``(None, message)`` describing a non-zero exit (first 500
        characters of stderr), a timeout, or any other exception.
    """
    try:
        proc = subprocess.run(
            ["java", "Main"],
            cwd=temp_dir,
            input=input_str.encode('utf-8'),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
            check=False
        )
    except subprocess.TimeoutExpired:
        return None, "Execution timeout"
    except Exception as e:
        return None, f"Execution exception: {str(e)}"

    if proc.returncode == 0:
        return proc.stdout.decode('utf-8', errors='ignore'), None

    stderr_text = proc.stderr.decode('utf-8', errors='ignore')
    return None, f"Runtime error: {stderr_text[:500]}"

def normalize_output(text):
    """Strip the text as a whole, then drop trailing whitespace from every line."""
    stripped_lines = map(str.rstrip, text.strip().split('\n'))
    return '\n'.join(stripped_lines)

def validate_java(code_str, problem_id):
    """Compile *code_str* and run it against the test cases of *problem_id*.

    The source is written to ``Main.java`` in a temporary directory that is
    removed when the function returns.

    Args:
        code_str: Java source containing a ``Main`` class.
        problem_id: Problem whose test cases are loaded via load_testcases.

    Returns:
        One of the status strings:
          * ``"no_tests"`` - the problem has no test cases;
          * ``"compile_error"`` - the source could not be written to disk;
          * ``"compile_error: <details>"`` - javac failed;
          * ``"timeout"`` - a run exceeded TIMEOUT_SECONDS;
          * ``"runtime_error: <details>"`` - a run failed for another reason;
          * ``"wrong_answer"`` - normalized output differs from the expected;
          * ``"passed"`` - every test case matched.
    """
    testcases = load_testcases(problem_id)

    if not testcases:
        return "no_tests"

    with tempfile.TemporaryDirectory() as temp_dir:
        java_file = Path(temp_dir) / "Main.java"

        try:
            java_file.write_text(code_str, encoding='utf-8')
        except Exception:
            return "compile_error"

        compile_success, compile_error = compile_java(temp_dir)

        if not compile_success:
            return f"compile_error: {compile_error}"

        for input_text, expected_output in testcases:
            output, error = run_java_with_input(temp_dir, input_text, timeout=TIMEOUT_SECONDS)

            if error:
                # Only the dedicated timeout message counts as a timeout. A
                # substring search for "timeout" would also catch runtime
                # errors whose stderr merely mentions the word (for example a
                # SocketTimeoutException stack trace) and hide the real error.
                if error.startswith("Execution timeout"):
                    return "timeout"
                return f"runtime_error: {error}"

            if normalize_output(output) != normalize_output(expected_output):
                return "wrong_answer"

        return "passed"

print(f"{Fore.GREEN}✓ Java validation functions defined (Updated checks and error reporting)")


✓ Java validation functions defined (Updated checks and error reporting)


## 2. Model Selection Logic
Define the hybrid model selection strategy based on clone type complexity.

In [None]:
"""
Check Ollama connectivity before starting generation
"""

def check_ollama_connection():
    """Probe the local Ollama server and list its installed models.

    Returns:
        ``(True, model_names)`` when ``/api/tags`` answers with HTTP 200,
        otherwise ``(False, [])`` - including when the request raises.
    """
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            return False, []

        installed = response.json().get('models', [])
        return True, [entry.get('name', '') for entry in installed]
    except Exception:
        # Connection failures and every other error are reported the same way.
        return False, []

# Check Ollama connection
# Pre-flight step executed when the cell runs: report the installed models,
# warn about configured models that are missing, and abort the notebook with a
# RuntimeError when the Ollama server cannot be reached.
print(f"{Fore.CYAN}{'='*60}")
print(f"{Fore.CYAN}Checking Ollama Connection...")
print(f"{Fore.CYAN}{'='*60}\n")

is_connected, available_models = check_ollama_connection()

if is_connected:
    print(f"{Fore.GREEN}✓ Ollama is running and accessible")
    print(f"\n{Fore.CYAN}Available models:")
    for model in available_models:
        print(f"  • {model}")
    
    # Check if required models are available
    required_models = [EASY_MODEL, HARD_MODEL]
    missing_models = []
    
    # A required model counts as available when its name is a substring of any
    # installed model name, so a configured name can match a longer installed one.
    for req_model in set(required_models):  # Use set to avoid duplicates
        if not any(req_model in available for available in available_models):
            missing_models.append(req_model)
    
    # Missing models only produce a warning; execution continues.
    if missing_models:
        print(f"\n{Fore.YELLOW}⚠ Warning: Required models not found:")
        for model in missing_models:
            print(f"  • {model}")
        print(f"\n{Fore.CYAN}To download missing models, run:")
        for model in missing_models:
            print(f"  ollama pull {model}")
    else:
        print(f"\n{Fore.GREEN}✓ All required models are available")
else:
    print(f"{Fore.RED}✗ Cannot connect to Ollama")
    print(f"\n{Fore.YELLOW}Please ensure Ollama is running:")
    print(f"  1. Open a new terminal")
    print(f"  2. Run: ollama serve")
    print(f"  3. Then re-run this notebook")
    print(f"\n{Fore.CYAN}If you don't have Ollama installed:")
    print(f"  Visit: https://ollama.ai/download")
    
    # Stop the cell here: nothing below can work without the server.
    raise RuntimeError("Ollama is not running. Please start Ollama and try again.")

print(f"\n{Fore.GREEN}✓ Pre-flight checks complete!")


Checking Ollama Connection...

✓ Ollama is running and accessible

Available models:
  • deepseek-coder:6.7b
  • qwen2.5-coder:7b
  • qwen2.5-coder:0.5b
  • starcoder:1b
  • codegemma:2b

✓ All required models are available

✓ Pre-flight checks complete!


In [None]:
# Repair prompt used by generate_with_repair: <<<CODE_PLACEHOLDER>>> receives
# the failing program and <<<ERROR_PLACEHOLDER>>> the validation error text,
# both substituted via str.replace.
REPAIR_PROMPT_TEMPLATE = """You are a Java code repair assistant. The following Java code has errors. Fix the validation errors and output the corrected code.

**CRITICAL:** Your output must be raw Java code ONLY. Do not include any markdown, explanations, or special tokens.

Rules:
1. MUST have class name as "Main"
2. Fix SPECIFICALLY the error reported below
3. Preserve the original logic as much as possible
4. Output raw Java code ONLY

Original Code:
<<<CODE_PLACEHOLDER>>>

Validation Error:
<<<ERROR_PLACEHOLDER>>>

Fixed Code:"""

def get_model_for_clone_type(clone_type):
    """
    Return the model configured for the given clone type.

    Easy types ('type1', 'type2') use EASY_MODEL. Hard types ('type3',
    'type4') and any unrecognized value use HARD_MODEL.
    """
    return EASY_MODEL if clone_type in ('type1', 'type2') else HARD_MODEL

def generate_with_repair(prompt, model_name, problem_id=None, max_retries=3):
    """
    Generate Java code from *prompt* and try to repair it when it fails checks.

    Up to two generations are made (temperatures 0.1, then 0.3). Each
    generated program is sanitized and syntax-checked; when *problem_id* is
    given it is also compiled and run against that problem's test cases.
    A program that fails is sent back to HARD_MODEL together with the error
    message, for at most two repair rounds, before the next generation is
    attempted.

    Args:
        prompt: Fully substituted generation prompt.
        model_name: Ollama model used for the initial generation.
        problem_id: Problem used for test-case validation; when falsy only
            the quick syntax check is applied.
        max_retries: Accepted for interface compatibility; the attempt counts
            are fixed inside the function and this value is not consulted.

    Returns:
        The first program that passes the applicable checks, or None.
    """
    for temperature in (0.1, 0.3):
        raw, error = ask_model_ollama(prompt, model_name, max_tokens=1500, temperature=temperature)
        if error:
            log(f"[MODEL ERROR] {error}", Fore.RED)
            continue

        code = sanitize_code_from_model(raw)
        if not code:
            continue

        # Decide whether the fresh candidate is already good enough.
        syntax_ok, reason = quick_check_code_quality(code)
        if syntax_ok:
            if not problem_id:
                # Nothing to validate against: syntax-clean code is accepted.
                return code
            outcome = validate_java(code, problem_id)
            if outcome == "passed":
                return code
            error_msg = outcome
        else:
            print(f"  {Fore.YELLOW}Syntax check failed: {reason}")
            error_msg = f"Syntax error: {reason}"

        # The candidate failed; every failing candidate gets its own repair rounds.
        print(f"  {Fore.YELLOW}Attempting repair for error: {error_msg}")

        candidate = code
        for _ in range(2):
            repair_prompt = (
                REPAIR_PROMPT_TEMPLATE
                .replace("<<<CODE_PLACEHOLDER>>>", candidate)
                .replace("<<<ERROR_PLACEHOLDER>>>", str(error_msg))
            )

            # Repairs always go to the hard model, whichever model generated the code.
            raw_fix, fix_error = ask_model_ollama(repair_prompt, HARD_MODEL, max_tokens=1500, temperature=0.1)
            if fix_error:
                break

            fixed = sanitize_code_from_model(raw_fix)
            if not fixed:
                continue

            syntax_ok, reason = quick_check_code_quality(fixed)
            if not syntax_ok:
                # Feed the still-broken repair into the next round.
                error_msg = f"Syntax error after repair: {reason}"
                candidate = fixed
                continue

            if not problem_id:
                return fixed

            outcome = validate_java(fixed, problem_id)
            if outcome == "passed":
                print(f"  {Fore.GREEN}Repair successful!")
                return fixed

            error_msg = outcome
            candidate = fixed

        # Repair did not succeed: fall through to the next generation attempt.

    return None

def generate_clone(code, clone_type, problem_id=None):
    """Generate a clone of *code* of the requested type.

    Thin compatibility wrapper around generate_clone_v2. The previous body
    called ``generate_with_retry``, which is not defined alongside this
    function, and ignored the model it had just selected; delegating keeps a
    single implementation of the prompt and model selection.

    Args:
        code: Seed Java program.
        clone_type: One of 'type1', 'type2', 'type3', 'type4'.
        problem_id: Optional problem identifier. When given, the generated
            code is validated against that problem's test cases; when omitted
            only the quick syntax check is applied.

    Returns:
        The generated Java source, or None when *clone_type* is unknown or no
        acceptable code could be produced.
    """
    return generate_clone_v2(code, clone_type, problem_id)

# generate_clone_v2 is the problem-aware variant of generate_clone: it takes the
# problem_id so that generated code can be validated against that problem's
# test cases. Callers must pass problem_id explicitly.

def generate_clone_v2(code, clone_type, problem_id):
    """Generate a clone of *code* and validate it against *problem_id*.

    Returns the generated Java source, or None when *clone_type* is not one
    of 'type1'..'type4' or generation/repair did not yield acceptable code.
    """
    if clone_type == 'type1':
        template = TYPE1_PROMPT_TEMPLATE
    elif clone_type == 'type2':
        template = TYPE2_PROMPT_TEMPLATE
    elif clone_type == 'type3':
        template = TYPE3_PROMPT_TEMPLATE
    elif clone_type == 'type4':
        template = TYPE4_PROMPT_TEMPLATE
    else:
        return None

    prompt = template.replace("<<<CODE_PLACEHOLDER>>>", code)
    return generate_with_repair(prompt, get_model_for_clone_type(clone_type), problem_id)

def get_model_for_nonclone_type(nonclone_type):
    """Return EASY_MODEL for 'easy' non-clones and HARD_MODEL for anything else."""
    return EASY_MODEL if nonclone_type == 'easy' else HARD_MODEL

def generate_nonclone_v2(code, nonclone_type, problem_id=None):
    """Generate a non-clone program using *code* as the reference.

    *problem_id* is accepted but deliberately not forwarded: a non-clone
    solves a different problem, so running it against the reference problem's
    test cases would be meaningless. Only the quick syntax check inside
    generate_with_repair is applied.

    Returns the generated Java source, or None when *nonclone_type* is
    neither 'easy' nor 'hard' or no acceptable code was produced.
    """
    if nonclone_type == 'easy':
        template = EASY_NONCLONE_PROMPT_TEMPLATE
    elif nonclone_type == 'hard':
        template = HARD_NONCLONE_PROMPT_TEMPLATE
    else:
        return None

    prompt = template.replace("<<<CODE_PLACEHOLDER>>>", code)
    chosen_model = get_model_for_nonclone_type(nonclone_type)

    # problem_id=None skips test-case validation in the repair loop.
    return generate_with_repair(prompt, chosen_model, problem_id=None)

print(f"{Fore.GREEN}✓ Model selection logic & Repair loop defined")

✓ Model selection logic & Repair loop defined


In [None]:
def generate_clones_for_types(clone_types, output_path, target_per_type):
    """Generate validated clone pairs and stream them to a JSON Lines file.

    Problems returned by ``list_problems()`` are visited in order, wrapping
    around to the first problem for at most ``MAX_CYCLES`` passes, until every
    requested clone type has ``target_per_type`` accepted records. A record is
    written only when the seed passes ``validate_java`` and the generated code
    passes both ``quick_check_code_quality`` and ``validate_java``.

    Args:
        clone_types: Clone type names to generate (e.g. ``['type1', 'type2']``).
        output_path: Destination ``.jsonl`` file; it is overwritten.
        target_per_type: Number of accepted records wanted per clone type.

    Returns:
        ``(clone_counters, stats)``, where ``clone_counters`` maps each clone
        type to its accepted count and ``stats`` has the keys ``'no_seed'``,
        ``'seed_failed'`` and ``'failed'``. Returns ``None`` when
        ``list_problems()`` yields nothing.
    """
    # Make sure the destination directory exists before the writer opens the file.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Using the writer as a context manager guarantees it is closed on the early
    # return below and when a long run is interrupted or raises, not only on the
    # normal path.
    with jsonlines.open(output_path, mode='w', flush=True) as dataset_writer:
        problems = list_problems()
        if not problems:
            print(f"{Fore.RED}✗ No problems found in CodeNet directory")
            return None

        print(f"Found {len(problems)} problems in CodeNet")

        problems_to_process = problems[:MAX_PROBLEMS] if MAX_PROBLEMS else problems

        clone_counters = {ct: 0 for ct in clone_types}
        stats = {
            'no_seed': 0,
            'seed_failed': 0,
            'failed': 0,
        }

        current_problem_idx = 0
        problems_processed = 0  # completed passes over problems_to_process

        total_target = target_per_type * len(clone_types)
        with tqdm(total=total_target, desc=f"Generating {', '.join(clone_types)}") as pbar:
            while any(clone_counters[ct] < target_per_type for ct in clone_types):
                # Wrap around to the first problem once the list is exhausted.
                if current_problem_idx >= len(problems_to_process):
                    current_problem_idx = 0
                    problems_processed += 1
                    print(f"\n{Fore.YELLOW}Completed cycle {problems_processed}, cycling through problems again...")

                    # Safety check to prevent an infinite loop when targets are unreachable.
                    if problems_processed >= MAX_CYCLES:
                        print(f"\n{Fore.RED}Reached maximum cycles limit ({MAX_CYCLES}). Stopping generation.")
                        break

                problem_id = problems_to_process[current_problem_idx]
                current_problem_idx += 1

                try:
                    seed_code, _sub_id = choose_seed(problem_id)
                    if not seed_code:
                        stats['no_seed'] += 1
                        continue

                    # A problem without test cases cannot be validated; it is
                    # counted under 'no_seed', like a missing seed.
                    if not load_testcases(problem_id):
                        stats['no_seed'] += 1
                        continue

                    # Anything other than an exact "passed" (including the
                    # "compile_error..." results) disqualifies the seed.
                    if validate_java(seed_code, problem_id) != "passed":
                        stats['seed_failed'] += 1
                        continue

                    seed_line_count = len(seed_code.split('\n'))
                    clones_generated_this_problem = 0

                    for clone_type in clone_types:
                        if clone_counters[clone_type] >= target_per_type:
                            continue

                        if clones_generated_this_problem >= MAX_CLONES_PER_PROBLEM:
                            break

                        # Skip Type-4 for long seeds (75 lines or more).
                        if clone_type == 'type4' and seed_line_count >= 75:
                            continue

                        try:
                            # problem_id is passed so the repair loop can use it.
                            generated_code = generate_clone_v2(seed_code, clone_type, problem_id)
                            if not generated_code:
                                stats['failed'] += 1
                                continue

                            is_valid, _reason = quick_check_code_quality(generated_code)
                            if not is_valid:
                                stats['failed'] += 1
                                continue

                            # Final gate: re-validate the returned code before
                            # it is labelled as a clone.
                            if validate_java(generated_code, problem_id) != "passed":
                                stats['failed'] += 1
                                continue

                            pair_id = f"{problem_id}_{clone_type}_{uuid.uuid4().hex[:8]}"
                            record = {
                                'id': pair_id,
                                'code_1': seed_code,
                                'code_2': generated_code,
                                'label': "clone",
                                'clone_type': clone_type,
                                'language': 'Java',
                                'problem_id': problem_id,
                                'generator': get_model_for_clone_type(clone_type),
                                'timestamp': time.time(),
                            }

                            dataset_writer.write(record)
                            clone_counters[clone_type] += 1
                            clones_generated_this_problem += 1
                            pbar.update(1)

                        except Exception as e:
                            # One failed generation must not abort the whole run.
                            stats['failed'] += 1
                            log(f"[{problem_id}] {clone_type} error: {repr(e)}", Fore.RED)

                    # Progress update every 50 problems
                    if current_problem_idx % 50 == 0:
                        print(f"\n{Fore.CYAN}Progress Update - Problem {current_problem_idx}/{len(problems_to_process)} (Cycle {problems_processed + 1}):")
                        for ct, count in clone_counters.items():
                            status = "✓" if count >= target_per_type else f"{count}/{target_per_type}"
                            print(f"  {ct}: {status}")
                        print()

                except Exception as e:
                    stats['failed'] += 1
                    log(f"[{problem_id}] Unexpected error: {e}", Fore.RED)
                    continue

    return clone_counters, stats

# Cell-completion marker so the notebook user can see the definition above was executed.
print(f"{Fore.GREEN}✓ Core generation function defined (Updated with repair loop)")

✓ Core generation function defined (Updated with repair loop)


## 3. Generate Easy Clones (Type-1 and Type-2)
Use the fast `codegemma:2b` model to generate Type-1 and Type-2 clones efficiently.

In [None]:
"""
Step 1: Generate Easy Clones (Type-1 and Type-2) using codegemma:2b
"""

print(f"{Fore.CYAN}{'='*60}")
print(f"{Fore.CYAN}STEP 1: Generating Easy Clones (Type-1, Type-2)")
print(f"{Fore.CYAN}Model: {EASY_MODEL}")
print(f"{Fore.CYAN}Output: {EASY_OUTPUT_PATH}")
print(f"{Fore.CYAN}{'='*60}\n")

start_time = time.time()

# generate_clones_for_types returns None (not a tuple) when no CodeNet problems
# are found; fail with a clear message instead of an opaque unpacking TypeError.
easy_result = generate_clones_for_types(
    clone_types=['type1', 'type2'],
    output_path=EASY_OUTPUT_PATH,
    target_per_type=TARGET_CLONES_PER_TYPE
)
if easy_result is None:
    raise RuntimeError("Easy clone generation returned no result: no problems were found in CodeNet")
easy_counters, easy_stats = easy_result

elapsed_time = time.time() - start_time

print(f"\n{Fore.GREEN}{'='*60}")
print(f"{Fore.GREEN}✓ EASY CLONES GENERATION COMPLETE!")
print(f"{Fore.GREEN}{'='*60}")

# Per-type result against the configured target.
print(f"\n{Fore.CYAN}Clone Counts:")
for clone_type, count in easy_counters.items():
    status = "✓ COMPLETE" if count >= TARGET_CLONES_PER_TYPE else "⚠ INCOMPLETE"
    print(f"  {clone_type}: {count}/{TARGET_CLONES_PER_TYPE} {status}")
total_easy = sum(easy_counters.values())

print(f"\n{Fore.GREEN}Total easy clones: {total_easy}")
print(f"Time taken: {elapsed_time/60:.2f} minutes")
print(f"Dataset saved to: {EASY_OUTPUT_PATH}")

print(f"\n{Fore.YELLOW}Statistics:")
print(f"  No seed found: {easy_stats['no_seed']}")
print(f"  Seed validation failed: {easy_stats['seed_failed']}")
print(f"  Generation/validation failed: {easy_stats['failed']}")

STEP 1: Generating Easy Clones (Type-1, Type-2)
Model: deepseek-coder:6.7b
Output: dataset\java_clones_easy_types.jsonl

Found 4053 problems in CodeNet


Generating type1, type2: 100%|██████████| 2/2 [01:08<00:00, 34.30s/it]



✓ EASY CLONES GENERATION COMPLETE!

Clone Counts:
  type1: 1/1 ✓ COMPLETE
  type2: 1/1 ✓ COMPLETE

Total easy clones: 2
Time taken: 2.12 minutes
Dataset saved to: dataset\java_clones_easy_types.jsonl

Statistics:
  No seed found: 1
  Seed validation failed: 0
  Generation/validation failed: 0


## 4. Generate Hard Clones (Type-3 and Type-4)
Use the more capable `deepseek-coder:6.7b` model to generate complex Type-3 and Type-4 clones.

**Note:** This step will be significantly slower due to the larger model, but it ensures correct semantic transformations.

In [None]:
"""
Step 2: Generate Hard Clones (Type-3 and Type-4) using deepseek-coder:6.7b
"""

print(f"{Fore.CYAN}{'='*60}")
print(f"{Fore.CYAN}STEP 2: Generating Hard Clones (Type-3, Type-4)")
print(f"{Fore.CYAN}Model: {HARD_MODEL}")
print(f"{Fore.CYAN}Output: {HARD_OUTPUT_PATH}")
print(f"{Fore.CYAN}{'='*60}\n")

start_time = time.time()

# generate_clones_for_types returns None (not a tuple) when no CodeNet problems
# are found; fail with a clear message instead of an opaque unpacking TypeError.
hard_result = generate_clones_for_types(
    clone_types=['type3', 'type4'],
    output_path=HARD_OUTPUT_PATH,
    target_per_type=TARGET_CLONES_PER_TYPE
)
if hard_result is None:
    raise RuntimeError("Hard clone generation returned no result: no problems were found in CodeNet")
hard_counters, hard_stats = hard_result

elapsed_time = time.time() - start_time

print(f"\n{Fore.GREEN}{'='*60}")
print(f"{Fore.GREEN}✓ HARD CLONES GENERATION COMPLETE!")
print(f"{Fore.GREEN}{'='*60}")

# Per-type result against the configured target.
print(f"\n{Fore.CYAN}Clone Counts:")
for clone_type, count in hard_counters.items():
    status = "✓ COMPLETE" if count >= TARGET_CLONES_PER_TYPE else "⚠ INCOMPLETE"
    print(f"  {clone_type}: {count}/{TARGET_CLONES_PER_TYPE} {status}")
total_hard = sum(hard_counters.values())

print(f"\n{Fore.GREEN}Total hard clones: {total_hard}")
print(f"Time taken: {elapsed_time/60:.2f} minutes")
print(f"Dataset saved to: {HARD_OUTPUT_PATH}")

print(f"\n{Fore.YELLOW}Statistics:")
print(f"  No seed found: {hard_stats['no_seed']}")
print(f"  Seed validation failed: {hard_stats['seed_failed']}")
print(f"  Generation/validation failed: {hard_stats['failed']}")

STEP 2: Generating Hard Clones (Type-3, Type-4)
Model: deepseek-coder:6.7b
Output: dataset\java_clones_hard_types.jsonl

Found 4053 problems in CodeNet


Generating type3, type4:  50%|█████     | 1/2 [00:28<00:28, 28.41s/it]

  Attempting repair for error: wrong_answer
  Attempting repair for error: wrong_answer
  Repair successful!


Generating type3, type4: 100%|██████████| 2/2 [02:31<00:00, 75.78s/it]



✓ HARD CLONES GENERATION COMPLETE!

Clone Counts:
  type3: 1/1 ✓ COMPLETE
  type4: 1/1 ✓ COMPLETE

Total hard clones: 2
Time taken: 2.55 minutes
Dataset saved to: dataset\java_clones_hard_types.jsonl

Statistics:
  No seed found: 1
  Seed validation failed: 0
  Generation/validation failed: 0


## 5. Combine Datasets
Merge the easy and hard clone datasets into a single comprehensive dataset.

In [None]:
"""
Step 3: Combine Easy and Hard Clone Datasets
"""

# Step 3 driver: merge the easy and hard clone files into COMBINED_OUTPUT_PATH
# and print a per-type count of what was written.
print(f"{Fore.CYAN}{'='*60}")
print(f"{Fore.CYAN}STEP 3: Combining Datasets")
print(f"{Fore.CYAN}{'='*60}\n")

# Read both datasets. A missing input file is tolerated: its list stays empty
# and a warning is printed instead of raising.
easy_records = []
hard_records = []

if EASY_OUTPUT_PATH.exists():
    with jsonlines.open(EASY_OUTPUT_PATH) as reader:
        easy_records = list(reader)
    print(f"{Fore.GREEN}✓ Loaded {len(easy_records)} records from {EASY_OUTPUT_PATH}")
else:
    print(f"{Fore.YELLOW}⚠ Easy clones file not found: {EASY_OUTPUT_PATH}")

if HARD_OUTPUT_PATH.exists():
    with jsonlines.open(HARD_OUTPUT_PATH) as reader:
        hard_records = list(reader)
    print(f"{Fore.GREEN}✓ Loaded {len(hard_records)} records from {HARD_OUTPUT_PATH}")
else:
    print(f"{Fore.YELLOW}⚠ Hard clones file not found: {HARD_OUTPUT_PATH}")

# Combine records (easy first, then hard; no shuffling or de-duplication)
all_records = easy_records + hard_records

print(f"\n{Fore.CYAN}Total records to write: {len(all_records)}")

# Write combined dataset; mode='w' overwrites any previous combined file,
# and the file is written even when all_records is empty.
with jsonlines.open(COMBINED_OUTPUT_PATH, mode='w') as writer:
    for record in all_records:
        writer.write(record)

print(f"{Fore.GREEN}✓ Combined dataset written to: {COMBINED_OUTPUT_PATH}")

# Count by type; records without a 'clone_type' key are tallied as 'unknown'
type_counts = {}
for record in all_records:
    clone_type = record.get('clone_type', 'unknown')
    type_counts[clone_type] = type_counts.get(clone_type, 0) + 1

# Only the four known clone types are reported; absent types print as 0.
print(f"\n{Fore.CYAN}Distribution by Clone Type:")
for clone_type in ['type1', 'type2', 'type3', 'type4']:
    count = type_counts.get(clone_type, 0)
    print(f"  {clone_type}: {count}")

print(f"\n{Fore.GREEN}{'='*60}")
print(f"{Fore.GREEN}✓ DATASET COMBINATION COMPLETE!")
print(f"{Fore.GREEN}{'='*60}")

STEP 3: Combining Datasets

✓ Loaded 2 records from dataset\java_clones_easy_types.jsonl
✓ Loaded 2 records from dataset\java_clones_hard_types.jsonl

Total records to write: 4
✓ Combined dataset written to: dataset\java_clones_10k.jsonl

Distribution by Clone Type:
  type1: 1
  type2: 1
  type3: 1
  type4: 1

✓ DATASET COMBINATION COMPLETE!


## 6. Validate Combined Dataset
Perform validation and quality checks on the final combined dataset.

In [None]:
"""
Step 4: Validate the Combined Dataset
"""

# Step 4 driver: reload the combined clone dataset and print distribution,
# model-assignment and required-field checks. Nothing is modified on disk.
print(f"{Fore.CYAN}{'='*60}")
print(f"{Fore.CYAN}STEP 4: Dataset Validation")
print(f"{Fore.CYAN}{'='*60}\n")

# Load combined dataset
combined_records = []
if COMBINED_OUTPUT_PATH.exists():
    with jsonlines.open(COMBINED_OUTPUT_PATH) as reader:
        combined_records = list(reader)
    print(f"{Fore.GREEN}✓ Loaded {len(combined_records)} records from {COMBINED_OUTPUT_PATH}")
else:
    print(f"{Fore.RED}✗ Combined dataset not found: {COMBINED_OUTPUT_PATH}")

# All checks below run only when at least one record was loaded.
if combined_records:
    # Validation checks
    print(f"\n{Fore.CYAN}Validation Checks:")

    # 1. Tally records per clone type and per generator model
    #    (missing keys are counted under 'unknown').
    type_distribution = {}
    model_distribution = {}

    for record in combined_records:
        clone_type = record.get('clone_type', 'unknown')
        model = record.get('generator', 'unknown')

        type_distribution[clone_type] = type_distribution.get(clone_type, 0) + 1
        model_distribution[model] = model_distribution.get(model, 0) + 1

    # A clone type with zero records is flagged with "✗".
    print(f"\n{Fore.CYAN}1. Clone Type Distribution:")
    for clone_type in ['type1', 'type2', 'type3', 'type4']:
        count = type_distribution.get(clone_type, 0)
        percentage = (count / len(combined_records) * 100) if combined_records else 0
        status = "✓" if count > 0 else "✗"
        print(f"  {status} {clone_type}: {count} ({percentage:.1f}%)")

    print(f"\n{Fore.CYAN}2. Model Distribution:")
    for model, count in model_distribution.items():
        percentage = (count / len(combined_records) * 100) if combined_records else 0
        print(f"  {model}: {count} ({percentage:.1f}%)")

    # 3. Verify model assignment correctness: the stored 'generator' must equal
    #    what get_model_for_clone_type returns for the record's clone type under
    #    the CURRENT configuration.
    print(f"\n{Fore.CYAN}3. Model Assignment Verification:")
    correct_assignments = 0
    incorrect_assignments = 0

    for record in combined_records:
        clone_type = record.get('clone_type', '')
        model = record.get('generator', '')
        expected_model = get_model_for_clone_type(clone_type)

        if model == expected_model:
            correct_assignments += 1
        else:
            incorrect_assignments += 1

    if incorrect_assignments == 0:
        print(f"  ✓ All {correct_assignments} records have correct model assignments")
    else:
        print(f"  ⚠ {correct_assignments} correct, {incorrect_assignments} incorrect")

    # 4. Check for required fields; a field counts as missing when the key is
    #    absent OR its value is falsy (e.g. an empty string).
    print(f"\n{Fore.CYAN}4. Required Fields Check:")
    required_fields = ['id', 'code_1', 'code_2', 'label', 'clone_type', 'language', 'problem_id', 'generator']

    missing_fields = {}
    for record in combined_records:
        for field in required_fields:
            if field not in record or not record[field]:
                missing_fields[field] = missing_fields.get(field, 0) + 1

    if not missing_fields:
        print(f"  ✓ All records have all required fields")
    else:
        for field, count in missing_fields.items():
            print(f"  ⚠ {field}: missing in {count} records")

    # 5. Summary statistics
    print(f"\n{Fore.GREEN}{'='*60}")
    print(f"{Fore.GREEN}FINAL DATASET SUMMARY")
    print(f"{Fore.GREEN}{'='*60}")
    print(f"{Fore.CYAN}Total Records: {len(combined_records)}")
    print(f"{Fore.CYAN}Dataset Path: {COMBINED_OUTPUT_PATH}")
    # File size is reported in MiB (bytes / 1024 / 1024).
    print(f"{Fore.CYAN}File Size: {COMBINED_OUTPUT_PATH.stat().st_size / (1024*1024):.2f} MB")

    print(f"\n{Fore.CYAN}Clone Type Breakdown:")
    for clone_type in ['type1', 'type2', 'type3', 'type4']:
        count = type_distribution.get(clone_type, 0)
        print(f"  {clone_type}: {count}/{TARGET_CLONES_PER_TYPE}")

    print(f"\n{Fore.GREEN}✓ Validation Complete!")
else:
    print(f"{Fore.RED}✗ No records found for validation")

STEP 4: Dataset Validation

✓ Loaded 4 records from dataset\java_clones_10k.jsonl

Validation Checks:

1. Clone Type Distribution:
  ✓ type1: 1 (25.0%)
  ✓ type2: 1 (25.0%)
  ✓ type3: 1 (25.0%)
  ✓ type4: 1 (25.0%)

2. Model Distribution:
  deepseek-coder:6.7b: 4 (100.0%)

3. Model Assignment Verification:
  ✓ All 4 records have correct model assignments

4. Required Fields Check:
  ✓ All records have all required fields

FINAL DATASET SUMMARY
Total Records: 4
Dataset Path: dataset\java_clones_10k.jsonl
File Size: 0.00 MB

Clone Type Breakdown:
  type1: 1/1
  type2: 1/1
  type3: 1/1
  type4: 1/1

✓ Validation Complete!


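## 7. Generate Easy Non-Clones
Use the fast `codegemma:2b` model to generate easy non-clones with simple algorithmic differences.
The cell below defines `generate_nonclones_for_types`, the shared generation routine that handles both the `easy` and `hard` non-clone types.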

In [None]:
def _nonclone_compiles_and_terminates(generated_code):
    """Return True when ``generated_code`` compiles and its run on empty stdin does not time out."""
    with tempfile.TemporaryDirectory() as temp_dir:
        (Path(temp_dir) / "Main.java").write_text(generated_code, encoding='utf-8')

        compile_success, _compile_error = compile_java(temp_dir)
        if not compile_success:
            return False

        # The program is run with EMPTY input, so a runtime error is accepted;
        # only a reported timeout rejects the candidate.
        _output, error = run_java_with_input(temp_dir, "", timeout=5)
        return error is None or "timeout" not in error.lower()


def generate_nonclones_for_types(nonclone_types, output_path, target_per_type):
    """Generate non-clone pairs and stream them to a JSON Lines file.

    Problems returned by ``list_problems()`` are visited in order, wrapping
    around to the first problem for at most ``MAX_CYCLES`` passes, until every
    requested non-clone type has ``target_per_type`` accepted records. The seed
    must pass ``validate_java``; the generated code must differ from the seed,
    pass ``quick_check_code_quality``, compile, and not time out on empty
    input. It is NOT run against the seed problem's test cases.

    Args:
        nonclone_types: Non-clone type names to generate (e.g. ``['hard']``).
        output_path: Destination ``.jsonl`` file; it is overwritten.
        target_per_type: Number of accepted records wanted per non-clone type.

    Returns:
        ``(nonclone_counters, stats)``, where ``nonclone_counters`` maps each
        type to its accepted count and ``stats`` has the keys ``'failed'``,
        ``'seed_failed'``, ``'no_seed'`` and ``'identical_skipped'``. Returns
        ``None`` when ``list_problems()`` yields nothing.
    """
    # Make sure the destination directory exists before the writer opens the file.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Using the writer as a context manager guarantees it is closed on the early
    # return below and when a long run is interrupted or raises, not only on the
    # normal path.
    with jsonlines.open(output_path, mode='w', flush=True) as dataset_writer:
        problems = list_problems()
        if not problems:
            print(f"{Fore.RED}✗ No problems found in CodeNet directory")
            return None

        print(f"Found {len(problems)} problems in CodeNet")

        problems_to_process = problems[:MAX_PROBLEMS] if MAX_PROBLEMS else problems

        nonclone_counters = {nt: 0 for nt in nonclone_types}
        stats = {
            'failed': 0,
            'seed_failed': 0,
            'no_seed': 0,
            'identical_skipped': 0,
        }

        current_problem_idx = 0
        problems_processed = 0  # completed passes over problems_to_process

        total_target = target_per_type * len(nonclone_types)
        with tqdm(total=total_target, desc=f"Generating {', '.join(nonclone_types)} non-clones") as pbar:
            while any(nonclone_counters[nt] < target_per_type for nt in nonclone_types):
                # Wrap around to the first problem once the list is exhausted.
                if current_problem_idx >= len(problems_to_process):
                    current_problem_idx = 0
                    problems_processed += 1
                    print(f"\n{Fore.YELLOW}Completed cycle {problems_processed}, cycling through problems again...")

                    # Safety check to prevent an infinite loop when targets are unreachable.
                    if problems_processed >= MAX_CYCLES:
                        print(f"\n{Fore.RED}Reached maximum cycles limit ({MAX_CYCLES}). Stopping generation.")
                        break

                problem_id = problems_to_process[current_problem_idx]
                current_problem_idx += 1

                try:
                    seed_code, _sub_id = choose_seed(problem_id)
                    if not seed_code:
                        stats['no_seed'] += 1
                        continue

                    # A problem without test cases cannot have its seed
                    # validated; it is counted under 'no_seed'.
                    if not load_testcases(problem_id):
                        stats['no_seed'] += 1
                        continue

                    # Anything other than an exact "passed" (including the
                    # "compile_error..." results) disqualifies the seed.
                    if validate_java(seed_code, problem_id) != "passed":
                        stats['seed_failed'] += 1
                        continue

                    nonclones_generated_this_problem = 0

                    for nonclone_type in nonclone_types:
                        if nonclone_counters[nonclone_type] >= target_per_type:
                            continue

                        if nonclones_generated_this_problem >= MAX_CLONES_PER_PROBLEM:
                            break

                        try:
                            # problem_id=None: no test-case validation in the repair loop.
                            generated_code = generate_nonclone_v2(seed_code, nonclone_type, problem_id=None)
                            if not generated_code:
                                stats['failed'] += 1
                                continue

                            # Output that matches the seed token-for-token
                            # (ignoring whitespace) is the same program and must
                            # never be labelled "non-clone".
                            if generated_code.split() == seed_code.split():
                                stats['identical_skipped'] += 1
                                continue

                            is_valid, _reason = quick_check_code_quality(generated_code)
                            if not is_valid:
                                stats['failed'] += 1
                                continue

                            if not _nonclone_compiles_and_terminates(generated_code):
                                stats['failed'] += 1
                                continue

                            pair_id = f"{problem_id}_nonclone_{nonclone_type}_{uuid.uuid4().hex[:8]}"
                            record = {
                                'id': pair_id,
                                'code_1': seed_code,
                                'code_2': generated_code,
                                'label': "non-clone",
                                'clone_type': f"nonclone_{nonclone_type}",
                                'language': 'Java',
                                'problem_id': problem_id,
                                'generator': get_model_for_nonclone_type(nonclone_type),
                                'timestamp': time.time(),
                            }

                            dataset_writer.write(record)
                            nonclone_counters[nonclone_type] += 1
                            nonclones_generated_this_problem += 1
                            pbar.update(1)

                        except Exception as e:
                            # One failed generation must not abort the run, but
                            # it is logged rather than silently swallowed.
                            stats['failed'] += 1
                            log(f"[{problem_id}] {nonclone_type} non-clone error: {repr(e)}", Fore.RED)

                    # Progress update every 50 problems
                    if current_problem_idx % 50 == 0:
                        print(f"\n{Fore.CYAN}Progress Update - Problem {current_problem_idx}/{len(problems_to_process)} (Cycle {problems_processed + 1}):")
                        for nt, count in nonclone_counters.items():
                            status = "✓" if count >= target_per_type else f"{count}/{target_per_type}"
                            print(f"  {nt} non-clones: {status}")
                        print()

                except Exception as e:
                    stats['failed'] += 1
                    log(f"[{problem_id}] Unexpected error: {e}", Fore.RED)
                    continue

    return nonclone_counters, stats

# Cell-completion marker so the notebook user can see the definition above was executed.
print(f"{Fore.GREEN}✓ Non-clone generation function defined")

✓ Non-clone generation function defined


## 8. Generate Hard Non-Clones
Use the more capable `deepseek-coder:6.7b` model to generate hard non-clones with different problem domains.

In [None]:
"""
Step 6: Generate Hard Non-Clones using deepseek-coder:6.7b
"""

print(f"{Fore.CYAN}{'='*60}")
print(f"{Fore.CYAN}STEP 6: Generating Hard Non-Clones")
print(f"{Fore.CYAN}Model: {HARD_MODEL}")
print(f"{Fore.CYAN}Output: {HARD_NONCLONES_OUTPUT_PATH}")
print(f"{Fore.CYAN}{'='*60}\n")

start_time = time.time()

# generate_nonclones_for_types returns None (not a tuple) when no CodeNet
# problems are found; fail with a clear message instead of an opaque unpacking
# TypeError.
hard_nonclone_result = generate_nonclones_for_types(
    nonclone_types=['hard'],
    output_path=HARD_NONCLONES_OUTPUT_PATH,
    target_per_type=TARGET_NONCLONES_HARD
)
if hard_nonclone_result is None:
    raise RuntimeError("Hard non-clone generation returned no result: no problems were found in CodeNet")
hard_nonclone_counters, hard_nonclone_stats = hard_nonclone_result

elapsed_time = time.time() - start_time

print(f"\n{Fore.GREEN}{'='*60}")
print(f"{Fore.GREEN}✓ HARD NON-CLONES GENERATION COMPLETE!")
print(f"{Fore.GREEN}{'='*60}")

# Per-type result against the configured target.
print(f"\n{Fore.CYAN}Non-Clone Counts:")
for nonclone_type, count in hard_nonclone_counters.items():
    status = "✓ COMPLETE" if count >= TARGET_NONCLONES_HARD else "⚠ INCOMPLETE"
    print(f"  {nonclone_type}: {count}/{TARGET_NONCLONES_HARD} {status}")
total_hard_nonclones = sum(hard_nonclone_counters.values())

print(f"\n{Fore.GREEN}Total hard non-clones: {total_hard_nonclones}")
print(f"Time taken: {elapsed_time/60:.2f} minutes")
print(f"Dataset saved to: {HARD_NONCLONES_OUTPUT_PATH}")

print(f"\n{Fore.YELLOW}Statistics:")
print(f"  No seed found: {hard_nonclone_stats['no_seed']}")
print(f"  Seed validation failed: {hard_nonclone_stats['seed_failed']}")
print(f"  Generation/validation failed: {hard_nonclone_stats['failed']}")
# .get keeps this line safe if the stats dict ever lacks the key.
print(f"  Identical to seed (skipped): {hard_nonclone_stats.get('identical_skipped', 0)}")

STEP 6: Generating Hard Non-Clones
Model: deepseek-coder:6.7b
Output: dataset\java_nonclones_hard_types.jsonl

Found 4053 problems in CodeNet


Generating hard non-clones: 100%|██████████| 1/1 [00:58<00:00, 58.49s/it]



✓ HARD NON-CLONES GENERATION COMPLETE!

Non-Clone Counts:
  hard: 1/1 ✓ COMPLETE

Total hard non-clones: 1
Time taken: 0.99 minutes
Dataset saved to: dataset\java_nonclones_hard_types.jsonl

Statistics:
  No seed found: 1
  Seed validation failed: 0
  Generation/validation failed: 0


## 9. Create Complete Dataset
Combine all clones and non-clones into a single comprehensive dataset for training.

In [None]:
"""
Step 7: Create Complete Dataset (Clones + Non-Clones)
"""

# Step 7 driver: concatenate the combined clone file with the easy and hard
# non-clone files into FINAL_DATASET_PATH and print label/type distributions.
print(f"{Fore.CYAN}{'='*60}")
print(f"{Fore.CYAN}STEP 7: Creating Complete Dataset")
print(f"{Fore.CYAN}{'='*60}\n")

# Read all datasets. A missing input file is tolerated: its list stays empty
# and a warning is printed instead of raising.
clone_records = []
easy_nonclone_records = []
hard_nonclone_records = []

if COMBINED_OUTPUT_PATH.exists():
    with jsonlines.open(COMBINED_OUTPUT_PATH) as reader:
        clone_records = list(reader)
    print(f"{Fore.GREEN}✓ Loaded {len(clone_records)} clone records from {COMBINED_OUTPUT_PATH}")
else:
    print(f"{Fore.YELLOW}⚠ Clone dataset file not found: {COMBINED_OUTPUT_PATH}")

if EASY_NONCLONES_OUTPUT_PATH.exists():
    with jsonlines.open(EASY_NONCLONES_OUTPUT_PATH) as reader:
        easy_nonclone_records = list(reader)
    print(f"{Fore.GREEN}✓ Loaded {len(easy_nonclone_records)} easy non-clone records from {EASY_NONCLONES_OUTPUT_PATH}")
else:
    print(f"{Fore.YELLOW}⚠ Easy non-clones file not found: {EASY_NONCLONES_OUTPUT_PATH}")

if HARD_NONCLONES_OUTPUT_PATH.exists():
    with jsonlines.open(HARD_NONCLONES_OUTPUT_PATH) as reader:
        hard_nonclone_records = list(reader)
    print(f"{Fore.GREEN}✓ Loaded {len(hard_nonclone_records)} hard non-clone records from {HARD_NONCLONES_OUTPUT_PATH}")
else:
    print(f"{Fore.YELLOW}⚠ Hard non-clones file not found: {HARD_NONCLONES_OUTPUT_PATH}")

# Combine all records (clones first, then easy and hard non-clones;
# no shuffling or de-duplication)
all_complete_records = clone_records + easy_nonclone_records + hard_nonclone_records

print(f"\n{Fore.CYAN}Total records to write: {len(all_complete_records)}")
print(f"  Clones: {len(clone_records)}")
print(f"  Easy Non-clones: {len(easy_nonclone_records)}")
print(f"  Hard Non-clones: {len(hard_nonclone_records)}")

# Write complete dataset; mode='w' overwrites any previous file, and the file
# is written even when there are no records.
with jsonlines.open(FINAL_DATASET_PATH, mode='w') as writer:
    for record in all_complete_records:
        writer.write(record)

print(f"{Fore.GREEN}✓ Complete dataset written to: {FINAL_DATASET_PATH}")

# Count by type and label; records missing either key are tallied as 'unknown'
label_counts = {}
type_counts = {}
for record in all_complete_records:
    label = record.get('label', 'unknown')
    clone_type = record.get('clone_type', 'unknown')

    label_counts[label] = label_counts.get(label, 0) + 1
    type_counts[clone_type] = type_counts.get(clone_type, 0) + 1

# The conditional guards against division by zero when nothing was loaded.
print(f"\n{Fore.CYAN}Distribution by Label:")
for label in ['clone', 'non-clone']:
    count = label_counts.get(label, 0)
    percentage = (count / len(all_complete_records) * 100) if all_complete_records else 0
    print(f"  {label}: {count} ({percentage:.1f}%)")

print(f"\n{Fore.CYAN}Distribution by Type:")
for type_name in ['type1', 'type2', 'type3', 'type4', 'nonclone_easy', 'nonclone_hard']:
    count = type_counts.get(type_name, 0)
    percentage = (count / len(all_complete_records) * 100) if all_complete_records else 0
    print(f"  {type_name}: {count} ({percentage:.1f}%)")

print(f"\n{Fore.GREEN}{'='*60}")
print(f"{Fore.GREEN}✓ COMPLETE DATASET CREATION FINISHED!")
print(f"{Fore.GREEN}{'='*60}")
print(f"{Fore.CYAN}Final Dataset: {FINAL_DATASET_PATH}")
print(f"{Fore.CYAN}Total Records: {len(all_complete_records)}")
# File size is reported in MiB (bytes / 1024 / 1024).
print(f"{Fore.CYAN}File Size: {FINAL_DATASET_PATH.stat().st_size / (1024*1024):.2f} MB")

STEP 7: Creating Complete Dataset

✓ Loaded 4 clone records from dataset\java_clones_10k.jsonl
⚠ Easy non-clones file not found: dataset\java_nonclones_easy_types.jsonl
✓ Loaded 1 hard non-clone records from dataset\java_nonclones_hard_types.jsonl

Total records to write: 5
  Clones: 4
  Easy Non-clones: 0
  Hard Non-clones: 1
✓ Complete dataset written to: dataset\java_complete_dataset.jsonl

Distribution by Label:
  clone: 4 (80.0%)
  non-clone: 1 (20.0%)

Distribution by Type:
  type1: 1 (20.0%)
  type2: 1 (20.0%)
  type3: 1 (20.0%)
  type4: 1 (20.0%)
  nonclone_easy: 0 (0.0%)
  nonclone_hard: 1 (20.0%)

✓ COMPLETE DATASET CREATION FINISHED!
Final Dataset: dataset\java_complete_dataset.jsonl
Total Records: 5
File Size: 0.01 MB
