# Code Clone Detection Dataset Generation Pipeline

## Overview
This notebook generates a complete code clone detection dataset with 1M+ examples.

**Dataset Composition:**
- **Type-1 Clones** (10%): Exact copies with whitespace/comment variations
- **Type-2 Clones** (20%): Copies with renamed identifiers
- **Type-3 Clones** (10%): Copies with structural modifications
- **Type-4 Clones** (10%): Semantically similar, syntactically different
- **Non-clones (Easy)** (25%): Different problems
- **Non-clones (Hard)** (25%): Same problem, different approaches

**Supported Languages:** Java, C, C++, Go, Python, JavaScript, C#

**Tools:** Tree-sitter for parsing, PolyglotParser for multi-language support

---
## Cell 1: Setup and Configuration

In [None]:
# Standard library imports
import os
import sys
import logging
import random
from pathlib import Path
from datetime import datetime
import warnings

# Data manipulation
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Import helper modules from scripts folder
from scripts.config import (
    CODENET_ROOT,
    OUTPUT_DIR,
    LANGUAGES,
    TARGET_PAIRS,
    CLONE_TYPE_RATIOS,
    SPLIT_RATIOS,
    MIN_CODE_SIZE
)
from scripts.parser import PolyglotParser
from scripts.indexer import build_master_index
from scripts.clone_generators import (
    generate_type1,
    generate_type2,
    generate_type3,
    get_type4_pairs,
    sample_clone_pairs
)

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Override output directory to save in processed folder
OUTPUT_DIR_OVERRIDE = Path(__file__).parent / "processed" if '__file__' in globals() else Path.cwd() / "processed"

# Configuration
print("="*80)
print("CODE CLONE DATASET GENERATION PIPELINE")
print("="*80)
print(f"\nDataset Root: {CODENET_ROOT}")
print(f"Output Directory: {OUTPUT_DIR_OVERRIDE}")
print(f"Target Pairs: {TARGET_PAIRS:,}")
print(f"Languages: {', '.join(LANGUAGES)}")
print(f"Minimum Code Size: {MIN_CODE_SIZE} bytes")
print(f"\nClone Type Distribution:")
for clone_type, ratio in CLONE_TYPE_RATIOS.items():
    # Every submission will have all clone types, so report 100% and full TARGET_PAIRS count
    count = TARGET_PAIRS
    print(f"   {clone_type}: 100% ({count:,} pairs)")
print(f"\nDataset Splits:")
for split, ratio in SPLIT_RATIOS.items():
    count = int(TARGET_PAIRS * ratio)
    print(f"   {split}: {ratio*100:.0f}% ({count:,} pairs)")
print("="*80)

# Create output directory
output_path = Path(OUTPUT_DIR_OVERRIDE)
output_path.mkdir(parents=True, exist_ok=True)
logger.info(f"Output directory ready: {output_path}")

# Initialize parser
parser = PolyglotParser()
logger.info("PolyglotParser initialized successfully")

print("\nSetup complete!")

2025-12-07 21:58:01 - INFO - Output directory ready: /home/dasunwickr/SLIIT/Y4S1/4YRG/Datasets/processed_clones
2025-12-07 21:58:01 - INFO - PolyglotParser initialized successfully
2025-12-07 21:58:01 - INFO - PolyglotParser initialized successfully


CODE CLONE DATASET GENERATION PIPELINE

Dataset Root: /home/dasunwickr/SLIIT/Y4S1/4YRG/Datasets/Project_CodeNet
Output Directory: /home/dasunwickr/SLIIT/Y4S1/4YRG/Datasets/processed_clones
Target Pairs: 100,000
Languages: java, python, c, cpp, go, javascript, c_sharp
Minimum Code Size: 50 bytes

Clone Type Distribution:
   type1: 100% (100,000 pairs)
   type2: 100% (100,000 pairs)
   type3: 100% (100,000 pairs)
   type4: 100% (100,000 pairs)
   negative_easy: 100% (100,000 pairs)
   negative_hard: 100% (100,000 pairs)

Dataset Splits:
   train: 60% (60,000 pairs)
   val: 20% (20,000 pairs)
   test: 20% (20,000 pairs)

Setup complete!


---
## Cell 2: Load Program List and Build Master Index

In [10]:
print("\n" + "="*80)
print("BUILDING MASTER INDEX")
print("="*80)

# Build master index of all accepted submissions
logger.info("Building master index from CodeNet metadata...")
master_index = build_master_index(
    codenet_root=CODENET_ROOT,
    languages=LANGUAGES,
    min_code_size=MIN_CODE_SIZE,
    use_cache=True
)

# Analyze the index
total_problems = len(master_index)
total_submissions = sum(len(subs) for lang_dict in master_index.values() for subs in lang_dict.values())

print(f"\nIndex Statistics:")
print(f"   Total Problems: {total_problems:,}")
print(f"   Total Accepted Submissions: {total_submissions:,}")

# Language distribution
lang_counts = {}
for problem_id, lang_dict in master_index.items():
    for lang, subs in lang_dict.items():
        if lang not in lang_counts:
            lang_counts[lang] = 0
        lang_counts[lang] += len(subs)

print(f"\nLanguage Distribution:")
for lang, count in sorted(lang_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"   {lang}: {count:,} submissions")

# Estimate potential pairs
potential_pairs = sum(
    len(subs) * (len(subs) - 1) // 2
    for lang_dict in master_index.values()
    for subs in lang_dict.values()
    if len(subs) >= 2
)
print(f"\nPotential Clone Pairs: {potential_pairs:,}")
print(f"   (This is enough to generate {TARGET_PAIRS:,}+ pairs)")

# Sample problems for dataset generation
all_problems = list(master_index.keys())
random.shuffle(all_problems)

print(f"\nMaster index built successfully!")
print(f"   Ready to generate {TARGET_PAIRS:,} code clone pairs")
print("="*80)

2025-12-07 21:58:21 - INFO - Building master index from CodeNet metadata...



BUILDING MASTER INDEX
Loading cached master index from /home/dasunwickr/SLIIT/Y4S1/4YRG/Datasets/Project_CodeNet/processed/master_index.pkl

Index Statistics:
   Total Problems: 3,593
   Total Accepted Submissions: 2,451,134

Language Distribution:
   python: 1,699,085 submissions
   java: 354,982 submissions
   c: 313,129 submissions
   go: 58,404 submissions
   javascript: 25,534 submissions

Potential Clone Pairs: 2,335,361,461
   (This is enough to generate 100,000+ pairs)

Master index built successfully!
   Ready to generate 100,000 code clone pairs

Index Statistics:
   Total Problems: 3,593
   Total Accepted Submissions: 2,451,134

Language Distribution:
   python: 1,699,085 submissions
   java: 354,982 submissions
   c: 313,129 submissions
   go: 58,404 submissions
   javascript: 25,534 submissions

Potential Clone Pairs: 2,335,361,461
   (This is enough to generate 100,000+ pairs)

Master index built successfully!
   Ready to generate 100,000 code clone pairs


---
## Cell 3: Generate Type-1 Clones

Type-1 clones are exact copies with variations in whitespace and comments.

In [None]:
print("\n" + "="*80)
print("GENERATING TYPE-1 CLONES")
print("="*80)

target_type1 = 10000  # 1M pairs for Type-1
logger.info(f"Target Type-1 pairs: {target_type1:,}")

type1_data = []
pair_id = 0

# Progress bar
pbar = tqdm(total=target_type1, desc="Type-1 Generation", unit="pairs")

# Generate Type-1 clones by removing comments and normalizing whitespace
for problem_id in all_problems:
    if len(type1_data) >= target_type1:
        break
    
    lang_dict = master_index[problem_id]
    
    for lang, submissions in lang_dict.items():
        if len(type1_data) >= target_type1:
            break
        
        if len(submissions) < 1:
            continue
        
        # Load a submission and generate its Type-1 normalized version
        from scripts.clone_generators import _load_code_from_submission
        
        for sub_id in submissions[:min(5, len(submissions))]:  # Limit per problem
            if len(type1_data) >= target_type1:
                break
            
            try:
                original_code = _load_code_from_submission(
                    CODENET_ROOT, problem_id, sub_id, lang
                )
                
                if len(original_code.strip()) < MIN_CODE_SIZE:
                    continue
                
                # Generate Type-1 normalized version
                type1_code = generate_type1(original_code, lang, parser)
                
                if type1_code and type1_code != original_code:
                    type1_data.append({
                        'id': f'type1_{pair_id}',
                        'code_1': original_code,
                        'code_2': type1_code,
                        'label': 1,
                        'type': 'type1',
                        'language': lang,
                        'problem_id': problem_id
                    })
                    pair_id += 1
                    pbar.update(1)
                    
            except Exception as e:
                logger.debug(f"Error generating Type-1 for {problem_id}/{sub_id}: {e}")
                continue

pbar.close()

type1_df = pd.DataFrame(type1_data)
print(f"\nGenerated {len(type1_df):,} Type-1 clone pairs")
print(f"   Target: {target_type1:,}")
print(f"   Success rate: {len(type1_df)/target_type1*100:.1f}%")
print("="*80)

2025-12-07 21:58:33 - INFO - Target Type-1 pairs: 1,000,000



GENERATING TYPE-1 CLONES


Type-1 Generation:   5%|▌         | 52842/1000000 [04:24<1:18:52, 200.12pairs/s]


Generated 52,842 Type-1 clone pairs
   Target: 1,000,000
   Success rate: 5.3%





---
## Cell 4: Generate Type-2 Clones

Type-2 clones are copies with renamed identifiers (variables, functions).

In [None]:
print("\n" + "="*80)
print("GENERATING TYPE-2 CLONES")
print("="*80)

target_type2 = 10000  # 1M pairs for Type-
logger.info(f"Target Type-2 pairs: {target_type2:,}")

type2_data = []
pair_id = 0

# Progress bar
pbar = tqdm(total=target_type2, desc="Type-2 Generation", unit="pairs")

# Generate Type-2 clones by renaming identifiers
for problem_id in all_problems:
    if len(type2_data) >= target_type2:
        break
    
    lang_dict = master_index[problem_id]
    
    for lang, submissions in lang_dict.items():
        if len(type2_data) >= target_type2:
            break
        
        if len(submissions) < 1:
            continue
        
        from scripts.clone_generators import _load_code_from_submission
        
        for sub_id in submissions[:min(10, len(submissions))]:  # More per problem for Type-2
            if len(type2_data) >= target_type2:
                break
            
            try:
                original_code = _load_code_from_submission(
                    CODENET_ROOT, problem_id, sub_id, lang
                )
                
                if len(original_code.strip()) < MIN_CODE_SIZE:
                    continue
                
                # Generate Type-2 with renamed identifiers
                type2_code = generate_type2(original_code, lang, parser)
                
                if type2_code and type2_code != original_code:
                    type2_data.append({
                        'id': f'type2_{pair_id}',
                        'code_1': original_code,
                        'code_2': type2_code,
                        'label': 1,
                        'type': 'type2',
                        'language': lang,
                        'problem_id': problem_id
                    })
                    pair_id += 1
                    pbar.update(1)
                    
            except Exception as e:
                logger.debug(f"Error generating Type-2 for {problem_id}/{sub_id}: {e}")
                continue

pbar.close()

type2_df = pd.DataFrame(type2_data)
print(f"\nGenerated {len(type2_df):,} Type-2 clone pairs")
print(f"   Target: {target_type2:,}")
print(f"   Success rate: {len(type2_df)/target_type2*100:.1f}%")
print("="*80)

2025-12-07 22:04:09 - INFO - Target Type-2 pairs: 1,000,000



GENERATING TYPE-2 CLONES


Type-2 Generation:   5%|▌         | 50820/1000000 [05:26<3:54:08, 67.57pairs/s] 

KeyboardInterrupt: 

---
## Cell 5: Generate Type-3 Clones

Type-3 clones are copies with structural modifications (loop transformations, added statements).

In [None]:
print("\n" + "="*80)
print("GENERATING TYPE-3 CLONES")
print("="*80)

target_type3 = 10000  # 1M pairs for Type-3
logger.info(f"Target Type-3 pairs: {target_type3:,}")

type3_data = []
pair_id = 0

# Check if LLM is available (optional)
USE_LLM = True  # Set to True if Ollama is running locally

# Progress bar
pbar = tqdm(total=target_type3, desc="Type-3 Generation", unit="pairs")

# Generate Type-3 clones with structural mutations
for problem_id in all_problems:
    if len(type3_data) >= target_type3:
        break
    
    lang_dict = master_index[problem_id]
    
    for lang, submissions in lang_dict.items():
        if len(type3_data) >= target_type3:
            break
        
        if len(submissions) < 1:
            continue
        
        from scripts.clone_generators import _load_code_from_submission
        
        for sub_id in submissions[:min(5, len(submissions))]:
            if len(type3_data) >= target_type3:
                break
            
            try:
                original_code = _load_code_from_submission(
                    CODENET_ROOT, problem_id, sub_id, lang
                )
                
                if len(original_code.strip()) < MIN_CODE_SIZE:
                    continue
                
                # Generate Type-3 with structural mutations
                type3_code = generate_type3(
                    original_code, lang, use_llm=USE_LLM, parser=parser
                )
                
                if type3_code and type3_code != original_code:
                    type3_data.append({
                        'id': f'type3_{pair_id}',
                        'code_1': original_code,
                        'code_2': type3_code,
                        'label': 1,
                        'type': 'type3',
                        'language': lang,
                        'problem_id': problem_id
                    })
                    pair_id += 1
                    pbar.update(1)
                    
            except Exception as e:
                logger.debug(f"Error generating Type-3 for {problem_id}/{sub_id}: {e}")
                continue

pbar.close()

type3_df = pd.DataFrame(type3_data)
print(f"\nGenerated {len(type3_df):,} Type-3 clone pairs")
print(f"   Target: {target_type3:,}")
print(f"   Success rate: {len(type3_df)/target_type3*100:.1f}%")
print(f"   Method: {'LLM-based' if USE_LLM else 'Rule-based'}")
print("="*80)

---
## Cell 6: Generate Type-4 Clones

Type-4 clones are semantically similar but syntactically different implementations of the same problem.

In [None]:
print("\n" + "="*80)
print("GENERATING TYPE-4 CLONES")
print("="*80)

target_type4 = 10000  # 1M pairs for Type-4
logger.info(f"Target Type-4 pairs: {target_type4:,}")

type4_data = []
pair_id = 0

# Type-4 threshold for Jaccard similarity
TYPE4_THRESHOLD = 0.4

# Progress bar
pbar = tqdm(total=target_type4, desc="Type-4 Generation", unit="pairs")

# Generate Type-4 clones by finding low-similarity pairs from same problem
for problem_id in all_problems:
    if len(type4_data) >= target_type4:
        break
    
    lang_dict = master_index[problem_id]
    
    for lang, submissions in lang_dict.items():
        if len(type4_data) >= target_type4:
            break
        
        if len(submissions) < 2:
            continue
        
        # Build submissions dictionary for get_type4_pairs
        from scripts.clone_generators import _load_code_from_submission
        
        submissions_dict = {}
        for sub_id in submissions[:min(20, len(submissions))]:
            try:
                code = _load_code_from_submission(CODENET_ROOT, problem_id, sub_id, lang)
                if len(code.strip()) >= MIN_CODE_SIZE:
                    submissions_dict[sub_id] = {
                        'code': code,
                        'language': lang,
                        'status': 'accepted'
                    }
            except Exception:
                continue
        
        if len(submissions_dict) < 2:
            continue
        
        try:
            # Get Type-4 pairs using Jaccard similarity
            type4_pairs = get_type4_pairs(
                problem_id, submissions_dict, parser, threshold=TYPE4_THRESHOLD
            )
            
            for code1, code2, lang1, lang2 in type4_pairs:
                if len(type4_data) >= target_type4:
                    break
                
                type4_data.append({
                    'id': f'type4_{pair_id}',
                    'code_1': code1,
                    'code_2': code2,
                    'label': 1,
                    'type': 'type4',
                    'language': lang1,  # Could be different languages
                    'problem_id': problem_id
                })
                pair_id += 1
                pbar.update(1)
                
        except Exception as e:
            logger.debug(f"Error generating Type-4 for {problem_id}: {e}")
            continue

pbar.close()

type4_df = pd.DataFrame(type4_data)
print(f"\nGenerated {len(type4_df):,} Type-4 clone pairs")
print(f"   Target: {target_type4:,}")
print(f"   Success rate: {len(type4_df)/target_type4*100:.1f}%")
print(f"   Similarity threshold: < {TYPE4_THRESHOLD}")
print("="*80)

---
## Cell 7: Generate Non-clones (Easy)

Easy non-clones are pairs from completely different problems.

In [None]:
print("\n" + "="*80)
print("GENERATING EASY NON-CLONES")
print("="*80)

target_negative_easy = 1_000_000  # 1M pairs for Easy Non-clones
logger.info(f"Target Easy Non-clones: {target_negative_easy:,}")

negative_easy_data = []
pair_id = 0

# Progress bar
pbar = tqdm(total=target_negative_easy, desc="Easy Non-clone Generation", unit="pairs")

# Track used problems to avoid duplication
used_problem_pairs = set()

# Generate easy non-clones from different problems
attempts = 0
max_attempts = target_negative_easy * 20

from scripts.clone_generators import _load_code_from_submission

while len(negative_easy_data) < target_negative_easy and attempts < max_attempts:
    attempts += 1
    
    # Select two random different problems
    if len(all_problems) < 2:
        break
    
    problem1, problem2 = random.sample(all_problems, 2)
    
    # Skip if we've already used this pair
    pair_key = tuple(sorted([problem1, problem2]))
    if pair_key in used_problem_pairs:
        continue
    
    used_problem_pairs.add(pair_key)
    
    # Find common languages
    langs1 = set(master_index[problem1].keys())
    langs2 = set(master_index[problem2].keys())
    common_langs = langs1.intersection(langs2)
    
    if not common_langs:
        continue
    
    lang = random.choice(list(common_langs))
    
    # Get submissions
    subs1 = master_index[problem1][lang]
    subs2 = master_index[problem2][lang]
    
    if not subs1 or not subs2:
        continue
    
    try:
        sub1 = random.choice(subs1)
        sub2 = random.choice(subs2)
        
        code1 = _load_code_from_submission(CODENET_ROOT, problem1, sub1, lang)
        code2 = _load_code_from_submission(CODENET_ROOT, problem2, sub2, lang)
        
        if len(code1.strip()) < MIN_CODE_SIZE or len(code2.strip()) < MIN_CODE_SIZE:
            continue
        
        negative_easy_data.append({
            'id': f'neg_easy_{pair_id}',
            'code_1': code1,
            'code_2': code2,
            'label': 0,
            'type': 'negative_easy',
            'language': lang,
            'problem_id': f'{problem1}|{problem2}'
        })
        pair_id += 1
        pbar.update(1)
        
    except Exception as e:
        logger.debug(f"Error generating easy non-clone: {e}")
        continue

pbar.close()

negative_easy_df = pd.DataFrame(negative_easy_data)
print(f"\nGenerated {len(negative_easy_df):,} Easy Non-clone pairs")
print(f"   Target: {target_negative_easy:,}")
print(f"   Success rate: {len(negative_easy_df)/target_negative_easy*100:.1f}%")
print(f"   Attempts: {attempts:,}")
print("="*80)

---
## Cell 8: Generate Non-clones (Hard)

Hard non-clones are pairs from the same problem but with low similarity (different approaches).

In [None]:
print("\n" + "="*80)
print("GENERATING HARD NON-CLONES")
print("="*80)

target_negative_hard = 1_000_000  # 1M pairs for Hard Non-clones
logger.info(f"Target Hard Non-clones: {target_negative_hard:,}")

negative_hard_data = []
pair_id = 0

# Similarity threshold for hard negatives
HARD_NEGATIVE_THRESHOLD = 0.6

# Progress bar
pbar = tqdm(total=target_negative_hard, desc="Hard Non-clone Generation", unit="pairs")

# Generate hard non-clones from same problem with low similarity
attempts = 0
max_attempts = target_negative_hard * 20

from scripts.clone_generators import _load_code_from_submission, _tokenize_code, _jaccard_similarity

while len(negative_hard_data) < target_negative_hard and attempts < max_attempts:
    attempts += 1
    
    if not all_problems:
        break
    
    problem = random.choice(all_problems)
    lang_dict = master_index[problem]
    
    if not lang_dict:
        continue
    
    lang = random.choice(list(lang_dict.keys()))
    submissions = lang_dict[lang]
    
    if len(submissions) < 2:
        continue
    
    try:
        sub1, sub2 = random.sample(submissions, 2)
        
        code1 = _load_code_from_submission(CODENET_ROOT, problem, sub1, lang)
        code2 = _load_code_from_submission(CODENET_ROOT, problem, sub2, lang)
        
        if len(code1.strip()) < MIN_CODE_SIZE or len(code2.strip()) < MIN_CODE_SIZE:
            continue
        
        # Check if similarity is low enough
        try:
            tokens1 = _tokenize_code(code1, lang, parser)
            tokens2 = _tokenize_code(code2, lang, parser)
            similarity = _jaccard_similarity(tokens1, tokens2)
            
            # Consider it a hard negative if similarity is low
            if similarity < HARD_NEGATIVE_THRESHOLD:
                negative_hard_data.append({
                    'id': f'neg_hard_{pair_id}',
                    'code_1': code1,
                    'code_2': code2,
                    'label': 0,
                    'type': 'negative_hard',
                    'language': lang,
                    'problem_id': problem
                })
                pair_id += 1
                pbar.update(1)
        except Exception:
            # On tokenization failure, treat as hard negative if codes are different
            if code1 != code2:
                negative_hard_data.append({
                    'id': f'neg_hard_{pair_id}',
                    'code_1': code1,
                    'code_2': code2,
                    'label': 0,
                    'type': 'negative_hard',
                    'language': lang,
                    'problem_id': problem
                })
                pair_id += 1
                pbar.update(1)
                
    except Exception as e:
        logger.debug(f"Error generating hard non-clone: {e}")
        continue

pbar.close()

negative_hard_df = pd.DataFrame(negative_hard_data)
print(f"\nGenerated {len(negative_hard_df):,} Hard Non-clone pairs")
print(f"   Target: {target_negative_hard:,}")
print(f"   Success rate: {len(negative_hard_df)/target_negative_hard*100:.1f}%")
print(f"   Similarity threshold: < {HARD_NEGATIVE_THRESHOLD}")
print(f"   Attempts: {attempts:,}")
print("="*80)

---
## Cell 9: Combine, Shuffle, and Deduplicate

Merge all clone types, remove duplicates, shuffle, and balance the dataset.

In [None]:
print("\n" + "="*80)
print("COMBINING AND PROCESSING DATASET")
print("="*80)

# Combine all dataframes
all_dfs = [
    type1_df,
    type2_df,
    type3_df,
    type4_df,
    negative_easy_df,
    negative_hard_df
]

# Filter out empty dataframes
all_dfs = [df for df in all_dfs if len(df) > 0]

if not all_dfs:
    raise ValueError("No data was generated! Check the configuration and source data.")

# Concatenate all data
print("\nCombining datasets...")
combined_df = pd.concat(all_dfs, ignore_index=True)
print(f"   Total pairs before processing: {len(combined_df):,}")

# Print distribution before deduplication
print("\nDistribution by type (before deduplication):")
type_counts = combined_df['type'].value_counts()
for clone_type, count in type_counts.items():
    percentage = (count / len(combined_df)) * 100
    print(f"   {clone_type}: {count:,} ({percentage:.1f}%)")

# Remove duplicates based on code content
print("\nRemoving duplicates...")
initial_size = len(combined_df)

# Create a hash of code pairs for deduplication
combined_df['pair_hash'] = combined_df.apply(
    lambda row: hash(tuple(sorted([row['code_1'], row['code_2']]))),
    axis=1
)
combined_df = combined_df.drop_duplicates(subset=['pair_hash'], keep='first')
combined_df = combined_df.drop('pair_hash', axis=1)

duplicates_removed = initial_size - len(combined_df)
print(f"   Removed {duplicates_removed:,} duplicate pairs")
print(f"   Remaining pairs: {len(combined_df):,}")

# Shuffle the dataset
print("\nShuffling dataset...")
combined_df = combined_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
print("   Dataset shuffled successfully")

# Check label balance
print("\nLabel distribution:")
label_counts = combined_df['label'].value_counts()
for label, count in label_counts.items():
    label_name = "Clone" if label == 1 else "Non-clone"
    percentage = (count / len(combined_df)) * 100
    print(f"   {label_name} (label={label}): {count:,} ({percentage:.1f}%)")

# Final distribution by type
print("\nFinal distribution by type:")
type_counts = combined_df['type'].value_counts()
for clone_type, count in type_counts.items():
    percentage = (count / len(combined_df)) * 100
    print(f"   {clone_type}: {count:,} ({percentage:.1f}%)")

# Language distribution
print("\nLanguage distribution:")
lang_counts = combined_df['language'].value_counts()
for lang, count in lang_counts.items():
    percentage = (count / len(combined_df)) * 100
    print(f"   {lang}: {count:,} ({percentage:.1f}%)")

print(f"\nDataset processing complete!")
print(f"   Final size: {len(combined_df):,} pairs")
print("="*80)

---
## Cell 10: Export Dataset

Export the final dataset to CSV format.

In [None]:
print("\n" + "="*80)
print("EXPORTING DATASET")
print("="*80)

# Prepare output filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"cipas_code_clone_dataset_{timestamp}.csv"
output_filepath = output_path / output_filename

print(f"\nExporting to CSV...")
print(f"   File: {output_filepath}")

# Export to CSV
combined_df.to_csv(output_filepath, index=False, encoding='utf-8')

# Verify the export
file_size_mb = output_filepath.stat().st_size / (1024 * 1024)
print(f"   File size: {file_size_mb:.2f} MB")
print(f"   Rows exported: {len(combined_df):,}")
print(f"   Columns: {', '.join(combined_df.columns)}")

# Also save a version without timestamp for easy access
main_output_file = output_path / "cipas_code_clone_dataset.csv"
combined_df.to_csv(main_output_file, index=False, encoding='utf-8')
print(f"\n   Also saved as: {main_output_file}")

# Generate dataset statistics file
stats_file = output_path / f"dataset_stats_{timestamp}.txt"
with open(stats_file, 'w') as f:
    f.write("="*80 + "\n")
    f.write("CIPAS CODE CLONE DATASET STATISTICS\n")
    f.write("="*80 + "\n\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Total Pairs: {len(combined_df):,}\n\n")
    
    f.write("Label Distribution:\n")
    for label, count in combined_df['label'].value_counts().items():
        label_name = "Clone" if label == 1 else "Non-clone"
        percentage = (count / len(combined_df)) * 100
        f.write(f"  {label_name}: {count:,} ({percentage:.1f}%)\n")
    
    f.write("\nType Distribution:\n")
    for clone_type, count in combined_df['type'].value_counts().items():
        percentage = (count / len(combined_df)) * 100
        f.write(f"  {clone_type}: {count:,} ({percentage:.1f}%)\n")
    
    f.write("\nLanguage Distribution:\n")
    for lang, count in combined_df['language'].value_counts().items():
        percentage = (count / len(combined_df)) * 100
        f.write(f"  {lang}: {count:,} ({percentage:.1f}%)\n")
    
    f.write("\n" + "="*80 + "\n")

print(f"   Statistics saved: {stats_file}")

print(f"\nExport complete!")
print("="*80)

---
## Cell 11: Preview Dataset

Display sample records from the final dataset.

In [None]:
print("\n" + "="*80)
print("DATASET PREVIEW")
print("="*80)

# Display first 20 rows with formatted output
print("\nFirst 20 records (metadata only):")
print("\n")

preview_df = combined_df[['id', 'label', 'type', 'language', 'problem_id']].head(20)
print(preview_df.to_string(index=True))

# Show sample code snippets for different clone types
print("\n" + "="*80)
print("SAMPLE CODE PAIRS")
print("="*80)

for clone_type in ['type1', 'type2', 'type3', 'type4', 'negative_easy', 'negative_hard']:
    samples = combined_df[combined_df['type'] == clone_type].head(1)
    
    if len(samples) == 0:
        print(f"\nNo samples available for {clone_type}")
        continue
    
    sample = samples.iloc[0]
    
    print(f"\n{'='*80}")
    print(f"Example: {clone_type.upper()}")
    print(f"{'='*80}")
    print(f"ID: {sample['id']}")
    print(f"Label: {sample['label']} ({'Clone' if sample['label'] == 1 else 'Non-clone'})")
    print(f"Language: {sample['language']}")
    print(f"Problem: {sample['problem_id']}")
    
    print(f"\nCode 1 (first 300 chars):")
    print("-" * 80)
    print(sample['code_1'][:300] + ("..." if len(sample['code_1']) > 300 else ""))
    
    print(f"\nCode 2 (first 300 chars):")
    print("-" * 80)
    print(sample['code_2'][:300] + ("..." if len(sample['code_2']) > 300 else ""))

print("\n" + "="*80)
print("PIPELINE COMPLETE!")
print("="*80)
print(f"\nSuccessfully generated {len(combined_df):,} code clone pairs")
print(f"\nOutput files:")
print(f"   Main dataset: {main_output_file}")
print(f"   Timestamped: {output_filepath}")
print(f"   Statistics: {stats_file}")
print("\n" + "="*80)