In [None]:
# Weight-based masking: higher weight → more tokens masked
# Uses the CodeBERT tokenizer's own mask token (codebert_tokenizer.mask_token) as the placeholder
import random
import numpy as np

def mask_line_tokens_weighted_codebert(line_text, mask_percentage):
    """
    Replace a random subset of a line's CodeBERT tokens with the mask token.

    The line is tokenized with the module-level ``codebert_tokenizer`` (no
    special tokens added). For a line that yields at least one token, at
    least one token is always masked and never more than the line contains.

    Args:
        line_text: Source line to mask.
        mask_percentage: Fraction of the line's tokens to mask (e.g. 0.15).
            The product with the token count is truncated by ``int()``.

    Returns:
        A 3-tuple ``(masked_text, mask_positions, original_tokens)``:
            masked_text: Token ids decoded back to text, with the selected
                positions replaced by the tokenizer's mask token.
            mask_positions: Ascending list of the masked token indices.
            original_tokens: One dict per masked position with the keys
                ``'position'``, ``'token_id'`` and ``'token_text'``.
        Whitespace-only lines and lines that tokenize to nothing are returned
        unchanged together with two empty lists.
    """
    # Blank lines are passed through without touching the tokenizer.
    stripped = line_text.strip()
    if len(stripped) == 0:
        return line_text, [], []

    encoding = codebert_tokenizer(
        line_text, return_tensors="pt", add_special_tokens=False
    )
    token_ids = encoding.input_ids[0]
    total = len(token_ids)
    if total == 0:
        return line_text, [], []

    # Clamp the quota into [1, total]: always mask something, never too much.
    quota = min(max(1, int(total * mask_percentage)), total)

    # Draw distinct positions from the global `random` state, in index order.
    mask_positions = sorted(random.sample(range(total), quota))

    mask_id = codebert_tokenizer.mask_token_id
    masked_ids = token_ids.clone()  # leave the original ids intact for lookup
    original_tokens = []
    for position in mask_positions:
        original = token_ids[position]
        original_tokens.append({
            'position': position,
            'token_id': original.item(),
            'token_text': codebert_tokenizer.decode([original]),
        })
        masked_ids[position] = mask_id

    # Keep special tokens so the mask placeholders survive decoding.
    masked_text = codebert_tokenizer.decode(masked_ids, skip_special_tokens=False)
    return masked_text, mask_positions, original_tokens

# Weight-based masking configuration
random.seed(42)  # For reproducibility

# Base masking percentage
BASE_MASK_PCT = 0.15  # 15% base masking

# Scale the base rate by each line's weight relative to the mean weight:
#   weight above the mean → factor > 1 (more masking)
#   weight below the mean → factor < 1 (less masking)
# The mean is computed once up front instead of once per row.
mean_weight = line_stats['weight_normalized'].mean()
if pd.isna(mean_weight) or mean_weight == 0:
    # No usable reference weight (empty/all-NaN column or zero mean): dividing
    # would yield NaN/inf rates, so fall back to the uniform base rate.
    line_stats['mask_percentage'] = BASE_MASK_PCT
else:
    line_stats['mask_percentage'] = BASE_MASK_PCT * (
        line_stats['weight_normalized'] / mean_weight
    )

# Clip to reasonable range (min 5%, max 30%)
line_stats['mask_percentage'] = line_stats['mask_percentage'].clip(0.05, 0.30)

masked_results = []

for _, row in line_stats.iterrows():
    line_text = row['line_content']
    mask_pct = row['mask_percentage']

    masked_text, mask_pos, orig_tokens = mask_line_tokens_weighted_codebert(line_text, mask_pct)

    # NOTE(review): 'num_tokens' comes from line_stats, while the masked count
    # is based on the CodeBERT tokenization done inside the masking function.
    # If line_stats was built with a different tokenizer the "actual"
    # percentage below is only approximate — confirm upstream.
    masked_results.append({
        'line': row['line'],
        'original_line': line_text,
        'masked_line': masked_text,
        'num_tokens': row['num_tokens'],
        'num_masked': len(mask_pos),
        'mask_percentage_actual': (len(mask_pos) / row['num_tokens'] * 100) if row['num_tokens'] > 0 else 0,
        'mask_percentage_target': mask_pct * 100,
        'masked_positions': mask_pos,
        'masked_tokens': orig_tokens,
        'perplexity': row['perplexity'],
        'weight': row['weight_normalized']
    })

masked_df = pd.DataFrame(masked_results)

print("=" * 120)
print("WEIGHT-BASED MASKING WITH CODEBERT (15% Base Rate × Weight Factor)")
print("=" * 120)
print(f"\nTotal lines: {len(masked_df)}")
print(f"Base masking: {BASE_MASK_PCT*100:.1f}% of tokens")
print("Weight adjustment: Lines scaled by relative weight (higher weight → more masking)")
print(f"Actual range: {masked_df['mask_percentage_target'].min():.1f}% to {masked_df['mask_percentage_target'].max():.1f}%")
print("Tokenizer: CodeBERT (microsoft/codebert-base)")
print(f"Mask token: {codebert_tokenizer.mask_token}\n")

# Display summary sorted by weight (descending)
display_cols = ['line', 'weight', 'perplexity', 'num_tokens', 'num_masked', 'mask_percentage_target', 'mask_percentage_actual']
summary_sorted = masked_df[display_cols].sort_values('weight', ascending=False)

print("\nSummary (sorted by weight - highest first):")
print(summary_sorted.to_string(index=False))

print("\n" + "=" * 120)
print("Sample Lines (Top 5 highest weighted = most masked):")
print("=" * 120)
top_weighted = masked_df.nlargest(5, 'weight')
for i, row in top_weighted.iterrows():
    print(f"\nLine {row['line']} (Weight: {row['weight']:.4f}, Perplexity: {row['perplexity']:.2f}):")
    print(f"  Original: {row['original_line']}")
    print(f"  Masked:   {row['masked_line']}")
    print(f"  Masked {row['num_masked']}/{row['num_tokens']} tokens ({row['mask_percentage_actual']:.1f}%)")

masked_df

In [None]:
# Generate perturbations for all masked lines
import time

# Configuration for perturbation
NUM_PERTURBATIONS = 2
TOP_P = 0.5  # Nucleus sampling threshold
TEMPERATURE = 0.9  # Sampling temperature

print("=" * 120)
print("GENERATING PERTURBATIONS USING CODEBERT WITH NUCLEUS SAMPLING")
print("=" * 120)
print(f"Number of perturbations per line: {NUM_PERTURBATIONS}")
print(f"Sampling method: Nucleus sampling (top-p={TOP_P}, temperature={TEMPERATURE})")
print(f"Total masked lines: {len(masked_df)}")
print("\n")

perturbation_results = []
total_lines = len(masked_df)

start_time = time.time()

# Count rows with enumerate so the progress message does not depend on the
# DataFrame's index labels being 0..n-1.
for processed, (_, row) in enumerate(masked_df.iterrows(), start=1):
    masked_line = row['masked_line']

    # Skip lines with no masks: reuse the original line as every "perturbation"
    if '<mask>' not in masked_line.lower() and '[MASK]' not in masked_line:
        perturbations = [row['original_line']] * NUM_PERTURBATIONS
    else:
        # Generate perturbations
        perturbations = generate_perturbations_for_masked_line(
            masked_line,
            num_perturbations=NUM_PERTURBATIONS,
            top_p=TOP_P,
            temperature=TEMPERATURE
        )

    record = {
        'line': row['line'],
        'original_line': row['original_line'],
        'masked_line': masked_line,
    }
    # One column per configured perturbation (perturbation_1, perturbation_2, ...),
    # padded with '' when the generator returned fewer than requested.
    for k in range(NUM_PERTURBATIONS):
        record[f'perturbation_{k + 1}'] = perturbations[k] if k < len(perturbations) else ''
    record['num_masked'] = row['num_masked']
    record['perplexity'] = row['perplexity']
    record['weight'] = row['weight']
    perturbation_results.append(record)

    # Progress indicator
    if processed % 10 == 0:
        print(f"Processed {processed}/{total_lines} lines...")

end_time = time.time()
elapsed = end_time - start_time

perturbation_df = pd.DataFrame(perturbation_results)

print("\n✓ Perturbation generation complete!")
print(f"Time taken: {elapsed:.2f} seconds")
# Guard the average against an empty input frame (would divide by zero).
average_time = elapsed / total_lines if total_lines else 0.0
print(f"Average time per line: {average_time:.3f} seconds")

print("\n" + "=" * 120)
print("SAMPLE PERTURBATIONS (Top 5 highest weighted lines):")
print("=" * 120)

# An empty result frame has no 'weight' column, so skip the ranking there.
top_samples = perturbation_df.nlargest(5, 'weight') if not perturbation_df.empty else perturbation_df
for i, row in top_samples.iterrows():
    print(f"\nLine {row['line']} (Weight: {row['weight']:.4f}, Perplexity: {row['perplexity']:.2f}):")
    print(f"  Original:        {row['original_line']}")
    print(f"  Masked:          {row['masked_line']}")
    for k in range(1, NUM_PERTURBATIONS + 1):
        print(f"  Perturbation {k}:  {row[f'perturbation_{k}']}")

perturbation_df