In [1]:
import pandas as pd
import numpy as np
import re
import math
import json
import torch
from collections import Counter
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)
import gc

# --- CONFIGURATION ---
MODEL_CHECKPOINT = "microsoft/deberta-v3-base"  # Stronger model for context
MAX_LENGTH = 512
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
EPOCHS = 5

2025-11-23 15:34:31.569011: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763912071.775341      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763912071.834238      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
# --- PART 1: WEAK SUPERVISION (LABELING) ---


def shannon_entropy(data):
    """Calculates Shannon Entropy to detect random/high-entropy strings."""
    if not data:
        return 0
    entropy = 0
    for x in set(data):
        p_x = float(data.count(x)) / len(data)
        if p_x > 0:
            entropy += -p_x * math.log(p_x, 2)
    return entropy


def heuristic_labeling(text):
    """
    Generates weak labels using a broad set of regex patterns + entropy filtering.
    Returns a list of entities in the format required for training.
    """
    labels = []

    # Broad, high-recall patterns for secrets
    patterns = [
        # Assignments (Password, Token, Key, Secret, Auth)
        r'(?:password|passwd|pwd|secret|token|key|auth|api_?key|access_?key)[\s=:>]{1,3}(["\']?)(?P<val>[a-zA-Z0-9\-\_\$\%\@\!\.\+]{8,120})\1',
        # Headers
        r"Authorization:\s*Bearer\s+(?P<val>[a-zA-Z0-9\-\._~\+\/]+=*)",
        r"X-Consul-Token:\s*(?P<val>[a-zA-Z0-9\-]+)",
        # Cloud / Service Tokens
        r"AKIA[0-9A-Z]{16}",  # AWS Access Key
        r"(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{36}",  # GitHub
        r"xox[baprs]-([0-9a-zA-Z]{10,48})",  # Slack
        r"https://hooks\.slack\.com/services/T[a-zA-Z0-9_]+/B[a-zA-Z0-9_]+/[a-zA-Z0-9_]+",  # Slack Webhook
        r"glpat-[0-9a-zA-Z\-\_]{20}",  # GitLab
        r"sq0csp-[0-9a-zA-Z\-\_]{43}",  # Square
        # Database Connection Strings (URI credentials)
        r"(?:postgres|mysql|mongodb|redis)://[^:]+:(?P<val>[^@]+)@",
        # Private Keys (PEM headers)
        r"-----BEGIN (?:RSA|DSA|EC|OPENSSH) PRIVATE KEY-----",
    ]

    for pat in patterns:
        for match in re.finditer(pat, text, re.IGNORECASE):
            try:
                # Try to get the named group 'val' first
                val = match.group("val")
                start, end = match.span("val")
            except IndexError:
                # Fallback to group 0 (whole match) if no specific group
                val = match.group(0)
                start, end = match.span(0)

            # --- FILTERS to reduce False Positives ---
            # 1. Skip if too short
            if len(val) < 8:
                continue
            # 2. Skip if looks like a placeholder (e.g., {{PASSWORD}})
            if "{{" in val or "}}" in val or "..." in val or "****" in val:
                continue
            # 3. Skip if looks like a file path or URL path segment (simple check)
            if "/" in val and not val.startswith("ey"):  # Allow JWT-like strings
                continue
            # 4. Entropy check: Secrets usually have higher entropy than English words
            if shannon_entropy(val) < 3.0:
                continue

            labels.append({"start": start, "end": end, "label": "SECRET"})

    # Clean up boundaries (remove surrounding quotes if captured)
    final_labels = []
    for l in labels:
        span_val = text[l["start"] : l["end"]]

        # Trim leading/trailing quotes
        if span_val.startswith('"') or span_val.startswith("'"):
            l["start"] += 1
        if span_val.endswith('"') or span_val.endswith("'"):
            l["end"] -= 1

        # Sanity check: Ensure start < end after trimming
        if l["start"] < l["end"]:
            final_labels.append(l)

    return final_labels


# Load and Label Data
print("Loading and labeling training data...")
df_train = pd.read_csv("/kaggle/input/secret-detection-in-command-lines/train.csv")
df_train["entities"] = df_train["cmdline"].apply(heuristic_labeling)

# Keep only rows where we found secrets to give the model strong positive signals
# (Optionally, mix in some empty examples to teach the model what is NOT a secret)
training_df = df_train[df_train["entities"].str.len() > 0].reset_index(drop=True)
print(f"Found {len(training_df)} samples with potential secrets out of {len(df_train)}")

Loading and labeling training data...
Found 18203 samples with potential secrets out of 74242


In [3]:
# --- PART 2: DATASET PREPARATION (FIXED) ---

# BIO Mapping
label2id = {"O": 0, "B-SECRET": 1, "I-SECRET": 2}
id2label = {0: "O", 1: "B-SECRET", 2: "I-SECRET"}

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_and_align_labels(examples):
    # 1. Tokenize inputs and ask for 'offset_mapping'
    # This gives us the character (start, end) for every token generated
    tokenized_inputs = tokenizer(
        examples["cmdline"], 
        truncation=True, 
        max_length=MAX_LENGTH,
        return_offsets_mapping=True
    )

    labels = []
    
    # 2. Iterate over every example in the batch
    for i, cmdline in enumerate(examples["cmdline"]):
        doc_labels = []
        
        # Get the detected secret spans for this specific example
        # Format: [{'start': 10, 'end': 20, ...}, ...]
        secret_spans = examples["entities"][i]
        
        # Get the offsets for this example: [(0,0), (0,3), (3,5)...]
        offsets = tokenized_inputs["offset_mapping"][i]
        
        # 3. Assign a label (O, B, or I) to every token based on its character position
        for start_char, end_char in offsets:
            
            # Special tokens like [CLS], [SEP], and PAD usually have (0,0) offsets.
            # We label them -100 so PyTorch ignores them during loss calculation.
            if start_char == 0 and end_char == 0:
                doc_labels.append(-100)
                continue
            
            # Default label is 'O' (0)
            token_label = 0 
            
            # Check if this token's position overlaps with any known secret
            for span in secret_spans:
                s_span, e_span = span['start'], span['end']
                
                # Condition 1: Token starts exactly at the beginning of a secret
                if start_char == s_span:
                    token_label = 1 # B-SECRET
                    break
                
                # Condition 2: Token is strictly inside the secret boundaries
                # (Note: start_char >= s_span allows for subwords starting mid-secret)
                elif start_char > s_span and start_char < e_span:
                    token_label = 2 # I-SECRET
                    break
            
            doc_labels.append(token_label)
            
        labels.append(doc_labels)

    # Remove offset_mapping (not needed for the model training)
    tokenized_inputs.pop("offset_mapping")
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert pandas to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(training_df[["id", "cmdline", "entities"]])

# Apply the fixed function
tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)

# Train/Val Split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/18203 [00:00<?, ? examples/s]

In [4]:
gc.collect()
# Clear cache just in case
torch.cuda.empty_cache()

In [5]:
import os

# 1. Memory Fragmentation Fix (suggested by your error message)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# --- PART 3: MODEL TRAINING (OPTIMIZED) ---

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT, 
    num_labels=3, 
    id2label=id2label, 
    label2id=label2id
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Optimized Arguments
args = TrainingArguments(
    "deberta-secret-detector",
    eval_strategy="epoch",
    learning_rate=LEARNING_RATE,
    
    # MEMORY OPTIMIZATIONS:
    per_device_train_batch_size=8,    # Reduced from 8 to fit in VRAM
    per_device_eval_batch_size=8,     # Reduced from 8
    gradient_accumulation_steps=2,    # Accumulate 2 steps of size 4 -> Effective batch size 8
    # fp16=True,                        # Use mixed precision (drastically reduces memory)
    
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting training with memory optimizations...")
trainer.train()

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training with memory optimizations...




Epoch,Training Loss,Validation Loss
1,0.026,0.013187
2,0.0159,0.013267
3,0.0073,0.011374
4,0.0048,0.006289
5,0.0033,0.005891




TrainOutput(global_step=2560, training_loss=0.020324865319707897, metrics={'train_runtime': 4370.43, 'train_samples_per_second': 18.742, 'train_steps_per_second': 0.586, 'total_flos': 1.33260363993438e+16, 'train_loss': 0.020324865319707897, 'epoch': 5.0})

In [6]:
# --- PART 4: INFERENCE & SUBMISSION ---

print("Loading Test Data...")
test_df = pd.read_csv("/kaggle/input/secret-detection-in-command-lines/test.csv")

# Use Pipeline for robust char offset calculation
# aggregation_strategy="simple" automatically merges B-SECRET and I-SECRET tokens
nlp_pipe = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)


def post_process(text, predictions):
    final_ents = []
    for p in predictions:
        if p["entity_group"] == "SECRET":
            start, end = p["start"], p["end"]

            # Boundary Cleanup: Trim surrounding quotes/spaces captured by the model
            span_text = text[start:end]

            # Find actual content start/end relative to the span
            # Example: span='"password123"' -> we want 'password123'

            # Strip leading/trailing quotes or spaces
            stripped = span_text.strip().strip("'").strip('"')

            # Recalculate offsets
            new_start = start + span_text.find(stripped)
            new_end = new_start + len(stripped)

            # Filter very short or low entropy predictions
            if len(stripped) < 6:
                continue

            final_ents.append({"start": new_start, "end": new_end, "label": "SECRET"})
    return final_ents


print("Running inference on test set...")
submission_data = []

# Processing row by row (can be batched for speed if dataset is huge)
for idx, row in test_df.iterrows():
    preds = nlp_pipe(row["cmdline"])
    refined_preds = post_process(row["cmdline"], preds)

    # Format as JSON string for submission
    submission_data.append({"id": row["id"], "entities": json.dumps(refined_preds)})

# Save Submission
sub_df = pd.DataFrame(submission_data)
sub_df.to_csv("submission.csv", index=False)
print("Submission saved to submission.csv")

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Loading Test Data...
Running inference on test set...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Submission saved to submission.csv
