# Bixie Model Training & Benchmarking

This notebook demonstrates how to train and benchmark the Bixie vulnerability classifier using code embeddings from the SAFEEmbedder model.

In [1]:
import sys
import os

# Resolve the parent of the kernel's working directory. This assumes the
# notebook is run from a direct sub-directory of the repository, in which case
# the parent is the project root (the data cells use "../data" the same way).
current_dir = os.path.dirname(os.path.abspath("."))
print(f"Current directory: {current_dir}")
print(type(sys.path))
print(f"Current sys.path: {sys.path}")

# Add the project root to sys.path so the `bixie` package can be imported.
# The root is derived from the working directory instead of a machine-specific
# absolute path, and the membership check keeps re-runs of this cell from
# stacking duplicate entries.
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

Current directory: /home/trashpanda/repos/bixie.ai
<class 'list'>
Current sys.path: ['/usr/lib/python313.zip', '/usr/lib/python3.13', '/usr/lib/python3.13/lib-dynload', '', '/home/trashpanda/repos/bixie.ai/lib/python3.13/site-packages']


In [2]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
import joblib
import torch

from bixie.models.model_inference import SAFEEmbedder, CLASSIFIER_PATH

## 1. Load Labeled Samples

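Load the vulnerable samples (`../data/bixie_V.json`) and the clean samples (`../data/bixie_noV.json`), label them, and write the combined, shuffled set to `../datasets/training_data.json`.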

In [3]:
import json
import random

# Load vulnerable samples and tag each record with the positive label (1).
with open("../data/bixie_V.json", encoding="utf-8") as f:
    vuln = json.load(f)
for v in vuln:
    v["label"] = 1

# Load clean (non-vulnerable) samples and tag each record with label 0.
with open("../data/bixie_noV.json", encoding="utf-8") as f:
    clean = json.load(f)
for c in clean:
    c["label"] = 0

data = vuln + clean
# Shuffle with a dedicated, seeded generator so the sample ids written below
# (and every split derived from this file) are identical on each run, without
# touching the global `random` state.
random.Random(42).shuffle(data)

# Keep only the fields used downstream; the id is the post-shuffle position.
formatted = [
    {
        "id": f"{i}",
        "project": item["project"],
        "code": item["code"],
        "label": item["label"]
    }
    for i, item in enumerate(data)
]

# Make sure the output directory exists before writing the combined dataset.
Path("../datasets").mkdir(parents=True, exist_ok=True)
with open("../datasets/training_data.json", "w", encoding="utf-8") as f:
    json.dump(formatted, f, indent=2)

print(f"Loaded {len(vuln)} vulnerable samples and {len(clean)} clean samples")

Loaded 2240 vulnerable samples and 20494 clean samples


In [4]:
import json

# Re-read the combined dataset from disk.
with open("../datasets/training_data.json") as f:
    data = json.load(f)

# Separate every record into its source text and its label, preserving order.
texts, labels = [], []
for record in data:
    texts.append(record["code"])
    labels.append(record["label"])

# Labels are 0/1, so their sum is the number of vulnerable samples.
print(f"Total samples: {len(texts)}")
print(f"Vulnerable samples: {sum(labels)}")
print(f"Clean samples: {len(labels) - sum(labels)}")

Total samples: 22734
Vulnerable samples: 2240
Clean samples: 20494


In [5]:
from transformers import RobertaTokenizer
from torch.utils.data import Dataset

# Tokenizer loaded from the "microsoft/codebert-base" checkpoint; this is the
# `tokenizer` handed to CodeDataset when the train/validation datasets are built.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

class CodeDataset(Dataset):
    """Torch dataset pairing tokenized code snippets with their labels.

    All texts are tokenized once, at construction time, by calling
    ``tokenizer(texts, truncation=True, padding=True, max_length=max_len)``.
    Each item is a dict holding one tensor per tokenizer output field plus a
    ``"labels"`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_len,
        )

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. The classes are heavily
# imbalanced (far fewer vulnerable than clean samples), so the split is
# stratified on the labels to keep the same class ratio in both partitions.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Tokenize each partition once, up front.
train_dataset = CodeDataset(train_texts, train_labels, tokenizer)
val_dataset = CodeDataset(val_texts, val_labels, tokenizer)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 18187
Validation samples: 4547


## 2. Extract Embeddings

Create a function to extract embeddings from the text data using the SAFEEmbedder.

In [7]:
def extract_embeddings_from_texts(texts, labels, embedder):
    """Embed each text with ``embedder`` and collect the successful results.

    Args:
        texts: Iterable of source-code strings; each is UTF-8 encoded before
            being handed to the embedder.
        labels: Iterable of labels, one per entry of ``texts``.
        embedder: Object exposing ``embed_binary_string(bytes)``. A ``None``
            return value or a raised ``Exception`` counts as a failure.

    Returns:
        Tuple ``(X, y, failed)`` where ``X`` is ``np.array`` of the embeddings
        that succeeded, ``y`` is ``np.array`` of their labels in the same
        order, and ``failed`` is the number of samples that were skipped.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length (previously
            the extra entries were dropped silently by ``zip``).
    """
    texts = list(texts)
    labels = list(labels)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} and {len(labels)})"
        )

    X, y, failed = [], [], 0

    for i, (text, label) in enumerate(zip(texts, labels)):
        try:
            # Convert text to bytes for the embedder
            emb = embedder.embed_binary_string(text.encode('utf-8'))
        except Exception as e:
            # Best effort: report the sample and keep going so one bad input
            # does not abort the whole run.
            print(f"Failed to embed sample {i}: {e}")
            failed += 1
            continue

        if emb is None:
            failed += 1
            continue

        X.append(emb)
        y.append(label)

    return np.array(X), np.array(y), failed

## 3. Generate Embeddings

In [8]:
# Embed every sample in `texts` with a fresh SAFEEmbedder; `X` and `y` feed the
# train/test split in the next section.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so it
# is presumably slow on the full dataset — confirm before re-running.
embedder = SAFEEmbedder()
X, y, failed = extract_embeddings_from_texts(texts, labels, embedder)
print(f"Extracted {len(X)} embeddings ({failed} failed)")
print(f"Embedding shape: {X.shape}")

tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


KeyboardInterrupt: 

## 4. Train/Test Split & Model Training

In [None]:
# A split needs at least one sample on each side.
if len(X) < 2:
    raise ValueError("Not enough samples for training.")

# Hold out 20% for testing. The split is stratified on `y` because the classes
# are heavily imbalanced; without it the share of vulnerable samples in the
# test set depends on the luck of the shuffle. scikit-learn raises ValueError
# if a class has too few members to be stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fixed random_state keeps the forest reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

## 5. Evaluation

In [None]:
# Overall accuracy and the F1 score of the positive (vulnerable, label 1) class.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Pin the label order explicitly so "Clean" always maps to 0 and "Vulnerable"
# to 1 (matching the confusion-matrix cell), and so the report does not fail
# with a target_names mismatch when one class is absent from the test split.
report = classification_report(
    y_test, y_pred, labels=[0, 1], target_names=["Clean", "Vulnerable"]
)
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(report)

## 6. Confusion Matrix

In [None]:
# Count predictions per (true, predicted) pair with a fixed 0/1 label order.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Render the matrix with readable class names on both axes.
disp = ConfusionMatrixDisplay(cm, display_labels=["Clean", "Vulnerable"])
disp.plot(cmap="Blues")
disp.ax_.set_title("Confusion Matrix")
plt.show()

## 7. Save Trained Classifier (Optional)

In [None]:
# Save the trained classifier for use in inference.
# CLASSIFIER_PATH is imported from bixie.models.model_inference.
# NOTE(review): this presumably replaces any classifier already stored at that
# path — confirm before running on a shared checkout.
joblib.dump(clf, CLASSIFIER_PATH)
print(f"Trained classifier saved to {CLASSIFIER_PATH}")