In [None]:
import os
import ast
import re
import numpy as np
import pandas as pd
import joblib
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModel


In [None]:
# Load the labelled code-snippet dataset (path is relative to the working directory)
dataset_path = "Dataset1k.csv"
df = pd.read_csv(dataset_path)


In [None]:
# Remove invalid labels (-1).
# .copy() makes the filtered result an independent frame, so later column
# assignments on `df` do not write into a slice of the loaded frame (which is
# what raises pandas' SettingWithCopyWarning).
df = df[df["Label"] != -1].copy()


In [None]:
# Ensure all code values are strings.
# Missing values are filled BEFORE the cast: casting first can turn NaN into the
# literal text "nan", which leaves fillna with nothing to fill and lets missing
# snippets through as if they were code. assign() returns a new frame instead of
# writing a column into a frame that may itself be a slice.
df = df.assign(**{"Code Snippet": df["Code Snippet"].fillna("").astype(str)})

# Tokenization using regex
def tokenize_code(code):
    """Split source text into a flat list of lexical tokens.

    A token is either an identifier-like run (a letter or underscore followed
    by letters, digits or underscores) or any single non-whitespace character.
    Whitespace is dropped.

    Args:
        code: The source text to tokenize.

    Returns:
        list[str]: Tokens in the order they appear in ``code``.
    """
    token_pattern = r'[a-zA-Z_][a-zA-Z0-9_]*|\S'
    return [match.group(0) for match in re.finditer(token_pattern, code)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Code Snippet"] = df["Code Snippet"].astype(str).fillna("")


In [None]:
# AST analysis: Extract function calls & structure
def extract_ast_features(code):
    """Count the calls in ``code`` whose callee is a plain name.

    The code is parsed with ``ast.parse`` and every ``ast.Call`` node whose
    ``func`` is an ``ast.Name`` is counted (e.g. ``print(x)``). Calls made
    through an attribute, such as ``os.system(x)``, are not counted.

    Args:
        code: Python source text.

    Returns:
        int: Number of plain-name calls, or 0 when the code cannot be parsed.
    """
    try:
        tree = ast.parse(code)
    except Exception:
        # Best effort: unparsable snippets contribute a zero feature.
        # Catching Exception rather than using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate, so a long
        # feature-extraction run can still be interrupted.
        return 0
    return sum(
        1
        for node in ast.walk(tree)
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Name)
    )

In [None]:
# Count suspicious keywords
def count_suspicious_keywords(code):
    """Count occurrences of suspicious names in ``code``.

    The names are ``eval``, ``exec``, ``os.system``, ``subprocess`` and
    ``pickle.loads``. Each must appear as a whole token (delimited by word
    boundaries), so look-alikes such as ``evaluate``, ``retrieval`` or
    ``cursor.execute`` are not counted.

    Args:
        code: Source text to scan.

    Returns:
        int: Total number of non-overlapping matches across all names.
    """
    # Dots are escaped so "os.system" does not match e.g. "os_system".
    suspicious_pattern = r"\b(?:eval|exec|os\.system|subprocess|pickle\.loads)\b"
    return len(re.findall(suspicious_pattern, code))


In [None]:
# Load the CodeBERT tokenizer and encoder from the same checkpoint
codebert_checkpoint = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(codebert_checkpoint)
model_bert = AutoModel.from_pretrained(codebert_checkpoint)

# Convert code to CodeBERT embeddings
def get_code_embedding(code):
    """Embed a code string by averaging the encoder's final hidden states.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model_bert`` with gradient tracking disabled, and the resulting
    ``last_hidden_state`` is averaged over dim 1 and squeezed.

    Args:
        code: Source text to embed.

    Returns:
        numpy.ndarray: The pooled embedding, moved to CPU.
    """
    encoded = tokenizer(
        code,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model_bert(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.detach().cpu().numpy()


In [None]:
# Feature extraction function
def extract_features(code):
    """Build the full feature vector for one code snippet.

    The vector starts with three hand-crafted counts (tokens, suspicious
    keywords, AST function calls) and continues with the CodeBERT embedding
    values.

    Args:
        code: Source text of the snippet.

    Returns:
        list | None: The feature vector, or ``None`` when ``code`` is not a
        string or contains only whitespace.
    """
    # Skip empty code
    if not (isinstance(code, str) and code.strip()):
        return None
    handcrafted = [
        len(tokenize_code(code)),
        count_suspicious_keywords(code),
        extract_ast_features(code),
    ]
    return handcrafted + get_code_embedding(code).tolist()


In [None]:
# Build parallel lists of feature vectors and binary labels
features = []
labels = []

for code, label in zip(df["Code Snippet"], df["Label"]):
    feature_vector = extract_features(code)
    if feature_vector is None:
        # Snippets that yield no features are left out of both lists
        continue
    features.append(feature_vector)
    labels.append(1 if label == 1 else 0)  # Anything other than 1 maps to 0


In [None]:
# Turn the collected Python lists into NumPy arrays for scikit-learn
X = np.asarray(features)
y = np.asarray(labels)


In [None]:
# Train-test split (80/20, fixed seed).
# stratify=y keeps the class ratio the same in both partitions, so the held-out
# precision/recall are not skewed by how many positive samples happen to land
# in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Fit a random forest on the training partition
rf_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)




In [None]:
# Persist the fitted classifier to disk
model_path = "trained_model.pkl"
joblib.dump(rf_model, model_path)



['trained_model.pkl']

In [None]:
# Score the classifier on the held-out partition (binary averaging)
y_pred = rf_model.predict(X_test)
scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
precision, recall, f1 = scores[0], scores[1], scores[2]
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")


Precision: 1.00, Recall: 0.82, F1-score: 0.90


In [None]:
# Reload the classifier from disk.
# joblib.load unpickles the file, so only use it on model files you trust.
saved_model_path = "trained_model.pkl"
rf_model = joblib.load(saved_model_path)


In [None]:
# Function to test new code
def predict_unsafe_code(code):
    """Classify a snippet with the trained model and print the verdict.

    Args:
        code: Source text to classify.

    Returns:
        None. Prints a warning when no features can be extracted from
        ``code``; otherwise prints whether the model predicted class 1
        (unsafe) or anything else (safe).
    """
    feature_vector = extract_features(code)
    if feature_vector is not None:
        # predict() expects a 2-D input, hence the single-row list
        predicted_label = rf_model.predict([feature_vector])[0]
        if predicted_label == 1:
            verdict = "🚨 This code is UNSAFE!"
        else:
            verdict = "✅ This code is SAFE!"
        print(verdict)
        return
    print("⚠️ Invalid or empty code provided!")

# Example tests
safe_code = "print('Hello, world!')"

# Each entry is one snippet to classify. Previously the list and its loop were
# wrapped in a single triple-quoted string, so the classifier saw the driver
# script's text rather than the snippets themselves.
# The second snippet is deliberately truncated ("...") and does not parse.
unsafe_snippets = [
    "import os\nos.system('rm -rf /')",
    "import socket,subprocess,os\ns=socket.socket(socket.AF_INET,socket.SOCK_STREAM)...",
]

print("\nTesting Safe Code:")
predict_unsafe_code(safe_code)

print("\nTesting Unsafe Code:")
for unsafe_code in unsafe_snippets:
    predict_unsafe_code(unsafe_code)


Testing Safe Code:
✅ This code is SAFE!

Testing Unsafe Code:
🚨 This code is UNSAFE!
