In [1]:
import os
import re
import tokenize
from io import BytesIO

from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim

from skorch import NeuralNetClassifier
from skorch.callbacks import EarlyStopping

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

import joblib

In [2]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing

In [2]:
# Preprocessing functions
def tokenize_code(code, file_path):
    tokens = []
    reader = BytesIO(code.encode('utf-8')).readline
    try:
        for toknum, tokval, _, _, _ in tokenize.tokenize(reader):
            if toknum != tokenize.ENCODING:
                tokens.append(tokval)
    except tokenize.TokenError as e:
        print("Error tokenizing code in file:", file_path)
    return tokens

def normalize_code(code):
    code = re.sub(r'//.*', '', code)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    code = re.sub(r'\s+', ' ', code).strip()
    return code

def preprocess_code(code, file_path):
    normalized_code = normalize_code(code)
    tokens = tokenize_code(normalized_code, file_path)
    return ' '.join(tokens)

# Load data from directories

def load_data_from_directory(directory, label):
    data = []
    labels = []
    for filename in os.listdir(directory):
        if filename.endswith(".sol"):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                try:
                    code = file.read()
                    preprocessed_code = preprocess_code(code, filepath)
                    data.append(preprocessed_code)
                    labels.append(label)
                except Exception as e:
                    print("Error processing file:", filepath)
    return data, labels

In [4]:
# Paths to the directories
vulnerable_dir = './Contracts for training/Re-entrancy'
non_vulnerable_dir = './Contracts for training/Verified'

# Load and label the data
vulnerable_data, vulnerable_labels = load_data_from_directory(vulnerable_dir, 1)
non_vulnerable_data, non_vulnerable_labels = load_data_from_directory(non_vulnerable_dir, 0)

# Combine the data and labels
data = vulnerable_data + non_vulnerable_data
labels = vulnerable_labels + non_vulnerable_labels

## Neural Network Vectors

In [5]:
# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X_nn = vectorizer.fit_transform(data).toarray().astype('float32')  # Ensure dtype float32
y_nn = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)  # Change dtype to float32 and reshape to [batch_size, 1]

In [None]:
# Save the fitted vectorizer
joblib.dump(vectorizer, 'neural_network_vectors.pkl')

## SVM Vectors

In [5]:
# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data).toarray()
y = labels

In [6]:
# Save the fitted vectorizer
joblib.dump(vectorizer, 'svm_vectors.pkl')

['svm_vectors.pkl']

# Neural Network

In [8]:
# Define the neural network model
class SmartContractVulnerabilityModel(nn.Module):
    def __init__(self, input_dim, hidden_dim1=256, hidden_dim2=128):
        super(SmartContractVulnerabilityModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [15]:
# Skorch wrapper for the PyTorch model
net = NeuralNetClassifier(
    SmartContractVulnerabilityModel,
    module__input_dim=1000,
    max_epochs=20,  # Higher number of epochs with early stopping
    lr=0.001,
    optimizer=optim.Adam,
    criterion=nn.BCEWithLogitsLoss,
    iterator_train__shuffle=True,
    callbacks=[EarlyStopping(patience=5)],  # Early stopping after 5 epochs without improvement
)

# Hyperparameter grid
params = {
    'lr': [0.001],
    'max_epochs': [20],
    'module__hidden_dim1': [256],
    'module__hidden_dim2': [128]
}

In [17]:
# Initialize GridSearchCV
#cv is for cross validation
#n_jobs=-1 means use all available cores
gs = GridSearchCV(net, params, refit=True, cv=5, scoring='accuracy')

# Perform grid search
gs.fit(X_nn, y_nn)

  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.1907[0m       [32m0.9263[0m        [35m0.1736[0m  57.7204
      2        [36m0.1680[0m       [32m0.9302[0m        [35m0.1670[0m  37.0448
      3        [36m0.1608[0m       [32m0.9311[0m        [35m0.1649[0m  37.3297
      4        [36m0.1564[0m       [32m0.9341[0m        [35m0.1606[0m  36.0267
      5        [36m0.1528[0m       0.9335        [35m0.1605[0m  35.1116
      6        [36m0.1503[0m       0.9333        0.1609  37.3694
      7        [36m0.1477[0m       0.9325        0.1612  35.4650
      8        [36m0.1453[0m       0.9339        0.1621  37.9463
      9        [36m0.1431[0m       [32m0.9347[0m        0.1621  35.3213
Stopping since valid_loss has not improved in the last 5 epochs.
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1   

In [14]:
# Print the best parameters and the best score
print("Best parameters found:", gs.best_params_)
print("Best score:", gs.best_score_)

Best parameters found: {'lr': 0.001, 'max_epochs': 20, 'module__hidden_dim1': 256, 'module__hidden_dim2': 128}
Best score: 0.9341476835048672


In [None]:
# Save the best model
torch.save(gs.best_estimator_.module_.state_dict(), 'neural_network_model.pth')

In [14]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_nn, y_nn, test_size=0.2, random_state=42)

# Use the best model found by GridSearchCV
best_model = gs.best_estimator_

# Fit the best model on the entire training set
best_model.fit(X_train, y_train)

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Print detailed classification report
print(classification_report(y_test, y_pred))


NameError: name 'gs' is not defined

In [12]:
import joblib
from sklearn.metrics import confusion_matrix
import numpy as np

# Load the saved model
model = joblib.load('neural_network_vectors.pkl')

# Assuming you have your test data and labels in X_test and y_test respectively
# X_test = ...
# y_test = ...

# Make predictions
y_pred = model.predict(X)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)


AttributeError: 'TfidfVectorizer' object has no attribute 'predict'

# SVM

In [6]:
svm_model = SVC(probability=True)

In [7]:
# Hyperparameter grid for SVM
params = {
    'C': [10], #0.1,1,10
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

In [8]:
# Initialize GridSearchCV
gs = GridSearchCV(svm_model, params, refit=True, cv=3, scoring='accuracy')

# Perform grid search
gs.fit(X, y)

# Print the best parameters and the best score
print("Best parameters found:", gs.best_params_)
print("Best score:", gs.best_score_)

In [None]:
# Save the best model
joblib.dump(gs.best_estimator_, 'svm_model.pkl')

In [None]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use the best model found by GridSearchCV
best_model = gs.best_estimator_

# Fit the best model on the entire training set
best_model.fit(X_train, y_train)

# Predict on the test set
y_pred = best_model.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")

In [None]:
# Print detailed classification report
print(classification_report(y_test, y_pred))