In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


In [32]:
df = pd.read_csv('data/small_dataset.csv', index_col=0)
df

Unnamed: 0,index,Opcodes,Label
0,0,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0
1,1,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0
2,2,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0
3,3,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0
4,4,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,0
...,...,...,...
2087,2130,PUSH1 PUSH1 MSTORE CALLVALUE DUP1 ISZERO PUSH2...,1
2088,2131,PUSH1 PUSH1 MSTORE CALLVALUE DUP1 ISZERO PUSH2...,1
2089,2132,PUSH1 PUSH1 MSTORE CALLVALUE DUP1 ISZERO PUSH2...,1
2090,2134,PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2...,1


In [34]:
# transform opcodes to tf-idf vectors
vectorizer = TfidfVectorizer(ngram_range=(1, 4))

X = vectorizer.fit_transform(df['Opcodes'])

X_train, X_test, y_train, y_test = train_test_split(X, df['Label'], test_size=0.2, random_state=1)

Using SMOTE to deal with imbalanced data and comparing Logistic Reggression, Random Forset and XGboost

In [35]:
# Use SMOTE to oversample the minority class to get balanced training data
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Re-train the classifiers with the balanced data
classifiers = [LogisticRegression(), RandomForestClassifier(), XGBClassifier()]
model_names = ["Logistic Regression", "Random Forest", "XGBoost"]
results = {"Model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1 Score": [], "AUC": []}

for clf, model_name in zip(classifiers, model_names):
    clf.fit(X_train_smote, y_train_smote)
    
    # Change the prediction threshold to 0.3 to make the model predict the positive class more often
    y_pred = (clf.predict_proba(X_test)[:, 1] > 0.3).astype(int)
    
    results["Model"].append(model_name)
    results["Accuracy"].append(accuracy_score(y_test, y_pred))
    results["Precision"].append(precision_score(y_test, y_pred))
    results["Recall"].append(recall_score(y_test, y_pred))
    results["F1 Score"].append(f1_score(y_test, y_pred))
    results["AUC"].append(roc_auc_score(y_test, y_pred))

# Print the results
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1 Score       AUC
0  Logistic Regression  0.618138   0.075145  1.000000  0.139785  0.802956
1        Random Forest  0.976134   0.578947  0.846154  0.687500  0.913225
2              XGBoost  0.980907   0.647059  0.846154  0.733333  0.915688


In [36]:
import pickle

# Assuming that logistic_model, rf_model, xgb_model, and tfidf are already defined

with open('weights/logistic_model.pkl', 'wb') as f:
    pickle.dump(classifiers[0], f)

with open('weights/rf_model.pkl', 'wb') as f:
    pickle.dump(classifiers[1], f)

with open('weights/xgb_model.pkl', 'wb') as f:
    pickle.dump(classifiers[2], f)
    
with open('weights/tfidf.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)