In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


In [3]:
df = pd.read_csv('data/small_dataset.csv', index_col=0)


Vectorizing with tf-idf and bagging with 4 ngrams

In [4]:
# transform opcodes to tf-idf vectors
vectorizer = TfidfVectorizer(ngram_range=(1, 4))

X = vectorizer.fit_transform(df['Opcodes'])

Evaluate model metrics using SMOTE and StratifiedCV to deal with imbalanced data |
Comparing Logistic Reggression, Random Forset and XGboost

In [5]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score


# Define the cross-validation procedure
cv = StratifiedKFold(n_splits=5)

classifiers = [LogisticRegression(), RandomForestClassifier(), XGBClassifier()]
model_names = ["Logistic Regression", "Random Forest", "XGBoost"]
results = {"Model": [], "Mean Accuracy": [], "Std Accuracy": [], "Mean Precision": [], "Mean Recall": [], "Mean F1 Score": []}


scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

smote = SMOTE()

for clf, model_name in zip(classifiers, model_names):
    
    # Create pipeline
    pipeline = Pipeline([
        ('sampling', smote),
        ('classification', clf)
    ])
    
    scores = cross_validate(pipeline, X, df['Label'], scoring=scoring, cv=cv, n_jobs=-1, return_train_score=False)
    
    results["Model"].append(model_name)
    results["Mean Accuracy"].append(scores['test_accuracy'].mean())
    results["Std Accuracy"].append(scores['test_accuracy'].std())
    results["Mean Precision"].append(scores['test_precision'].mean())
    results["Mean Recall"].append(scores['test_recall'].mean())
    results["Mean F1 Score"].append(scores['test_f1_score'].mean())

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Mean Accuracy,Std Accuracy,Mean Precision,Mean Recall,Mean F1 Score
0,Logistic Regression,0.815576,0.118066,0.271172,0.651462,0.321698
1,Random Forest,0.952688,0.023379,0.560236,0.531579,0.49095
2,XGBoost,0.954118,0.021454,0.528826,0.477778,0.458806


After evaluating the diffrent models we will now train them on all the data and export the weights

In [6]:
# Fitting classifiers to the whole data
logistic_model = LogisticRegression()
logistic_model.fit(X, df['Label'])

rf_model = RandomForestClassifier()
rf_model.fit(X, df['Label'])

xgb_model = XGBClassifier()
xgb_model.fit(X, df['Label'])

In [7]:
import pickle

# Assuming that logistic_model, rf_model, xgb_model, and tfidf are already defined

with open('weights/logistic_model.pkl', 'wb') as f:
    pickle.dump(logistic_model, f)

with open('weights/rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

with open('weights/xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model,f)
    
with open('weights/tfidf.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)