In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

## For the data

In [2]:
data_malware = pd.read_csv('/content/Obfuscated-MalMem2022.csv')

In [3]:
data_malware = data_malware[data_malware['Category'] != 'Benign']

data_malware_families = data_malware.copy()
families = [
    # Ransomware
    'Ako', 'Conti', 'Maze', 'Pysa', 'Shade',
    # Spyware
    '180solutions', 'CWS', 'Gator', 'TIBS', 'Transponder',
    # Trojan Horse
    'Emotet', 'Reconyc', 'Refroso', 'Scar', 'Zeus'
]

def detect_family(row):
    row_str = row.astype(str).str.cat(sep=' ').lower()
    for family in families:
        if family.lower() in row_str:
            return family

data_malware_families['Family'] = data_malware_families.apply(detect_family, axis=1)

data_malware_families = data_malware_families[data_malware_families['Family'] != '180solutions']
data_malware_families = data_malware_families[data_malware_families['Family'] != 'CWS']
data_malware_families = data_malware_families[data_malware_families['Family'] != 'Gator']
data_malware_families = data_malware_families[data_malware_families['Family'] != 'TIBS']
data_malware_families = data_malware_families[data_malware_families['Family'] != 'Transponder']

max_rows_dataset = 500

ako_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Ako'])
conti_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Conti'])
maze_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Maze'])
pysa_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Pysa'])
shade_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Shade'])

emotet_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Emotet'])
reconyc_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Reconyc'])
refroso_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Refroso'])
scar_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Scar'])
zeus_rows = round(max_rows_dataset * data_malware_families['Family'].value_counts(normalize=True)['Zeus'])

family_counts = {
    'Ako': ako_rows,
    'Conti': conti_rows,
    'Maze': maze_rows,
    'Pysa': pysa_rows,
    'Shade': shade_rows,

    'Emotet': emotet_rows,
    'Reconyc': reconyc_rows,
    'Refroso': refroso_rows,
    'Scar': scar_rows,
    'Zeus': zeus_rows
}

def sample_custom(group):
    name = group.name
    n = family_counts.get(name, 0)
    return group.sample(n=min(n, len(group)))
    #return group.sample(n=min(n, len(group)), random_state=42)

# Application du sampling
data_malware_families_filtered = data_malware_families.groupby('Family', group_keys=False).apply(sample_custom)

  data_malware_families_filtered = data_malware_families.groupby('Family', group_keys=False).apply(sample_custom)


## For the rest

In [4]:
X  = data_malware_families_filtered.drop(['Category', 'Class', 'Family'], axis=1)
y = LabelEncoder().fit_transform(data_malware_families_filtered['Family'])

In [5]:
scoring = {
    "accuracy": "accuracy",
    "precision_macro": make_scorer(precision_score, average="macro", zero_division=0),
    "recall_macro": make_scorer(recall_score, average="macro", zero_division=0),
    "f1_macro": make_scorer(f1_score, average="macro", zero_division=0),
}

In [6]:
param_normal = {
      "random_state": [42]
  }

param_grid = {
    "max_depth": [1, 5, 10, 15, 20, 25, 30],
    "n_estimators": [5, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    "criterion": ['log_loss', 'entropy', 'gini'],
    "max_features": ["log2", "sqrt", None],
    "random_state": [42]
  }

param_dist = {
    "max_depth": randint(1, 31),
    "n_estimators": randint(5, 501),
    "criterion": ['log_loss', 'entropy', 'gini'],
    "max_features": ["log2", "sqrt", None],
    "random_state": [42]
}

rf = RandomForestClassifier()
#LGBM = LGBMClassifier()
#xgb_ = xgb.XGBClassifier()

### Grid Search

In [7]:
logreg_cv = GridSearchCV(rf, param_grid, cv=5, scoring=scoring, refit="f1_macro") #, n_jobs=-1)

In [8]:
logreg_cv.fit(X, y)

In [9]:
print("Tuned RF Parameters: {}".format(logreg_cv.best_params_))
best_idx = logreg_cv.best_index_
print(f"CV accuracy:        {logreg_cv.cv_results_['mean_test_accuracy'][best_idx]:.4f}")
print(f"CV precision_macro: {logreg_cv.cv_results_['mean_test_precision_macro'][best_idx]:.4f}")
print(f"CV recall_macro:    {logreg_cv.cv_results_['mean_test_recall_macro'][best_idx]:.4f}")
print(f"CV f1_macro:        {logreg_cv.cv_results_['mean_test_f1_macro'][best_idx]:.4f}")

Tuned RF Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'n_estimators': 300, 'random_state': 42}
CV accuracy:        0.2989
CV precision_macro: 0.2892
CV recall_macro:    0.2976
CV f1_macro:        0.2863


### Random Search

In [10]:
tree_cv = RandomizedSearchCV(rf, param_dist, cv=5, scoring=scoring, refit="f1_macro")
tree_cv.fit(X, y)

In [11]:
print("Tuned RF Parameters: {}".format(tree_cv.best_params_))
best_idx = tree_cv.best_index_
print(f"CV accuracy:        {tree_cv.cv_results_['mean_test_accuracy'][best_idx]:.4f}")
print(f"CV precision_macro: {tree_cv.cv_results_['mean_test_precision_macro'][best_idx]:.4f}")
print(f"CV recall_macro:    {tree_cv.cv_results_['mean_test_recall_macro'][best_idx]:.4f}")
print(f"CV f1_macro:        {tree_cv.cv_results_['mean_test_f1_macro'][best_idx]:.4f}")

Tuned RF Parameters: {'criterion': 'entropy', 'max_depth': 12, 'max_features': None, 'n_estimators': 188, 'random_state': 42}
CV accuracy:        0.2848
CV precision_macro: 0.2737
CV recall_macro:    0.2848
CV f1_macro:        0.2734


### Without Tuning

In [12]:
logreg_cv = GridSearchCV(rf, param_normal, cv=5, scoring=scoring, refit="f1_macro") #, n_jobs=-1)
logreg_cv.fit(X, y)

In [13]:
print("Tuned RF Parameters: {}".format(logreg_cv.best_params_))
best_idx = logreg_cv.best_index_
print(f"CV accuracy:        {logreg_cv.cv_results_['mean_test_accuracy'][best_idx]:.4f}")
print(f"CV precision_macro: {logreg_cv.cv_results_['mean_test_precision_macro'][best_idx]:.4f}")
print(f"CV recall_macro:    {logreg_cv.cv_results_['mean_test_recall_macro'][best_idx]:.4f}")
print(f"CV f1_macro:        {logreg_cv.cv_results_['mean_test_f1_macro'][best_idx]:.4f}")

Tuned RF Parameters: {'random_state': 42}
CV accuracy:        0.2788
CV precision_macro: 0.2704
CV recall_macro:    0.2782
CV f1_macro:        0.2681


### Grid for each Model

In [14]:
## XGBOOST
param_dist = {
    "learning_rate": uniform(0.01, 0.5),
    "eta": uniform(0.01, 0.5),
    "max_depth": randint(1, 31),
    "subsample": uniform(0.01, 1),
    "random_state": [42]
}

In [15]:
## RF
param_dist = {
    "max_depth": randint(1, 31),
    "n_estimators": randint(5, 501),
    "criterion": ['log_loss', 'entropy', 'gini'],
    "max_features": ["log2", "sqrt", None, "auto"],
    "random_state": [42]
}

In [16]:
## LGBM
param_grid = {
    "learning_rate": [0.01, 0.02, 0.03, 0.04, 0.05],
    "max_bin": [5, 10, 15, 20, 25, 30],
    "max_depth": [5, 10, 15, 20, 25],
    "num_leaves": [20, 40, 60, 80, 100],
      "random_state": [42]
  }

param_dist = {
    "learning_rate": uniform(0.01, 0.05),
    "max_bin": randint(5, 31),
    "max_depth": randint(5, 26),
    "num_leaves": randint(20, 101),
    "random_state": [42]
}