In [1]:
# SMOTE and random-oversampling (ROS) variants of the NB and RF models
from models.V2.nb import NB
from models.V2.rf import RF
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from concurrent import futures as cf
from train_models import train_eval_model, to_model_properties, get_instantiated_models

def smote(X_train, y_train):
    """Resample the training data with imblearn's SMOTE.

    Args:
        X_train: Training features accepted by ``SMOTE.fit_resample``.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        A ``(X_resampled, y_resampled)`` tuple as produced by
        ``SMOTE(random_state=42).fit_resample``.
    """
    # Fixed seed keeps the resampling repeatable between runs.
    sampler = SMOTE(random_state=42)
    features, labels = sampler.fit_resample(X_train, y_train)
    return features, labels

def random_oversample(X_train, y_train):
    """Resample the training data with imblearn's RandomOverSampler.

    Args:
        X_train: Training features accepted by ``RandomOverSampler.fit_resample``.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        A ``(X_resampled, y_resampled)`` tuple as produced by
        ``RandomOverSampler(random_state=42).fit_resample``.
    """
    # Fixed seed keeps the resampling repeatable between runs.
    sampler = RandomOverSampler(random_state=42)
    features, labels = sampler.fit_resample(X_train, y_train)
    return features, labels

class NB_SMOTE(NB):
    """NB model that SMOTE-resamples its training data before fitting."""

    def fit_vectorized(self, X_train, y_train=None):
        """Resample ``(X_train, y_train)`` with ``smote`` and fit the parent NB on the result.

        NOTE(review): ``y_train`` defaults to None but is passed straight to
        ``smote``; confirm callers always supply labels.
        """
        # smote() returns an (X, y) pair, which is forwarded positionally.
        super().fit_vectorized(*smote(X_train, y_train))

    def get_model_type(self):
        """Return the parent's model type with a ``_SMOTE`` suffix appended."""
        base_type = super().get_model_type()
        return base_type + '_SMOTE'
    
class RF_SMOTE(RF):
    """RF model that SMOTE-resamples its training data before fitting."""

    def fit_vectorized(self, X_train, y_train):
        """Resample ``(X_train, y_train)`` with ``smote`` and fit the parent RF on the result."""
        # smote() returns an (X, y) pair, which is forwarded positionally.
        super().fit_vectorized(*smote(X_train, y_train))

    def get_model_type(self):
        """Return the parent's model type with a ``_SMOTE`` suffix appended."""
        base_type = super().get_model_type()
        return base_type + '_SMOTE'

class NB_ROS(NB):
    """NB model that randomly oversamples its training data before fitting."""

    def fit_vectorized(self, X_train, y_train=None):
        """Resample ``(X_train, y_train)`` with ``random_oversample`` and fit the parent NB.

        NOTE(review): ``y_train`` defaults to None but is passed straight to
        ``random_oversample``; confirm callers always supply labels.
        """
        # random_oversample() returns an (X, y) pair, which is forwarded positionally.
        super().fit_vectorized(*random_oversample(X_train, y_train))

    def get_model_type(self):
        """Return the parent's model type with a ``_ROS`` suffix appended."""
        base_type = super().get_model_type()
        return base_type + '_ROS'

class RF_ROS(RF):
    """RF model that randomly oversamples its training data before fitting."""

    def fit_vectorized(self, X_train, y_train):
        """Resample ``(X_train, y_train)`` with ``random_oversample`` and fit the parent RF."""
        # random_oversample() returns an (X, y) pair, which is forwarded positionally.
        super().fit_vectorized(*random_oversample(X_train, y_train))

    def get_model_type(self):
        """Return the parent's model type with a ``_ROS`` suffix appended."""
        base_type = super().get_model_type()
        return base_type + '_ROS'


def get_instantiated_oversampled_models():
    """Build every oversampled model / vectorizer / n-gram combination.

    For each vectorizer class (CountVectorizer, then TfidfVectorizer), each
    n in 1..5, and each of NB_SMOTE, RF_SMOTE, NB_ROS, RF_ROS, a model is
    constructed around a fresh vectorizer configured with
    ``ngram_range=(n, n)`` and a word-boundary token pattern.

    Returns:
        A list of instantiated models, ordered by vectorizer, then n-gram
        size, then model class.
    """
    word_pattern = r'\b\w+\b'
    model_classes = (NB_SMOTE, RF_SMOTE, NB_ROS, RF_ROS)
    vectorizer_classes = (CountVectorizer, TfidfVectorizer)
    return [
        model_cls(vectorizer_cls(ngram_range=(n, n), token_pattern=word_pattern))
        for vectorizer_cls in vectorizer_classes
        for n in range(1, 6)
        for model_cls in model_classes
    ]

def get_instantiated_normal_models():
    """Return the version-2 models whose ``get_model_name()`` is 'NB' or 'RF'.

    The candidates come from ``get_instantiated_models(version=2)``; their
    relative order is preserved.
    """
    wanted_names = ['NB', 'RF']
    baseline_models = []
    for candidate in get_instantiated_models(version=2):
        if candidate.get_model_name() in wanted_names:
            baseline_models.append(candidate)
    return baseline_models

def train_eval_models(models, X_train, X_test, y_train, y_test, max_workers=4):
    """Train and evaluate every model on a thread pool and collect one row per model.

    Each model is submitted to ``train_eval_model`` together with the shared
    train/test data. The returned ``(model, model_scores)`` pair is turned
    into a single dict by merging ``model_scores`` with
    ``to_model_properties(model)`` (property keys win on collision).

    Args:
        models: Iterable of models to train and evaluate.
        X_train, X_test, y_train, y_test: Data passed unchanged to
            ``train_eval_model`` for every model.
        max_workers: Size of the thread pool. Defaults to 4.

    Returns:
        A list of row dicts in the same order as ``models``, so repeated runs
        produce identically ordered results regardless of which worker
        finishes first.

    Raises:
        Any exception raised by ``train_eval_model`` or
        ``to_model_properties`` is propagated to the caller.
    """
    with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(train_eval_model, model, X_train, X_test, y_train, y_test)
            for model in models
        ]
        scores = []
        # Collect in submission order (not completion order) so the output is
        # deterministic; result() blocks until that particular future is done.
        for future in futures:
            model, model_scores = future.result()
            scores.append(model_scores | to_model_properties(model))
    return scores

In [2]:
from train_models import train_test_split_df
from log_reader import read_all_logs
from preprocessors.preprocessor import Preprocessor
import pandas as pd

# Load the version-2 logs and run them through the version-2 preprocessor.
df = read_all_logs(version=2)
df = Preprocessor.get(version=2).preprocess(df)
# Split once so the oversampled and baseline models are trained and evaluated
# on exactly the same train/test data.
X_train, X_test, y_train, y_test = train_test_split_df(df)

# Evaluate the oversampled variants first, then the plain NB/RF baselines.
scores_os = train_eval_models(get_instantiated_oversampled_models(), X_train, X_test, y_train, y_test)
scores = train_eval_models(get_instantiated_normal_models(), X_train, X_test,y_train, y_test)

# One DataFrame row per evaluated model.
scores_os_df = pd.DataFrame(scores_os)
scores_df = pd.DataFrame(scores)

# Pair each baseline row with the oversampled rows that share the same model,
# vectorizer and n-gram range. Columns present in both frames get the
# `_original` / `_oversampled` suffix. `how` is not given, so pandas' default
# join type applies. Note: `scores` is rebound here from the list of baseline
# rows to the merged DataFrame.
scores = scores_df.merge(scores_os_df, on=['Model','Vectorizer','Min Ngram','Max Ngram'], suffixes=('_original', '_oversampled'))
# Positive F1_diff means the oversampled model scored a higher F1 than its baseline.
scores['F1_diff'] = scores['F1_oversampled'] - scores['F1_original']



Reading logs from ../logs/V1
Reading logs from ../logs/V2-1
Reading logs from ../logs/V2
Classifying malicious_ransomwarePOC_10min_1.log as malicious, 13383 malicious entries found
Classifying malicious_ransomwarePOC_10min.log as malicious, 14477 malicious entries found
Classifying malicious_JavaRansomware_30min.log as malicious, 45982 malicious entries found
Classifying malicious_RAASNet-AES_60min.log as malicious, 10348 malicious entries found
Classifying malicious_ransim_20min.log as malicious, 23227 malicious entries found
Classifying malicious_RAASNet-Crypto_20min.log as malicious, 20069 malicious entries found
Classifying malicious_roar-ChaCha20_60min.log as malicious, 33968 malicious entries found
Classifying malicious_roar-AES-CTR_60min.log as malicious, 71676 malicious entries found
Classifying malicious_ransim-slow_60min.log as malicious, 23621 malicious entries found
Classifying malicious_cry_20min.log as malicious, 8185 malicious entries found
Classifying malicious_ransim-s

In [3]:
# Show the baseline vs. oversampled F1 per configuration, largest gain first.
(
    scores[
        [
            'Model',
            'Vectorizer',
            'Min Ngram',
            'F1_original',
            'Model_Type_oversampled',
            'F1_oversampled',
            'F1_diff',
        ]
    ]
    .sort_values(by='F1_diff', ascending=False)
)

Unnamed: 0,Model,Vectorizer,Min Ngram,F1_original,Model_Type_oversampled,F1_oversampled,F1_diff
26,NB,TfidfVectorizer,4,0.927487,Classification_SMOTE,0.946055,0.018567
27,NB,TfidfVectorizer,4,0.927487,Classification_ROS,0.945674,0.018187
22,NB,TfidfVectorizer,3,0.927596,Classification_SMOTE,0.941683,0.014087
23,NB,TfidfVectorizer,3,0.927596,Classification_ROS,0.938744,0.011147
38,RF,TfidfVectorizer,5,0.926706,Classification_SMOTE,0.932218,0.005512
33,RF,CountVectorizer,5,0.928,Classification_SMOTE,0.933333,0.005333
31,NB,TfidfVectorizer,5,0.929495,Classification_ROS,0.934702,0.005207
36,RF,TfidfVectorizer,4,0.94449,Classification_SMOTE,0.949362,0.004871
30,NB,TfidfVectorizer,5,0.929495,Classification_SMOTE,0.933881,0.004386
39,RF,TfidfVectorizer,5,0.926706,Classification_ROS,0.92992,0.003214
