In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [73]:
import os
import time

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from numpy import hstack

from IPython.display import IFrame
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plot_utils import *

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.feature_selection import SelectKBest, f_classif, chi2

from sklearn.utils import shuffle
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

In [5]:
folder_path = 'C:/Users/yaass/OneDrive/Bureau/Parser'

In [162]:
def get_data(folder_path, file_name, index_value = 'md5'):
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=index_value)
    return df

def print_proportion(df, label = 'label'):
    print('Proportion : {:.2f}%'.format(100*sum(df.label)/len(df)))

def create_X_y(folder_path, file_name, drop_null_columns=False, index_value = 'md5'):
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, index_col = index_value)
    X = df.drop('label', axis=1)
    #X = df.drop(['label', 'sublabel'], axis=1)
    if drop_null_columns == True:
        X = X.drop(get_null_columns(X), axis=1)
    y = df['label']
    return shuffle(X, y)

def create_X_y_(df, drop_null_columns=False, index_value = 'md5'):
    X = df.drop('label', axis=1)
    #X = df.drop(['label', 'sublabel'], axis=1)
    if drop_null_columns == True:
        X = X.drop(get_null_columns(X), axis=1)
    y = df['label']
    return shuffle(X, y)

def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=2)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [145]:
def score_model_dataset(df, model):
    X, y = create_X_y_(df)
    scores = evaluate_model(model, X, y)
    return scores


def join_dfs(dfs, labels=['label']):
    joined = dfs[0]
    for df in dfs[1:]:
        joined = joined.join(df.drop(labels, axis=1)).fillna(0)
    return joined


def retrieve_subset(original_df, X, y = None, label='label', features_only=False):
    filtered_columns = original_df.columns.tolist()
    filtered_columns.remove(label)
    if features_only == False :
        filtered_index = original_df.index.intersection(X.index)
        X_new = X.loc[filtered_index, filtered_columns]
        y_new = y.loc[filtered_index]
        return X_new, y_new
    else :
        X_new = X.loc[:, filtered_columns]
        return X_new
    
    
def prepare_datasets(original_dfs, X_train_full, X_test, y_train_full, y_test, test_size = 0.4):
    datasets = []
    X_train, X_eval, y_train, y_eval = train_test_split(X_train_full, y_train_full, test_size = test_size, random_state = 1)
    for df in original_dfs:
        dataset = dict()
        dataset['train'] = retrieve_subset(df, X_train, y_train)
        dataset['eval'] = ( retrieve_subset(df, X_eval, features_only=True), y_eval )
        dataset['test'] = ( retrieve_subset(df, X_test, features_only=True), y_test )
        datasets.append(dataset)
    return datasets
    
    
def fit_ensemble(models, datasets):
    X_meta = list()
    for model, dataset in zip(models, datasets):
        model.fit(*dataset['train'])
        y_pred = model.predict(dataset['eval'][0])
        y_pred = y_pred.reshape(len(y_pred), 1)
        X_meta.append(y_pred)
    X_meta = np.hstack(X_meta)
    blender = LogisticRegression()
    blender.fit(X_meta, dataset['eval'][1])
    return blender


def predict_ensemble(models, blender, datasets):
    X_meta = list()
    for model, dataset in zip(models, datasets):
        y_pred = model.predict(dataset['test'][0])
        y_pred = y_pred.reshape(len(y_pred), 1)
        X_meta.append(y_pred)
    X_meta = np.hstack(X_meta)
    return blender.predict(X_meta)


def get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models):
    datasets = prepare_datasets(original_dfs, X_train_full, X_test, y_train_full, y_test, test_size = 0.4)
    blender = fit_ensemble(models, datasets)
    y_pred = predict_ensemble(models, blender, datasets)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

### API

In [7]:
file_name_1 = 'onehot_encoded_apistats_dataset.csv'

oh_apis = get_data(folder_path, file_name_1)

oh_apis.shape

(3761, 304)

In [35]:
print_proportion(oh_apis)

Proportion : 58.23%


In [148]:
#compute api cross-validation accuracy scores
api_scores = score_model_dataset(oh_apis, RandomForestClassifier())
np.mean(api_scores)

### DLL

In [8]:
file_name_2 = 'onehot_encoded_dll_dataset.csv'

dll_loaded = get_data(folder_path, file_name_2)

dll_loaded.shape

(3162, 2242)

In [34]:
print_proportion(dll_loaded)

Proportion : 58.82%


In [149]:
#compute dll cross-validation accuracy scores
dll_scores = score_model_dataset(dll_loaded, RandomForestClassifier())
np.mean(dll_scores)

### PE Imports

In [170]:
file_name_3 = 'pe_entropy_dataset.csv'

pe_imports = get_data(folder_path, file_name_3)

if 'sublabel' in pe_imports.columns.tolist():
    pe_imports.drop('sublabel', axis=1, inplace=True)

pe_imports.shape

(4308, 795)

In [171]:
print_proportion(pe_imports)

Proportion : 60.35%


In [172]:
#compute pe_imports cross-validation accuracy scores
pe_imports_scores = score_model_dataset(pe_imports, RandomForestClassifier())
np.mean(pe_imports_scores)

0.9030475368262021

## Joining Datasets

### API and DLL

In [173]:
original_dfs = [oh_apis, dll_loaded]

joined_ = join_dfs(original_dfs)

#### Joining

In [None]:
joining_scores = score_model_dataset(joined_, RandomForestClassifier())

#### Blending

*Train-Test-Split*

In [130]:
X, y = create_X_y_(joined_)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size = 0.1, random_state = 1)

models = [RandomForestClassifier(), RandomForestClassifier()]

score = get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models)

print('Blending Accuracy: %.3f' % score)

Blending Accuracy: 0.947


*Cross-validation*

In [131]:
CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

models = [RandomForestClassifier(), RandomForestClassifier()]

blending_scores = []

X, y = create_X_y_(joined_)

for train_index, test_index in CV.split(X, y):
    X_train_full, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train_full, y_test = y[train_index], y[test_index]
    score = get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models)
    blending_scores.append(score)

0.9357469383148033

In [144]:
fig = plot_evaluation_boxplots([api_scores, dll_scores, accuracy_scores, joining_scores], 
                               names = ['API', 'DLL', 'Blended', 'Joined'], 
                               title = 'Model Performance on joined "apistats" and "dll" Data', 
                               y_axis = 'Accuracy')

figure_path = 'joined/evaluation_joined_api_dll.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

### API and PE Imports

In [176]:
original_dfs = [pe_imports, oh_apis]

joined = join_dfs(original_dfs)

#### Joining

In [178]:
joining_scores = score_model_dataset(joined, RandomForestClassifier())

#### Blending

In [179]:
CV = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

models = [RandomForestClassifier(), RandomForestClassifier()]

blending_scores = []

X, y = create_X_y_(joined)

for train_index, test_index in CV.split(X, y):
    X_train_full, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train_full, y_test = y[train_index], y[test_index]
    score = get_blender_accuracy(original_dfs, X_train_full, X_test, y_train_full, y_test, models)
    blending_scores.append(score)

In [180]:
fig = plot_evaluation_boxplots([api_scores, pe_imports_scores, blending_scores, joining_scores], 
                               names = ['API', 'PE Imports', 'Blended', 'Joined'], 
                               title = 'Model Performance on joined "apistats" and "pe imports" Data', 
                               y_axis = 'Accuracy')

figure_path = 'joined/evaluation_joined_api_pe_imports.html'

save_figures_to_html(figure_path, [fig])

IFrame(figure_path, width=900, height=600)

#### Nested cross-validation

In [81]:
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)

outer_results = { 'accuracy' : [], 'best_params' : [] }

#X, y = create_X_y(folder_path, file_name)
X, y = create_X_y_(joined)

for train_ix, test_ix in cv_outer.split(X) :
    
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    
    cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)
    
    pipe = Pipeline(steps=[('kbest', SelectKBest(f_classif, k=5)), ('rf', RandomForestClassifier(random_state=1))])
    
    model = RandomForestClassifier(random_state=1)
    
    grid = dict()
    grid['rf__n_estimators'] = [50, 100, 150, 200]
    grid['rf__max_features'] = [10, 20, 'sqrt', 'log2']
    #grid['kbest__k'] = [10, 20, 30, 50]
    #grid['kbest__score_func'] = [f_classif, chi2]
    
    space = dict()
    space['n_estimators'] = [50, 100, 150, 200]
    space['max_features'] = [10, 20, 'sqrt', 'log2']
    
    search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)
    
    result = search.fit(X_train, y_train)

    best_model = result.best_estimator_
    best_params = result.best_params_

    yhat = best_model.predict(X_test)

    acc = accuracy_score(y_test, yhat)

    outer_results['accuracy'].append(acc)
    outer_results['best_params'].append(best_params)

**API**

In [27]:
np.mean(outer_results)

0.9460254529036629

**API avec SelectKBest**

In [33]:
np.mean(outer_results['accuracy'])

0.8921939212658213

In [34]:
accuracies = outer_results['accuracy']
max_ix = [i for i, x in enumerate(accuracies) if x == max(accuracies)]
outer_results['best_params'][max_ix[0]]

{'kbest__k': 50,
 'kbest__score_func': <function sklearn.feature_selection._univariate_selection.f_classif(X, y)>,
 'rf__n_estimators': 200}