In [1]:
from os import path, getcwd
import numpy as np
from numpy import array
import pandas as pd
import sklearn as sk

In [None]:
# Read features.csv and write equal_features.csv, in which the class with more
# samples is randomly cut down to the sample count of the smaller class.
# This allows us to easily test different size datasets e.g. should mal.size = ben.size?

# Load training data
df = pd.read_csv(path.join(getcwd(), "training_data/features.csv"))

# Separate malicious (classification == 1) and benign (classification == 0) rows
malicious_df = df[df['classification'] == 1]
benign_df = df[df['classification'] == 0]

# Get group lengths - the number of distinct samples per class, not the number of rows
malicious_group_len = len(malicious_df.groupby(["sample"]))
benign_group_len = len(benign_df.groupby(["sample"]))

# Initialise variables
target_len, dataset_to_reduce, dataset_to_leave_untouched = 0, None, None

# If there's more malicious groups than benign groups - reduce malicious dataset
if malicious_group_len > benign_group_len:
    target_len = benign_group_len
    dataset_to_reduce = malicious_df
    dataset_to_leave_untouched = benign_df
# If there's more benign groups than malicious groups - reduce benign dataset
elif malicious_group_len < benign_group_len:
    target_len = malicious_group_len
    dataset_to_reduce = benign_df
    dataset_to_leave_untouched = malicious_df

if dataset_to_reduce is None:
    # Both classes already have the same number of samples, so there is nothing
    # to write. Only report it: calling exit() here would shut down the notebook
    # kernel and discard the state built up by earlier cells.
    print("Datasets are already the same size!")
else:
    # Group by sample name - NOTE: this is a slow way to shuffle by group - bound to be better way?!
    dataset_to_reduce_groups = [
        sample_rows for _, sample_rows in dataset_to_reduce.groupby('sample')
    ]

    # Shuffle the per-sample frames and keep the first target_len of them.
    # The shuffle is unseeded, so each run picks a different random subset.
    dataset_to_reduce_groups = sk.utils.shuffle(dataset_to_reduce_groups)[0:target_len]

    # Convert back to dataframe
    dataset_to_reduce_groups = pd.concat(dataset_to_reduce_groups).reset_index(drop=True)

    # Put the reduced subset after the untouched dataset.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    new_features = pd.concat([dataset_to_leave_untouched, dataset_to_reduce_groups])

    # Print to CSV file (just for testing)
    new_features.to_csv("training_data/equal_features.csv", index=False)

In [None]:
# Shuffle features.csv one whole sample at a time and write the result to
# shuffled_features.csv. Rows that share a sample name stay together.

# Read the feature table from the training data directory
df = pd.read_csv(
    path.join(getcwd(), "training_data/features.csv")
)

# Build one frame per sample name and shuffle the order of those frames
# NOTE: this is a slow way to shuffle by group - bound to be better way?!
df_groups = sk.utils.shuffle(
    [sample_rows for _sample_name, sample_rows in df.groupby('sample')]
)

# Stitch the shuffled frames back into a single frame with a fresh 0..n-1 index
shuffled_df = pd.concat(df_groups, ignore_index=True)

# Write the shuffled table out (just for testing)
shuffled_df.to_csv("training_data/shuffled_features.csv", index=False)

In [75]:
# Read results.csv and write averaged_results.csv with one row per
# (layers, neurons_per_layer) combination.
# e.g. Combine all results that use same number of layers + neurons, average them and print

# Load result data
df = pd.read_csv(path.join(getcwd(), "results/results.csv"))

# Group by number of layers and neurons
groups = df.groupby(['layers', 'neurons_per_layer'], as_index=False)

# List of features we want to average (same for every group, so built once)
features_to_avg = ['accuracy', 'precision', 'recall', 'f1', 'kappa', 'roc', 'test_acc', 'test_loss', 'val_acc_best', 'val_acc_avg', 'val_loss_best', 'val_loss_avg', 'train_acc_best', 'train_acc_avg', 'train_loss_best', 'train_loss_avg', 'stopped_epoch', 'best_epoch', 'train_time']

averaged_results = [] # List to hold averaged results

# Loop through each group; iterating a groupby yields (key, rows) pairs
for _group_key, group_rows in groups:
    # Create a new row from the first entry (let's use constants from this e.g. date).
    # Take an explicit copy so the assignments below can never write through to
    # the grouped data or raise SettingWithCopyWarning.
    new_row = group_rows.iloc[0, :].copy()

    # Average the features and store them in row.
    # np.average does not skip NaN, so a missing value makes that average NaN.
    for feature in features_to_avg:
        new_row[feature] = np.average(group_rows[feature])

    # Append the row to list
    averaged_results.append(new_row)

# Convert results to dataframe (one row per group, original column order)
averaged_results = pd.DataFrame(averaged_results)

# Print to new CSV file
averaged_results.to_csv("results/averaged_results.csv", index=False)

In [24]:
# Recompute the rank column of results.csv (useful after several results.csv
# files have been combined) and write the outcome to ranked_results.csv.

df = (
    # Read the combined results
    pd.read_csv(path.join(getcwd(), "results/results.csv"))
    # Rank by descending f1: ties keep their row order, rows without f1 go last
    .assign(
        rank=lambda results: results['f1'].rank(
            ascending=False, method='first', na_option='bottom'
        )
    )
    # Round the listed columns to 3 decimals (the rank uses the unrounded f1)
    .round(dict.fromkeys(
        ['accuracy', 'precision', 'recall', 'f1', 'fit_time', 'test_time'], 3
    ))
)

# Write the re-ranked table to a new CSV file
df.to_csv("results/ranked_results.csv", index=False)

In [2]:
# Read the feature table from the training data directory
df = pd.read_csv(
    path.join(getcwd(), "training_data/features.csv")
)

# Split the rows by class label: 0 is benign, 1 is malicious
benign_df = df.loc[df['classification'].eq(0)]
malicious_df = df.loc[df['classification'].eq(1)]

# Count the distinct sample names in each class (groups, not rows)
benign_group_len = len(benign_df.groupby(["sample"]))
malicious_group_len = len(malicious_df.groupby(["sample"]))
