In [38]:
# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import sklearn.decomposition
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

In [39]:
# load data
def load_data(filepath):
    """Read the CSV at ``filepath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filepath)

# Load ./dataset/CLAMP_Train.csv into `df`, the frame the cells below read from.
df = load_data("./dataset/CLAMP_Train.csv")

In [40]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,class
0,23117,144,3,0,4,0,65535,0,184,0,...,65452,2,1024,1048576,4096,1048576,4096,0,16,0
1,23117,144,3,0,4,0,65535,0,184,0,...,0,2,0,1048576,4096,1048576,4096,0,16,1
2,23117,144,3,0,4,0,65535,0,184,0,...,119291,2,0,1048576,4096,1048576,4096,0,16,0
3,23117,144,3,0,4,0,65535,0,184,0,...,91168,3,320,262144,4096,1048576,4096,0,16,0
4,23117,144,3,0,4,0,65535,0,184,0,...,82656,2,32768,1048576,4096,1048576,4096,0,16,1


In [41]:
def find_dataset_statistics(dataset:pd.DataFrame,target_col:str) -> tuple[int,int,int,int,float]:
    """Summarise the size and class balance of a binary-labelled dataset.

    Args:
        dataset: Frame holding the feature columns and the target column.
        target_col: Name of the target column. Rows equal to 0 are counted as
            negative, rows equal to 1 as positive.

    Returns:
        ``(n_records, n_columns, n_negative, n_positive, perc_positive)`` where
        ``perc_positive`` is the share of positive rows in percent
        (``0.0`` when the frame has no rows).
    """
    n_records, n_columns = dataset.shape
    target = dataset[target_col]

    # Count with boolean masks rather than value_counts()[label]: a label that
    # never occurs then contributes 0 instead of raising KeyError.
    n_negative = int((target == 0).sum())
    n_positive = int((target == 1).sum())

    # Guard the empty frame so the percentage does not divide by zero.
    perc_positive = (n_positive / n_records) * 100 if n_records else 0.0

    return n_records, n_columns, n_negative, n_positive, perc_positive

In [42]:

# def train_test_split(  dataset: pd.DataFrame,
#                        target_col: str, 
#                        test_size: float,
#                        stratify: bool,
#                        random_state: int) -> tuple[pd.DataFrame,pd.DataFrame,pd.Series,pd.Series]:
    
# #    split the dataset into train and test datasets
#     train, test = train_test_split(dataset, test_size=test_size, random_state=random_state, stratify=dataset[target_col] if stratify else None )

#     train_features = train.drop(target_col, axis=1)
#     test_features = test.drop(target_col, axis=1)
#     train_targets = train[target_col]
#     test_targets = test[target_col]

#     return train_features,test_features,train_targets,test_targets

def _split_features_and_targets(frame: pd.DataFrame, target_col: str):
    """Return ``(features, targets)``: ``frame`` without ``target_col``, and that column."""
    return frame.drop(target_col, axis=1), frame[target_col]


def train_test_split(dataset: pd.DataFrame, target_col: str, test_size: float, stratify: bool = False, random_state: int = None):
    """Split ``dataset`` into train/test features and targets.

    Args:
        dataset: Frame containing the feature columns and ``target_col``.
            The stratified path removes test rows by index label, so it
            assumes the index is unique.
        target_col: Name of the target column.
        test_size: Fraction of rows to put in the test set.
        stratify: When True, ``int(class_size * test_size)`` rows are sampled
            from every class found in ``target_col`` so the test set keeps
            the class proportions. When False the split is delegated to
            ``sklearn.model_selection.train_test_split``.
        random_state: Seed forwarded to the sampling / sklearn split.

    Returns:
        ``(train_features, test_features, train_targets, test_targets)``.
    """
    if stratify:
        labels = dataset[target_col]

        # Stratify over every class present instead of hard-coding 0 and 1.
        # Descending order keeps the previous layout for 0/1 targets:
        # positive rows first, then negative rows.
        classes = sorted(labels.dropna().unique(), reverse=True)

        test_parts = []
        for cls in classes:
            members = dataset[labels == cls]
            n_test = int(len(members) * test_size)
            test_parts.append(members.sample(n_test, random_state=random_state))

        # pd.concat rejects an empty list, which happens for an empty dataset.
        test = pd.concat(test_parts) if test_parts else dataset.iloc[0:0]

        # Everything that was not sampled into the test set is training data.
        train = dataset.drop(test.index)
    else:
        train, test = sklearn.model_selection.train_test_split(dataset, test_size=test_size, random_state=random_state)

    train_features, train_targets = _split_features_and_targets(train, target_col)
    test_features, test_targets = _split_features_and_targets(test, target_col)

    return train_features, test_features, train_targets, test_targets


In [43]:
class PreprocessDataset:
    """Preprocessing steps fitted on the training split and applied to both splits.

    Each ``*_train`` method fits its transformer on ``train_features``; the
    matching ``*_test`` method reuses that fitted transformer on
    ``test_features`` (fitting it on the training split first when the train
    method has not been called yet).

    Args:
        train_features: Training features.
        test_features: Test features.
        one_hot_encode_cols: Columns to one-hot encode.
        min_max_scale_cols: Columns to min-max scale.
        n_components: Forwarded to ``sklearn.decomposition.PCA``; ``None``
            leaves the number of components to PCA.
        feature_engineering_functions: Mapping of new column name to a
            callable that receives the feature frame and returns the values
            for that column. ``None`` means no engineered columns.
    """

    def __init__(self,
                 train_features:pd.DataFrame,
                 test_features:pd.DataFrame,
                 one_hot_encode_cols:list[str],
                 min_max_scale_cols:list[str],
                 n_components:int = None,
                 feature_engineering_functions:dict = None,
                 ):

        self.one_hot_encode_cols = one_hot_encode_cols
        self.test_features = test_features
        self.train_features = train_features
        self.min_max_scale_cols = min_max_scale_cols
        # These two were read by the PCA / feature-engineering methods but never
        # stored, so those methods always failed with AttributeError.
        self.n_components = n_components
        self.feature_engineering_functions = dict(feature_engineering_functions or {})

        # Fitted transformers; populated by the *_train methods.
        self.encoder = None
        self.mms = None
        self.pca = None
        self.pca_input_cols = None

    # ------------------------------------------------------------------ #
    # One-hot encoding
    # ------------------------------------------------------------------ #
    def _fit_one_hot_encoder(self):
        """Fit a fresh one-hot encoder on the training columns and remember it."""
        # handle_unknown='ignore': categories seen only in the test split are
        # encoded as all zeros instead of raising at transform time.
        self.encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encoder.fit(self.train_features[self.one_hot_encode_cols])
        return self.encoder

    def _apply_one_hot_encoder(self, features:pd.DataFrame) -> pd.DataFrame:
        """Replace ``one_hot_encode_cols`` in ``features`` with their encoded columns."""
        encoded = pd.DataFrame(
            self.encoder.transform(features[self.one_hot_encode_cols]),
            columns=self.encoder.get_feature_names_out(self.one_hot_encode_cols),
            index=features.index,
        )
        # Encoded columns first, untouched columns after (same layout as before).
        return pd.concat([encoded, features.drop(self.one_hot_encode_cols, axis=1)], axis=1)

    def one_hot_encode_columns_train(self) -> pd.DataFrame:
        """Fit the encoder on the training split and return the encoded training frame."""
        self._fit_one_hot_encoder()
        return self._apply_one_hot_encoder(self.train_features)

    def one_hot_encode_columns_test(self) -> pd.DataFrame:
        """Encode the test split with the encoder fitted on the training split."""
        if getattr(self, 'encoder', None) is None:
            self._fit_one_hot_encoder()
        return self._apply_one_hot_encoder(self.test_features)

    # ------------------------------------------------------------------ #
    # Min-max scaling
    # ------------------------------------------------------------------ #
    def _fit_min_max_scaler(self):
        """Fit a fresh MinMaxScaler on the training columns and remember it."""
        self.mms = sklearn.preprocessing.MinMaxScaler()
        self.mms.fit(self.train_features[self.min_max_scale_cols])
        return self.mms

    def _apply_min_max_scaler(self, features:pd.DataFrame) -> pd.DataFrame:
        """Replace ``min_max_scale_cols`` in ``features`` with their scaled values."""
        scaled = pd.DataFrame(
            self.mms.transform(features[self.min_max_scale_cols]),
            columns=self.mms.get_feature_names_out(self.min_max_scale_cols),
            index=features.index,
        )
        # Scaled columns first, untouched columns after (same layout as before).
        return pd.concat([scaled, features.drop(self.min_max_scale_cols, axis=1)], axis=1)

    def min_max_scaled_columns_train(self) -> pd.DataFrame:
        """Fit the scaler on the training split and return the scaled training frame."""
        self._fit_min_max_scaler()
        return self._apply_min_max_scaler(self.train_features)

    def min_max_scaled_columns_test(self) -> pd.DataFrame:
        """Scale the test split with the scaler fitted on the training split."""
        if getattr(self, 'mms', None) is None:
            self._fit_min_max_scaler()
        return self._apply_min_max_scaler(self.test_features)

    # ------------------------------------------------------------------ #
    # PCA
    # ------------------------------------------------------------------ #
    @staticmethod
    def _components_frame(components, index) -> pd.DataFrame:
        """Wrap a PCA output array in a frame with ``component_<i>`` column names."""
        # Name columns from the actual output width so n_components=None works.
        names = [f"component_{i+1}" for i in range(components.shape[1])]
        return pd.DataFrame(components, columns=names, index=index)

    def pca_train(self) -> pd.DataFrame:
        """Fit PCA on the NaN-free training columns and return the projected training rows."""
        clean_df = self.train_features.dropna(axis=1)
        # Remember which columns survived so the test split is projected from
        # exactly the same inputs.
        self.pca_input_cols = list(clean_df.columns)

        self.pca = sklearn.decomposition.PCA(n_components=self.n_components, random_state=0)
        self.pca.fit(clean_df)

        return self._components_frame(self.pca.transform(clean_df), clean_df.index)

    def pca_test(self) -> pd.DataFrame:
        """Project the test split with the PCA fitted on the training split."""
        if getattr(self, 'pca', None) is None:
            self.pca_train()
        test_inputs = self.test_features[self.pca_input_cols]
        return self._components_frame(self.pca.transform(test_inputs), test_inputs.index)

    # ------------------------------------------------------------------ #
    # Feature engineering
    # ------------------------------------------------------------------ #
    def feature_engineering_train(self) -> pd.DataFrame:
        """Return a copy of the training features with the engineered columns added."""
        feature_engineered_dataset = self.train_features.copy()
        for feature_name, function in self.feature_engineering_functions.items():
            feature_engineered_dataset[feature_name] = function(self.train_features)
        return feature_engineered_dataset

    def feature_engineering_test(self) -> pd.DataFrame:
        """Return a copy of the test features with the engineered columns added."""
        feature_engineered_dataset = self.test_features.copy()
        for feature_name, function in self.feature_engineering_functions.items():
            feature_engineered_dataset[feature_name] = function(self.test_features)
        return feature_engineered_dataset

    # ------------------------------------------------------------------ #
    # Full pipeline
    # ------------------------------------------------------------------ #
    def preprocess(self):
        """Run encoding, scaling and feature engineering and combine the results.

        Returns:
            ``(train_features_combined, test_features_combined)``.
        """
        # One-hot encode (train first so the encoder is fitted before the test call)
        train_features_encoded = self.one_hot_encode_columns_train()
        test_features_encoded = self.one_hot_encode_columns_test()

        # Scale numerical columns using min-max scaling
        train_features_scaled = self.min_max_scaled_columns_train()
        test_features_scaled = self.min_max_scaled_columns_test()

        # Apply feature engineering functions to data
        train_features_engineered = self.feature_engineering_train()
        test_features_engineered = self.feature_engineering_test()

        # NOTE(review): the positional slices below are kept exactly as they
        # were: first scaled column, encoded frame minus its last two columns,
        # last two columns of the engineered frame. They look tied to one
        # specific column layout -- confirm the intended selection.
        train_features_combined = pd.concat([train_features_scaled.iloc[:, 0], train_features_encoded.iloc[:, :-2], train_features_engineered.iloc[:, -2:]], axis=1, join='inner')
        test_features_combined = pd.concat([test_features_scaled.iloc[:, 0], test_features_encoded.iloc[:, :-2], test_features_engineered.iloc[:, -2:]], axis=1, join='inner')

        return train_features_combined, test_features_combined

In [None]:

class KmeansClustering:
    """Add a k-means cluster id feature to the train and test splits.

    The number of clusters is chosen with yellowbrick's ``KElbowVisualizer``
    on the training split; the final model is fitted on the training split
    and reused to label the test split.

    Args:
        train_features: Training features used to pick k and fit the model.
        test_features: Test features labelled with the fitted model.
        random_state: Seed passed to every ``KMeans`` instance.
    """

    def __init__(self,
                 train_features:pd.DataFrame,
                 test_features:pd.DataFrame,
                 random_state: int
                ):

        self.train_features = train_features
        self.test_features = test_features
        self.random_state = random_state
        # Set by kmeans_train().
        self.kmeans = None
        self.optimal_k = None

    def kmeans_train(self) -> list:
        """Pick k with the elbow method, fit k-means and return the training cluster ids.

        Raises:
            ValueError: If the elbow visualizer reports no elbow value.
        """
        # Pass the estimator itself to the visualizer; calling the instance
        # (``kms(...)``) raised TypeError.
        elbow_estimator = KMeans(random_state=self.random_state, n_init=10)
        visualizer = KElbowVisualizer(elbow_estimator, k=(1,10))
        visualizer.fit(self.train_features)
        self.optimal_k = visualizer.elbow_value_

        if self.optimal_k is None:
            raise ValueError("KElbowVisualizer found no elbow for k in (1, 10); cannot choose n_clusters")

        # Store the fitted model on the instance so kmeans_test() can reuse it.
        # n_init matches the estimator used for the elbow search.
        self.kmeans = KMeans(n_clusters=self.optimal_k, random_state=self.random_state, n_init=10)
        self.kmeans.fit(self.train_features)

        # Get the cluster ids for each row of the training data
        cluster_ids = self.kmeans.predict(self.train_features)

        return cluster_ids.tolist()

    def kmeans_test(self) -> list:
        """Return the cluster id of every test row, fitting on the training split first if needed."""
        if getattr(self, 'kmeans', None) is None:
            self.kmeans_train()
        cluster_ids = self.kmeans.predict(self.test_features)
        return cluster_ids.tolist()

    def train_add_kmeans_cluster_id_feature(self) -> pd.DataFrame:
        """Return a copy of the training features with a ``kmeans_cluster_id`` column."""
        cluster_ids = self.kmeans_train()
        # Work on a copy so the stored training frame is left untouched.
        train_data = self.train_features.copy()
        train_data['kmeans_cluster_id'] = cluster_ids

        return train_data

    def test_add_kmeans_cluster_id_feature(self) -> pd.DataFrame:
        """Return a copy of the test features with a ``kmeans_cluster_id`` column."""
        cluster_ids = self.kmeans_test()

        # Work on a copy so the stored test frame is left untouched.
        test_data = self.test_features.copy()
        test_data['kmeans_cluster_id'] = cluster_ids
        return test_data

In [44]:
# Show the column labels of the loaded frame.
df.columns

Index(['e_magic', 'e_cblp', 'e_cp', 'e_crlc', 'e_cparhdr', 'e_minalloc',
       'e_maxalloc', 'e_ss', 'e_sp', 'e_csum', 'e_ip', 'e_cs', 'e_lfarlc',
       'e_ovno', 'e_res', 'e_oemid', 'e_oeminfo', 'e_res2', 'e_lfanew',
       'Machine', 'NumberOfSections', 'CreationYear', 'PointerToSymbolTable',
       'NumberOfSymbols', 'SizeOfOptionalHeader', 'Characteristics', 'Magic',
       'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
       'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
       'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
       'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSiz

In [50]:
# Smoke test of the helpers defined in the cells above

# Dataset-level statistics for the 'class' target
n_records, n_columns, n_negative, n_positive, perc_positive = find_dataset_statistics(df, 'class')

# Column configuration for the split and the preprocessing steps
target_col = 'class'
min_max_scale_cols = [
    'Characteristics',
    'SizeOfCode',
    'SizeOfInitializedData',
    'AddressOfEntryPoint',
    'BaseOfData',
    'ImageBase',
    'SizeOfImage',
    'CheckSum',
    'DllCharacteristics',
    'SizeOfStackReserve',
]
one_hot_encode_cols = [
    'MajorLinkerVersion',
    'MinorLinkerVersion',
    'MinorOperatingSystemVersion',
    'MajorImageVersion',
    'MinorImageVersion',
    'MajorSubsystemVersion',
    'Subsystem',
]

# Stratified split with 20% of the rows held out for testing
train_features, test_features, train_targets, test_targets = train_test_split(
    df,
    target_col,
    test_size=0.2,
    stratify=True,
    random_state=42,
)

train_features = pd.DataFrame(train_features)
test_features = pd.DataFrame(test_features)

preprocess = PreprocessDataset(
    train_features,
    test_features,
    one_hot_encode_cols,
    min_max_scale_cols,
)

# Run the two train-side transforms; only the scaled frame is written to disk
ohe = preprocess.one_hot_encode_columns_train()
mms = preprocess.min_max_scaled_columns_train()
mms.to_csv('new_df.csv')

In [46]:
# Collect the names of the object-dtype columns and display them.
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols

Index([], dtype='object')

In [47]:
# Distinct values found in the 'MinorOperatingSystemVersion' column.
class_values = df['MinorOperatingSystemVersion'].unique()
class_values

array([0, 1, 2, 3, 4])

In [48]:
# Show the dtype of every column.
df.dtypes

e_magic                          int64
e_cblp                           int64
e_cp                             int64
e_crlc                           int64
e_cparhdr                        int64
e_minalloc                       int64
e_maxalloc                       int64
e_ss                             int64
e_sp                             int64
e_csum                           int64
e_ip                             int64
e_cs                             int64
e_lfarlc                         int64
e_ovno                           int64
e_res                          float64
e_oemid                          int64
e_oeminfo                        int64
e_res2                         float64
e_lfanew                         int64
Machine                          int64
NumberOfSections                 int64
CreationYear                     int64
PointerToSymbolTable             int64
NumberOfSymbols                  int64
SizeOfOptionalHeader             int64
Characteristics          

In [49]:
# Preview the first rows of df again.
df.head()

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,class
0,23117,144,3,0,4,0,65535,0,184,0,...,65452,2,1024,1048576,4096,1048576,4096,0,16,0
1,23117,144,3,0,4,0,65535,0,184,0,...,0,2,0,1048576,4096,1048576,4096,0,16,1
2,23117,144,3,0,4,0,65535,0,184,0,...,119291,2,0,1048576,4096,1048576,4096,0,16,0
3,23117,144,3,0,4,0,65535,0,184,0,...,91168,3,320,262144,4096,1048576,4096,0,16,0
4,23117,144,3,0,4,0,65535,0,184,0,...,82656,2,32768,1048576,4096,1048576,4096,0,16,1
