In [1]:
# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import sklearn.decomposition
from sklearn.model_selection import train_test_split


In [2]:
# load data
def load_data(filepath):
    """Read the CSV file at ``filepath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filepath)

# Load the CLAMP training set; the path is relative to the current working directory.
df = load_data("./dataset/CLAMP_Train.csv")

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,class
0,23117,144,3,0,4,0,65535,0,184,0,...,65452,2,1024,1048576,4096,1048576,4096,0,16,0
1,23117,144,3,0,4,0,65535,0,184,0,...,0,2,0,1048576,4096,1048576,4096,0,16,1
2,23117,144,3,0,4,0,65535,0,184,0,...,119291,2,0,1048576,4096,1048576,4096,0,16,0
3,23117,144,3,0,4,0,65535,0,184,0,...,91168,3,320,262144,4096,1048576,4096,0,16,0
4,23117,144,3,0,4,0,65535,0,184,0,...,82656,2,32768,1048576,4096,1048576,4096,0,16,1


In [4]:
def find_dataset_statistics(dataset:pd.DataFrame,target_col:str) -> tuple[int,int,int,int,float]:
    """Summarise the size and class balance of a binary-labelled dataset.

    Args:
        dataset: Frame holding the features and the target column.
        target_col: Name of the target column; negatives are looked up under
            the label ``0`` and positives under the label ``1``.

    Returns:
        ``(n_records, n_columns, n_negative, n_positive, perc_positive)`` where
        ``perc_positive`` is the share of positive rows expressed in percent
        (``0.0`` when the dataset has no rows).
    """
    n_records, n_columns = dataset.shape

    # Count each class once. `.get(label, 0)` keeps a class that does not occur
    # in the data from raising KeyError and reports it as zero instead.
    class_counts = dataset[target_col].value_counts()
    n_negative = int(class_counts.get(0, 0))
    n_positive = int(class_counts.get(1, 0))

    # Guard the division so an empty frame yields 0% rather than a divide-by-zero.
    perc_positive = (n_positive / n_records) * 100 if n_records else 0.0

    return n_records,n_columns,n_negative,n_positive,perc_positive

In [5]:

def train_test_split(  dataset: pd.DataFrame,
                       target_col: str, 
                       test_size: float,
                       stratify: bool,
                       random_state: int) -> tuple[pd.DataFrame,pd.DataFrame,pd.Series,pd.Series]:
    """Split a dataset into train/test features and targets.

    Args:
        dataset: Frame holding both the features and the target column.
        target_col: Name of the target column to separate from the features.
        test_size: Passed through to scikit-learn's ``train_test_split``.
        stratify: When true, the split is stratified on ``dataset[target_col]``.
        random_state: Seed passed through to scikit-learn's ``train_test_split``.

    Returns:
        ``(train_features, test_features, train_targets, test_targets)``; the
        feature frames have ``target_col`` removed.
    """
    # This function has the same name as scikit-learn's splitter, so defining it
    # rebinds the module-level `train_test_split`. Import the library function
    # under an alias here; calling the bare name would re-enter this function.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    stratify_labels = dataset[target_col] if stratify else None

    # Split the full frame first so features and targets stay row-aligned.
    train, test = sk_train_test_split(
        dataset,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_labels,
    )

    train_features = train.drop(columns=target_col)
    test_features = test.drop(columns=target_col)
    train_targets = train[target_col]
    test_targets = test[target_col]

    return train_features,test_features,train_targets,test_targets



In [None]:
class PreprocessDataset:
    def __init__(self, 
                 train_features:pd.DataFrame, 
                 test_features:pd.DataFrame,
                 one_hot_encode_cols:list[str],
                 min_max_scale_cols:list[str],
                 n_components:int,
                 feature_engineering_functions:dict
                 ):
        # TODO: Add any state variables you may need to make your functions work
        self.one_hot_encode_cols = one_hot_encode_cols
        self.train_features = train_features
        return

    def one_hot_encode_columns_train(self) -> pd.DataFrame:
        """One-hot encode the configured columns of the training features.

        Fits a ``OneHotEncoder`` on ``self.train_features[self.one_hot_encode_cols]``
        and keeps the fitted encoder on ``self.one_hot_encoder`` so the
        categories learned here can be reused on another split instead of
        being re-fitted.

        Returns:
            A new DataFrame with the same index as ``self.train_features``: the
            encoded indicator columns come first, followed by the columns that
            were not encoded. ``self.train_features`` itself is not modified.
        """
        # handle_unknown="ignore" has no effect on this fit_transform output; it
        # only makes a later transform() encode unseen categories as all zeros
        # instead of raising.
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

        encoded_values = self.one_hot_encoder.fit_transform(
            self.train_features[self.one_hot_encode_cols]
        )

        # Column headers for the encoded array, derived from the source column names.
        encoded_columns = self.one_hot_encoder.get_feature_names_out(self.one_hot_encode_cols)

        # Reuse the original index so the concat below aligns row for row.
        encoded_df = pd.DataFrame(
            encoded_values,
            columns=encoded_columns,
            index=self.train_features.index,
        )
        untouched_df = self.train_features.drop(columns=self.one_hot_encode_cols)

        return pd.concat([encoded_df, untouched_df], axis=1)