# Loading the GUIDE_Train Dataset

In [1]:
import pandas as pd
import numpy as np

def optimize_memory(df):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes.

    Conversions applied per column:
      * ``int64``   -> ``int32``, but only when every value fits in int32;
        otherwise the column is left as ``int64`` so no value is corrupted.
      * ``float64`` -> ``float32`` (reduces precision to single precision).
      * ``object``  -> ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    int32_limits = np.iinfo(np.int32)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type == 'int64':
            column = df[col]
            # A blind cast would overflow large identifiers, so narrow only
            # when the observed range is representable in int32. An empty
            # column has no values to lose, so it is always narrowed.
            fits_int32 = column.empty or (
                column.min() >= int32_limits.min and column.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = column.astype('int32')
        elif col_type == 'float64':
            df[col] = df[col].astype('float32')
        elif col_type == 'object':  # For string-based data
            df[col] = df[col].astype('category')
    return df

# Load the dataset in chunks to avoid memory issues. Each chunk is shrunk
# with optimize_memory() before it is stored, which keeps the peak footprint
# of the accumulated chunks low.
chunks = []
file_path = '../Project Info/Dataset/GUIDE_Train.csv'
for chunk in pd.read_csv(file_path, chunksize=500000, low_memory=False):
    chunks.append(optimize_memory(chunk))

# Concatenate all chunks into a single DataFrame
df = pd.concat(chunks, ignore_index=True)

# Clean up memory by deleting the list of chunks
del chunks

# Categorical columns built per chunk generally have different category sets,
# and pandas falls back to object dtype when concatenating such columns.
# Running the optimisation once more on the combined frame restores the
# compact dtypes that the per-chunk pass was meant to provide.
df = optimize_memory(df)

# Display the first few rows of the optimized DataFrame
df.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,MitreTechniques,IncidentGrade,...,ResourceType,Roles,OSFamily,OSVersion,AntispamDirection,SuspicionLevel,LastVerdict,CountryCode,State,City
0,1786,0,612,123247,2024-06-04T06:05:15.000Z,7,6,InitialAccess,,TruePositive,...,,,5,66,,,,31,6,3
1,1492,88,326,210035,2024-06-14T03:01:25.000Z,58,43,Exfiltration,,FalsePositive,...,,,5,66,,,,242,1445,10630
2,2573,809,58352,712507,2024-06-13T04:52:55.000Z,423,298,InitialAccess,T1189,FalsePositive,...,,,5,66,,Suspicious,Suspicious,242,1445,10630
3,4528,92,32992,774301,2024-06-10T16:39:36.000Z,2,2,CommandAndControl,,BenignPositive,...,,,5,66,,Suspicious,Suspicious,242,1445,10630
4,3722,148,4359,188041,2024-06-15T01:08:07.000Z,9,74,Execution,,TruePositive,...,,,5,66,,,,242,1445,10630


# Handling Missing Values

In [2]:
# Columns whose share of missing values exceeds this fraction are discarded
missing_threshold = 0.5

# Fraction of null entries per column (0.0 = complete, 1.0 = entirely empty)
missing_percentage = df.isna().mean()

# Select the labels of the columns above the threshold and remove them in place
columns_to_drop = missing_percentage.loc[missing_percentage.gt(missing_threshold)].index
df.drop(columns=columns_to_drop, inplace=True)

In [3]:
# Number of missing values remaining in each column
df.isna().sum()

Id                        0
OrgId                     0
IncidentId                0
AlertId                   0
Timestamp                 0
DetectorId                0
AlertTitle                0
Category                  0
IncidentGrade         51340
EntityType                0
EvidenceRole              0
DeviceId                  0
Sha256                    0
IpAddress                 0
Url                       0
AccountSid                0
AccountUpn                0
AccountObjectId           0
AccountName               0
DeviceName                0
NetworkMessageId          0
RegistryKey               0
RegistryValueName         0
RegistryValueData         0
ApplicationId             0
ApplicationName           0
OAuthApplicationId        0
FileName                  0
FolderPath                0
ResourceIdName            0
OSFamily                  0
OSVersion                 0
CountryCode               0
State                     0
City                      0
dtype: int64

In [5]:
# Rows without a label cannot be used for supervised learning, so discard them
df = df.dropna(axis='index', subset=['IncidentGrade'])

In [6]:
# Remove fully duplicated rows, retaining the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

# Changing Datatypes

In [7]:
# Snapshot the current column names, then cast every column to categorical
cols = list(df.columns)
df = df.astype('category')

# `cols` serves as the feature list in the following cells, so the target is excluded
cols.remove('IncidentGrade')

In [8]:
# Parse the raw timestamps; values that cannot be parsed become NaT
timestamps = pd.to_datetime(df['Timestamp'], errors='coerce')

# Expand the parsed timestamp into one column per calendar/clock component
# (Year, Month, Day, Hour, Minute, Second)
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part.capitalize()] = getattr(timestamps.dt, part)

# The original column is fully represented by its components, so remove it
# from both the frame and the feature list
df.drop(columns=['Timestamp'], inplace=True)
cols.remove('Timestamp')

In [10]:
# Show the dtype of every column currently in the frame
df.dtypes

Id                    category
OrgId                 category
IncidentId            category
AlertId               category
DetectorId            category
AlertTitle            category
Category              category
IncidentGrade         category
EntityType            category
EvidenceRole          category
DeviceId              category
Sha256                category
IpAddress             category
Url                   category
AccountSid            category
AccountUpn            category
AccountObjectId       category
AccountName           category
DeviceName            category
NetworkMessageId      category
RegistryKey           category
RegistryValueName     category
RegistryValueData     category
ApplicationId         category
ApplicationName       category
OAuthApplicationId    category
FileName              category
FolderPath            category
ResourceIdName        category
OSFamily              category
OSVersion             category
CountryCode           category
State   

# Feature Engineering

In [11]:
# A feature with a single distinct value carries no information about the target
constant_features = [col for col in cols if df[col].nunique() == 1]    # finding the constant features
print("Constant Features:", constant_features)
df.drop(columns=constant_features, inplace=True)    # dropping the constant features

# Keep the feature list in sync with the frame: later cells iterate over
# `cols` and index `df[col]`, which would raise KeyError for a dropped column.
cols = [col for col in cols if col not in constant_features]

Constant Features: []


In [12]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between each categorical feature and the
# target. A feature is kept when the null hypothesis of independence is
# rejected at the chosen significance level.
significance_level = 0.05   # setting the significance level
categorical_to_keep = []
categorical_to_drop = []
target = 'IncidentGrade'    # target column

for col in cols:
    # Create contingency table (feature values x target classes)
    contingency_table = pd.crosstab(df[col], df[target])
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)     # performing the chi2 test

    # Keep or drop feature based on p-value
    if p_value < significance_level:
        categorical_to_keep.append(col)
    else:
        categorical_to_drop.append(col)

print(f"Categorical Features Kept: {categorical_to_keep}")
print(f"Categorical Features Dropped: {categorical_to_drop}")

# Apply the selection. Previously the rejected features were only reported,
# so they remained in both the frame and the feature list and were still
# encoded downstream.
df.drop(columns=categorical_to_drop, inplace=True)
cols = list(categorical_to_keep)

Categorical Features Kept: ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle', 'Category', 'EntityType', 'EvidenceRole', 'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId', 'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId', 'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath', 'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State', 'City']
Categorical Features Dropped: []


# Encoding Categorical Variables

One-hot encoding of the most frequent values of each feature (all remaining values are grouped under "Others")

In [13]:
# Features for which only the single most frequent value gets its own
# indicator column; every other feature keeps its three most frequent values.
cols_for_top_1_values = ['EvidenceRole','DeviceId','Sha256','IpAddress','AccountSid','AccountUpn','AccountObjectId','AccountName','DeviceName','NetworkMessageId','RegistryKey','RegistryValueName','RegistryValueData','OAuthApplicationId','FileName','ResourceIdName','OSVersion']

# Retained values per feature (pickled in a later cell)
top_1_value_dict = {}
top_3_values_dict = {}

# One-hot frames, collected separately for the top-1 and the top-3 group
encoded_dataframes1 = []
encoded_dataframes2 = []

for col in cols:
    # Choose how many values to keep and where to record the results
    if col in cols_for_top_1_values:
        keep_n, value_store, frame_store = 1, top_1_value_dict, encoded_dataframes1
    else:
        keep_n, value_store, frame_store = 3, top_3_values_dict, encoded_dataframes2

    # Most frequent values of this feature
    frequent_values = df[col].value_counts().nlargest(keep_n).index.tolist()
    value_store[col] = frequent_values

    # Collapse every other value into a single "Others" bucket, then one-hot encode
    bucketed = df[col].apply(lambda value: value if value in frequent_values else "Others")
    frame_store.append(pd.get_dummies(bucketed, prefix=col))

# Append the indicator columns (top-1 group first, then top-3 group) to the
# original frame and remove the raw feature columns they replace
df_encoded = pd.concat([df, *encoded_dataframes1, *encoded_dataframes2], axis=1)
df_encoded.drop(columns=cols, inplace=True)

In [14]:
import pickle

# Persist the retained-value dictionaries, one pickle file per dictionary
for pickle_path, top_values in (
    ('../Resources/top_1_value_dict.pkl', top_1_value_dict),
    ('../Resources/top_3_values_dict.pkl', top_3_values_dict),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(top_values, handle)


Custom mapping for Target class

In [15]:
df_encoded['IncidentGrade'].value_counts()    # number of rows per target label (class balance check)

IncidentGrade
BenignPositive    3827800
TruePositive      3140126
FalsePositive     1954879
Name: count, dtype: int64

In [16]:
# Integer code assigned to each IncidentGrade label
custom_mapping = {
    'BenignPositive': 0,
    'TruePositive': 1,
    'FalsePositive': 2
}

# Series.map() silently turns every label missing from the mapping into NaN.
# That also happens if this cell is executed a second time, because the
# column then already holds 0/1/2. Fail loudly instead of corrupting the target.
unmapped_labels = set(df_encoded[target].dropna().unique()) - set(custom_mapping)
if unmapped_labels:
    raise ValueError(
        f"Unmapped {target} labels: {sorted(unmapped_labels, key=str)} "
        "(was this cell already executed?)"
    )

df_encoded[target] = df_encoded[target].map(custom_mapping)

In [21]:
# Write the encoded frame to disk without the row index
cleaned_data_path = '../Resources/cleaned_data.csv'
df_encoded.to_csv(cleaned_data_path, index=False)