# Handling Missing Values

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the GUIDE training CSV (path is relative to the working directory) into `df`
df = pd.read_csv(filepath_or_buffer='../Dataset/GUIDE_Train.csv')

In [2]:
# Count the null entries of every column and print the per-column totals
print("Missing Values:\n", df.isna().sum())

Missing Values:
 Id                          0
OrgId                       0
IncidentId                  0
AlertId                     0
Timestamp                   0
DetectorId                  0
AlertTitle                  0
Category                    0
MitreTechniques       5468386
IncidentGrade           51340
ActionGrouped         9460773
ActionGranular        9460773
EntityType                  0
EvidenceRole                0
DeviceId                    0
Sha256                      0
IpAddress                   0
Url                         0
AccountSid                  0
AccountUpn                  0
AccountObjectId             0
AccountName                 0
DeviceName                  0
NetworkMessageId            0
EmailClusterId        9420025
RegistryKey                 0
RegistryValueName           0
RegistryValueData           0
ApplicationId               0
ApplicationName             0
OAuthApplicationId          0
ThreatFamily          9441956
FileName               

In [3]:
# Columns whose fraction of null entries exceeds this value are discarded
missing_threshold = 0.5

# 1. Remove columns with more than 50% of missing values
# The mean of the boolean null mask is the fraction (0..1) of nulls per column
missing_percentage = df.isna().mean()
columns_to_drop = missing_percentage.loc[missing_percentage.gt(missing_threshold)].index
df.drop(columns_to_drop, axis='columns', inplace=True)

In [4]:
# Display the per-column null counts of the current frame
df.isna().sum()

Id                        0
OrgId                     0
IncidentId                0
AlertId                   0
Timestamp                 0
DetectorId                0
AlertTitle                0
Category                  0
IncidentGrade         51340
EntityType                0
EvidenceRole              0
DeviceId                  0
Sha256                    0
IpAddress                 0
Url                       0
AccountSid                0
AccountUpn                0
AccountObjectId           0
AccountName               0
DeviceName                0
NetworkMessageId          0
RegistryKey               0
RegistryValueName         0
RegistryValueData         0
ApplicationId             0
ApplicationName           0
OAuthApplicationId        0
FileName                  0
FolderPath                0
ResourceIdName            0
OSFamily                  0
OSVersion                 0
CountryCode               0
State                     0
City                      0
dtype: int64

In [None]:
# Fill missing IncidentGrade values with the column's most frequent value (mode).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['IncidentGrade'] selection: that chained
# in-place form raises a FutureWarning on pandas 2.x and, with Copy-on-Write
# enabled (the default from pandas 3.0), updates a temporary object and
# leaves `df` unchanged.
df['IncidentGrade'] = df['IncidentGrade'].fillna(df['IncidentGrade'].mode()[0])

In [None]:
# Drop fully duplicated rows in place, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

# Removing Outliers

In [23]:
from scipy.stats import zscore    # importing the zscore function
import numpy as np                # importing the numpy library 

# function to remove outliers
def remove_outliers(df, threshold=3):
    """Return a copy of ``df`` without rows that contain a numeric outlier.

    A row is kept only when the absolute z-score of every scored numeric value
    in it is strictly below ``threshold``. Z-scores are computed per column as
    ``(value - column mean) / column standard deviation``.

    Numeric columns whose standard deviation is zero or undefined (for example
    a constant column) are left out of the check. Their z-scores would be NaN,
    and because ``NaN < threshold`` is False they would otherwise cause every
    row of the frame to be discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. Non-numeric columns are ignored when
        scoring but are retained in the result.
    threshold : float, default 3
        Exclusive upper bound on the absolute z-score of a kept row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows with their original index
        labels and all original columns.
    """
    numerical_df = df.select_dtypes(include=[np.number])    # Select only numerical columns

    # Only columns with a positive, non-NaN spread can yield meaningful z-scores
    column_std = numerical_df.std()
    scorable = column_std[column_std > 0].index
    scored = numerical_df[scorable]
    z_scores = ((scored - scored.mean()) / column_std[scorable]).abs()

    # Keep rows whose z-score is below the threshold in every scored column.
    # A missing value in a scored column still fails the comparison, so such
    # rows are dropped exactly as before.
    df_clean = df[(z_scores < threshold).all(axis=1)].copy()
    return df_clean     # returning the dataframe

# Report the (rows, columns) shape of the frame before filtering
print("Before removing outliers:", df.shape)

# Rebind df to the filtered copy, spelling out the default cut-off of 3
df = remove_outliers(df, threshold=3)

# Report the shape again so the effect of the filter is visible
print("After removing outliers:", df.shape)

Before removing outliers: (8970539, 35)
After removing outliers: (4910238, 35)


# Feature Engineering

In [29]:
# Collect the names of object-dtype (categorical) and numeric columns as plain lists
categorical_columns = list(df.select_dtypes(include='object').columns)
numerical_columns = list(df.select_dtypes(include='number').columns)
# IncidentGrade is taken out of the categorical feature list
categorical_columns.remove('IncidentGrade')

In [None]:
# Numeric columns holding exactly one distinct non-null value
constant_features = list(filter(lambda col: df[col].nunique() == 1, numerical_columns))
print("Constant Features:", constant_features)
# Remove them in place, then refresh the list of numeric column names
df.drop(constant_features, axis='columns', inplace=True)
numerical_columns = list(df.select_dtypes(include=[np.number]).columns)

Constant Features: ['RegistryValueName', 'RegistryValueData']


In [None]:
from sklearn.feature_selection import f_classif, chi2

# Name of the label column used for the statistical tests
target = 'IncidentGrade'
# ANOVA F-test of every numeric column against the target
anova_scores, p_values = f_classif(df[numerical_columns], df[target])

# Features with p-value below this level are kept; those at or above it are dropped
significance_level = 0.05
# Pair each p-value with its column name so both lists come from boolean masks
p_by_feature = pd.Series(p_values, index=numerical_columns)
numerical_to_keep = p_by_feature.index[p_by_feature < significance_level].tolist()
numerical_to_drop = p_by_feature.index[p_by_feature >= significance_level].tolist()

print(f"Numerical Features Kept: {numerical_to_keep}")
print(f"Numerical Features Dropped: {numerical_to_drop}")

Numerical Features Kept: ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle', 'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId', 'ApplicationId', 'ApplicationName', 'FileName', 'FolderPath', 'ResourceIdName', 'CountryCode', 'State', 'City']
Numerical Features Dropped: ['RegistryKey', 'OAuthApplicationId', 'OSFamily', 'OSVersion']


In [31]:
# Remove, in place, the numeric columns listed in numerical_to_drop
df.drop(numerical_to_drop, axis='columns', inplace=True)

In [33]:
from scipy.stats import chi2_contingency

# p-value cut-off: below it a feature is kept, otherwise it is dropped
significance_level = 0.05
categorical_to_keep, categorical_to_drop = [], []

for col in categorical_columns:
    # Cross-tabulate the feature's values against the target classes
    contingency_table = pd.crosstab(index=df[col], columns=df[target])

    # Chi-square test on the contingency table
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

    # Route the column name to the matching list based on its p-value
    bucket = categorical_to_keep if p_value < significance_level else categorical_to_drop
    bucket.append(col)

print(f"Categorical Features Kept: {categorical_to_keep}")
print(f"Categorical Features Dropped: {categorical_to_drop}")

Categorical Features Kept: ['Timestamp', 'Category', 'EntityType', 'EvidenceRole']
Categorical Features Dropped: []


# Encoding Categorical Variables

In [34]:
# Number of distinct non-null values in each categorical column
df.loc[:, categorical_columns].nunique()

Timestamp       665569
Category            18
EntityType          26
EvidenceRole         2
dtype: int64

In [37]:
# Parse the Timestamp column into pandas datetime values
df['Timestamp'] = df['Timestamp'].pipe(pd.to_datetime)

In [39]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label Encoding
# Fit a dedicated LabelEncoder per column and keep each one in `label_encoders`,
# so every fitted class mapping stays available afterwards (for example for
# inverse_transform, or to encode another data split the same way).
# Re-fitting one shared encoder would retain only the classes of the column
# fitted last.
label_encoders = {}
for column in ['Timestamp', 'EvidenceRole', 'IncidentGrade']:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# `le` keeps referring to the encoder fitted on IncidentGrade, which is the
# state the shared encoder was left in previously.
le = label_encoders['IncidentGrade']

# One-Hot Encoding
# drop_first=True omits the first level of each column's indicator set
df = pd.get_dummies(df, columns=['Category', 'EntityType'], drop_first=True)

In [41]:
# Preview the first five rows of the encoded frame
df.head(n=5)

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,IncidentGrade,EvidenceRole,DeviceId,...,EntityType_Mailbox,EntityType_MailboxConfiguration,EntityType_Malware,EntityType_Nic,EntityType_OAuthApplication,EntityType_Process,EntityType_RegistryKey,EntityType_SecurityGroup,EntityType_Url,EntityType_User
1,455266534868,88,326,210035,599729,58,43,1,0,98799,...,False,False,False,False,False,False,False,False,False,True
2,1056561957389,809,58352,712507,553181,423,298,1,1,98799,...,False,False,False,False,False,False,False,False,True,False
4,214748368522,148,4359,188041,631702,9,74,2,0,98799,...,False,False,False,False,False,False,False,False,False,True
9,1073741827836,72,70,831157,300445,4,3,2,0,98799,...,False,False,False,False,False,False,False,False,False,True
13,223338299440,6,2472,1148,3785,17,284,1,1,98799,...,False,False,False,False,False,False,False,False,False,False
