# Handling Missing Values

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the GUIDE training split from the sibling Dataset directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='../Dataset/GUIDE_Train.csv')

In [2]:
# Report how many null entries each column contains
print("Missing Values:\n", df.isna().sum())

Missing Values:
 Id                          0
OrgId                       0
IncidentId                  0
AlertId                     0
Timestamp                   0
DetectorId                  0
AlertTitle                  0
Category                    0
MitreTechniques       5468386
IncidentGrade           51340
ActionGrouped         9460773
ActionGranular        9460773
EntityType                  0
EvidenceRole                0
DeviceId                    0
Sha256                      0
IpAddress                   0
Url                         0
AccountSid                  0
AccountUpn                  0
AccountObjectId             0
AccountName                 0
DeviceName                  0
NetworkMessageId            0
EmailClusterId        9420025
RegistryKey                 0
RegistryValueName           0
RegistryValueData           0
ApplicationId               0
ApplicationName             0
OAuthApplicationId          0
ThreatFamily          9441956
FileName               

In [3]:
# Share of null entries above which a column is discarded
missing_threshold = 0.5

# 1. Compute the fraction of nulls per column, collect the labels that exceed
#    the cutoff, and drop those columns from the frame in place
missing_percentage = df.isna().mean()
columns_to_drop = missing_percentage.loc[missing_percentage.gt(missing_threshold)].index
df.drop(columns=columns_to_drop, inplace=True)

In [4]:
# Re-count the nulls per column after the sparse columns were dropped
df.isna().sum()

Id                        0
OrgId                     0
IncidentId                0
AlertId                   0
Timestamp                 0
DetectorId                0
AlertTitle                0
Category                  0
IncidentGrade         51340
EntityType                0
EvidenceRole              0
DeviceId                  0
Sha256                    0
IpAddress                 0
Url                       0
AccountSid                0
AccountUpn                0
AccountObjectId           0
AccountName               0
DeviceName                0
NetworkMessageId          0
RegistryKey               0
RegistryValueName         0
RegistryValueData         0
ApplicationId             0
ApplicationName           0
OAuthApplicationId        0
FileName                  0
FolderPath                0
ResourceIdName            0
OSFamily                  0
OSVersion                 0
CountryCode               0
State                     0
City                      0
dtype: int64

In [None]:
# Fill missing values of IncidentGrade with the mode (most frequent value) of the column.
# The filled Series is assigned back to the column: calling fillna(inplace=True)
# on the df['IncidentGrade'] selection is chained assignment, which pandas warns
# about and which leaves df unchanged when Copy-on-Write is active.
# NOTE(review): IncidentGrade is used as the target further down, so this imputes
# labels - confirm that is intended rather than dropping the unlabeled rows.
most_common_grade = df['IncidentGrade'].mode()[0]
df['IncidentGrade'] = df['IncidentGrade'].fillna(most_common_grade)

In [None]:
# Discard rows that repeat an earlier row in every column, keeping the first occurrence
df.drop_duplicates(keep='first', inplace=True)

# Removing Outliers

In [20]:
# Split the column labels by dtype: numeric columns on one side, object
# (string-like) columns on the other
numerical_columns = list(df.select_dtypes(include=['number']).columns)
categorical_columns = list(df.select_dtypes(include=['object']).columns)

# The target is not a feature, so take it out of the categorical list
categorical_columns.remove('IncidentGrade')

In [23]:
from scipy.stats import zscore    # importing the zscore function
import numpy as np                # importing the numpy library 

# function to remove outliers
def remove_outliers(df, threshold=3):
    """Return a copy of ``df`` without the rows that hold a numerical outlier.

    A row is discarded when, in any scored numerical column, its absolute
    Z-score (distance from the column mean in units of the column's standard
    deviation) is not below ``threshold``.

    Columns whose standard deviation is zero or undefined (a constant column,
    or a frame with a single row) are left out of the check: dividing by such
    a deviation yields NaN, and NaN fails the ``< threshold`` comparison, which
    would otherwise discard every row of the frame.

    Rows with a missing value in a scored column are still discarded, because
    their Z-score is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 3
        Exclusive upper bound on the absolute Z-score of retained rows.

    Returns
    -------
    pandas.DataFrame
        Copy of the retained rows, with all original columns.
    """
    numerical_df = df.select_dtypes(include=[np.number])    # Select only numerical columns

    # Only columns with a strictly positive deviation can be scored;
    # ``spread > 0`` is also False for NaN deviations.
    spread = numerical_df.std()
    scorable = spread[spread > 0].index

    centered = numerical_df[scorable] - numerical_df[scorable].mean()
    z_scores = (centered / spread[scorable]).abs()    # absolute Z-score per cell

    # Keep rows whose Z-score is below the threshold in every scored column
    df_clean = df[(z_scores < threshold).all(axis=1)].copy()
    return df_clean

# Show the frame's dimensions ahead of the filtering step
print("Before removing outliers:", df.shape)

# Replace df with the rows that pass the Z-score filter at a threshold of 3
df = remove_outliers(df, threshold=3)

# Show the dimensions again so the number of removed rows is visible
print("After removing outliers:", df.shape)

Before removing outliers: (8970539, 35)
After removing outliers: (4910238, 35)


# Feature Engineering

In [19]:
from sklearn.feature_selection import f_classif, chi2

target = 'IncidentGrade'    # target column

# Perform ANOVA test (f_classif): one F-statistic and p-value per numerical feature
anova_scores, p_values = f_classif(df[numerical_columns], df[target])

# Set significance threshold for feature selection
significance_level = 0.05

# A feature is kept only when its p-value is strictly below the threshold.
# The drop list is the exact complement of that test, so a feature whose
# p-value is NaN (which compares False against any threshold) is reported as
# dropped instead of disappearing from both lists.
numerical_to_keep = [num for num, p in zip(numerical_columns, p_values) if p < significance_level]
numerical_to_drop = [num for num, p in zip(numerical_columns, p_values) if not p < significance_level]

print(f"Numerical Features Kept: {numerical_to_keep}")
print(f"Numerical Features Dropped: {numerical_to_drop}")

Numerical Features Kept: ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle', 'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId', 'RegistryKey', 'RegistryValueName', 'RegistryValueData', 'ApplicationId', 'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath', 'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State', 'City']
Numerical Features Dropped: []


In [None]:
from scipy.stats import f_oneway, chi2_contingency

target = 'IncidentGrade'    # target column

# ANOVA Test for numerical columns.
# The per-class row masks do not depend on the feature being tested, so they
# are built once here instead of re-filtering the whole frame for every
# (column, class) pair inside the loop.
# NOTE(review): the next cell repeats this same loop; one of the two can be removed.
class_masks = [df[target] == cat for cat in df[target].unique()]

significant_numerical = []
for num_col in numerical_columns:
    # One sample per target class, selecting only the column under test
    groups = [df.loc[mask, num_col] for mask in class_masks]
    f_stat, p_val = f_oneway(*groups)
    if p_val < 0.05:  # Significant relationship
        significant_numerical.append(num_col)

In [None]:
from scipy.stats import f_oneway, chi2_contingency

# ANOVA Test for numerical columns.
# The per-class row masks do not depend on the feature being tested, so they
# are built once here instead of re-filtering the whole frame for every
# (column, class) pair inside the loop.
class_masks = [df['IncidentGrade'] == cat for cat in df['IncidentGrade'].unique()]

significant_numerical = []
for num_col in numerical_columns:
    # One sample per target class, selecting only the column under test
    groups = [df.loc[mask, num_col] for mask in class_masks]
    f_stat, p_val = f_oneway(*groups)
    if p_val < 0.05:  # Significant relationship
        significant_numerical.append(num_col)

# Chi-Square Test for categorical columns
# NOTE(review): every object-dtype column is cross-tabulated against the target;
# a column with very many distinct values would produce a very large
# contingency table - confirm the columns in categorical_columns are suitable.
significant_categorical = []
for cat_col in categorical_columns:
    contingency_table = pd.crosstab(df[cat_col], df['IncidentGrade'])
    # Unpack into distinct names: binding the statistic to `chi2` would overwrite
    # the `chi2` function imported from sklearn.feature_selection earlier in the
    # notebook, and binding to `_` would rebind IPython's last-output variable.
    chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
    if p_val < 0.05:  # Significant relationship
        significant_categorical.append(cat_col)

In [None]:
# Display the frame restricted to the significant features plus the target column
df[[*significant_numerical, *significant_categorical, 'IncidentGrade']]

# Encoding Categorical Variables