# Load The Test Dataset 

In [None]:
import pickle
import pandas as pd

# Read the GUIDE test CSV into the working frame that every later cell operates on.
df = pd.read_csv(filepath_or_buffer='../Dataset/GUIDE_Test.csv')

# Data Preprocessing

In [2]:
missing_threshold = 0.5  # largest tolerated share of missing values per column

# 1. Discard every column whose share of missing values is above the threshold
missing_percentage = df.isna().mean()    # per-column fraction of nulls
columns_to_drop = missing_percentage.loc[missing_percentage.gt(missing_threshold)].index
df.drop(labels=columns_to_drop, axis=1, inplace=True)

In [3]:
# Number of missing values left in each column after the threshold filter
df.isna().sum()

Id                    0
OrgId                 0
IncidentId            0
AlertId               0
Timestamp             0
DetectorId            0
AlertTitle            0
Category              0
IncidentGrade         0
EntityType            0
EvidenceRole          0
DeviceId              0
Sha256                0
IpAddress             0
Url                   0
AccountSid            0
AccountUpn            0
AccountObjectId       0
AccountName           0
DeviceName            0
NetworkMessageId      0
RegistryKey           0
RegistryValueName     0
RegistryValueData     0
ApplicationId         0
ApplicationName       0
OAuthApplicationId    0
FileName              0
FolderPath            0
ResourceIdName        0
OSFamily              0
OSVersion             0
CountryCode           0
State                 0
City                  0
Usage                 0
dtype: int64

In [4]:
# Discard exact duplicate rows in place, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

In [5]:
from scipy.stats import zscore    # importing the zscore function
import numpy as np                # importing the numpy library 

# function to remove outliers
def remove_outliers(df, threshold=3):
    """Return a copy of ``df`` without the rows that contain a numeric outlier.

    A value counts as an outlier when its absolute Z-score (distance from the
    column mean divided by the column's sample standard deviation) is not
    strictly below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 3
        Absolute Z-score at or above which a value is treated as an outlier.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` restricted to rows whose numeric values are all within
        the threshold. Non-numeric columns are carried along untouched.

    Notes
    -----
    * Columns with no spread (standard deviation 0, or NaN for a one-row
      frame) are left out of the test. Their Z-scores would be NaN, and
      ``NaN < threshold`` is False, which previously discarded every row.
    * A row with a missing value in a varying numeric column is still
      dropped, because its Z-score is NaN.
    """
    numerical_df = df.select_dtypes(include=[np.number])    # Select only numerical columns
    spread = numerical_df.std()

    # Only columns with a positive standard deviation can have a meaningful Z-score
    has_spread = (spread > 0).to_numpy()
    if not has_spread.any():
        # Nothing to test against: no row can be an outlier
        return df.copy()

    varying_df = numerical_df.loc[:, has_spread]
    z_scores = ((varying_df - varying_df.mean()) / spread[has_spread]).abs()

    # Filter out rows with Z-scores at or above the threshold in any column
    df_clean = df[(z_scores < threshold).all(axis=1)].copy()
    return df_clean

# Report the frame size on either side of the Z-score filter
print("Before removing outliers:", df.shape)
df = remove_outliers(df, threshold=3)    # 3 is the function's default cut-off
print("After removing outliers:", df.shape)

Before removing outliers: (3922695, 36)
After removing outliers: (2096390, 36)


In [6]:
# Numeric columns that hold a single distinct value are listed, reported and dropped
numerical_columns = list(df.select_dtypes(include=['number']).columns)
constant_features = [
    name
    for name, distinct in df[numerical_columns].nunique().items()
    if distinct == 1
]
print("Constant Features:", constant_features)
df.drop(labels=constant_features, axis=1, inplace=True)

Constant Features: ['RegistryValueName', 'RegistryValueData', 'OSFamily', 'OSVersion']


In [7]:
# Remove three further columns from the frame in place
df.drop(columns=['RegistryKey', 'OAuthApplicationId', 'Usage'], inplace=True)

In [8]:
# Parse the Timestamp column into pandas datetimes so the .dt accessor can be used on it
df['Timestamp'] = df['Timestamp'].pipe(pd.to_datetime)

In [9]:
# Break the parsed timestamp into one integer column per calendar/clock component
timestamp_parts = df['Timestamp'].dt
df['Year'] = timestamp_parts.year
df['Month'] = timestamp_parts.month
df['Day'] = timestamp_parts.day
df['Hour'] = timestamp_parts.hour
df['Minute'] = timestamp_parts.minute
df['Second'] = timestamp_parts.second

In [10]:
# The raw Timestamp column is dropped once its components have been extracted
df.drop('Timestamp', axis=1, inplace=True)

# Encoding Categorical Variables

In [11]:
# Number of distinct values in each remaining object-typed column
df.select_dtypes(include='object').nunique()

Category         18
IncidentGrade     3
EntityType       25
EvidenceRole      2
dtype: int64

In [12]:
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

# Label Encoding: replace the EvidenceRole labels with integer codes.
# NOTE(review): the encoder is fitted on this frame rather than loaded from
# training - confirm the resulting codes match the ones the model was trained on.
le = LabelEncoder()
le.fit(df['EvidenceRole'])
df['EvidenceRole'] = le.transform(df['EvidenceRole'])

# Ordinal Encoding: fixed integer code per IncidentGrade label
# (labels missing from the mapping become NaN)
custom_mapping = {'FalsePositive': 0, 'BenignPositive': 1, 'TruePositive': 2}
df['IncidentGrade'] = df['IncidentGrade'].map(custom_mapping)

# One-Hot Encoding
# NOTE(review): the most frequent values are computed from this frame - confirm
# the resulting dummy columns (names and order) match what the model was trained on.
# Step 1: Identify the 5 most frequent values of each column
top_5_categories = df['Category'].value_counts().nlargest(5).index
top_5_entities = df['EntityType'].value_counts().nlargest(5).index

# Step 2: Create a new column grouping all other categories as 'Other'
# (vectorised isin/where gives the same result as a per-row lambda, without
# running Python code for every row)
df['Category_Top'] = df['Category'].where(df['Category'].isin(top_5_categories), 'Other')
df['EntityType_Top'] = df['EntityType'].where(df['EntityType'].isin(top_5_entities), 'Other')

# Step 3: Perform one-hot encoding on the modified columns
category_encoded = pd.get_dummies(df['Category_Top'], prefix='Category')
df = pd.concat([df, category_encoded], axis=1)
entity_encoded = pd.get_dummies(df['EntityType_Top'], prefix='EntityType')
df = pd.concat([df, entity_encoded], axis=1)

# Step 4: Drop the original and the grouped helper columns now that they are encoded
df.drop(['Category', 'Category_Top'], axis=1, inplace=True)
df.drop(['EntityType', 'EntityType_Top'], axis=1, inplace=True)


# Scaling the numerical values

In [13]:
# Import the scaler class
from sklearn.preprocessing import StandardScaler

# Load the StandardScaler object.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is loaded.
with open('../NoteBooks/scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Define numerical features (the columns passed to scaler.transform, in this order)
numerical_features = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle', 'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId', 'ApplicationId', 'ApplicationName', 'FileName', 'FolderPath', 'ResourceIdName', 'CountryCode', 'State', 'City', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']

df1 = df.copy()    # work on a copy so the unscaled frame stays available

df1[numerical_features] = scaler.transform(df1[numerical_features])    # scaling the numerical features

# Load the model and test on the GUIDE_TEST dataset

In [16]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report

# Load the model from the file.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is loaded.
with open('../NoteBooks/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

x = df1.drop(columns=['IncidentGrade'])    # defining the features
y_true = df1['IncidentGrade']    # defining the target variable

# Predict the target variable
y_pred = model.predict(x)

# Evaluate performance (F1, precision and recall are macro-averaged over the classes)
print("Model Performance on Test Set:")
print(f"Macro-F1 Score: {f1_score(y_true, y_pred, average='macro'):.4f}")
print(f"Precision: {precision_score(y_true, y_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y_true, y_pred, average='macro'):.4f}")
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_true, y_pred))


Model Performance on Test Set:
Macro-F1 Score: 0.8293
Precision: 0.8319
Recall: 0.8335
Accuracy: 0.8368
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.80      0.79    437337
           1       0.77      0.89      0.82    770348
           2       0.95      0.81      0.88    888705

    accuracy                           0.84   2096390
   macro avg       0.83      0.83      0.83   2096390
weighted avg       0.85      0.84      0.84   2096390

