# Load The Test Dataset 

In [None]:
import pandas as pd
import numpy as np

# Read the GUIDE test split into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv('../Project Info/Dataset/GUIDE_Test.csv')

In [2]:
# Preview the first rows to sanity-check that the file was parsed as expected.
df.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,MitreTechniques,IncidentGrade,...,Roles,OSFamily,OSVersion,AntispamDirection,SuspicionLevel,LastVerdict,CountryCode,State,City,Usage
0,1245540519230,657,11767,87199,2024-06-04T22:56:27.000Z,524,563,LateralMovement,T1021;T1047;T1105;T1569.002,BenignPositive,...,,5,66,,Suspicious,Suspicious,242,1445,10630,Private
1,1400159342154,3,91158,632273,2024-06-03T12:58:26.000Z,2,2,CommandAndControl,,BenignPositive,...,,0,0,,Suspicious,Suspicious,242,1445,10630,Public
2,1279900255923,145,32247,131719,2024-06-08T03:20:49.000Z,2932,10807,LateralMovement,T1021;T1027.002;T1027.005;T1105,BenignPositive,...,,5,66,,Suspicious,Suspicious,242,1445,10630,Public
3,60129547292,222,15294,917686,2024-06-12T12:07:31.000Z,0,0,InitialAccess,T1078;T1078.004,FalsePositive,...,,5,66,,,,242,1445,10630,Public
4,515396080539,363,7615,5944,2024-06-06T17:42:05.000Z,27,18,Discovery,T1087;T1087.002,BenignPositive,...,Suspicious,5,66,,,,242,1445,10630,Public


# Data Preprocessing

### Handling Missing Values

In [3]:
# Maximum tolerated fraction of missing values per column.
missing_threshold = 0.5

# Fraction of missing entries in each column (mean of the boolean null mask).
missing_percentage = df.isna().mean()

# Drop, in place, every column whose missing fraction exceeds the threshold.
exceeds_threshold = missing_percentage.gt(missing_threshold)
columns_to_drop = missing_percentage.index[exceeds_threshold.to_numpy()]
df.drop(columns_to_drop, axis='columns', inplace=True)

In [4]:
# Count the missing values in each column of the current frame.
df.isnull().sum()

Id                    0
OrgId                 0
IncidentId            0
AlertId               0
Timestamp             0
DetectorId            0
AlertTitle            0
Category              0
IncidentGrade         0
EntityType            0
EvidenceRole          0
DeviceId              0
Sha256                0
IpAddress             0
Url                   0
AccountSid            0
AccountUpn            0
AccountObjectId       0
AccountName           0
DeviceName            0
NetworkMessageId      0
RegistryKey           0
RegistryValueName     0
RegistryValueData     0
ApplicationId         0
ApplicationName       0
OAuthApplicationId    0
FileName              0
FolderPath            0
ResourceIdName        0
OSFamily              0
OSVersion             0
CountryCode           0
State                 0
City                  0
Usage                 0
dtype: int64

In [5]:
# Remove fully duplicated rows in place (all columns must match; the first occurrence is kept).
# Note: the index is not reset, so row labels may have gaps afterwards.
df.drop_duplicates(inplace=True)

### Feature Engineering

In [6]:
# Drop the 'Usage' and 'Url' columns in place.
df.drop(['Usage', 'Url'], axis='columns', inplace=True)

In [7]:
# Candidate feature columns: everything except the target.
cols = list(df.columns)
cols.remove('IncidentGrade')

# A feature is "constant" when it holds exactly one distinct non-null value.
distinct_counts = df[cols].nunique()
constant_features = distinct_counts[distinct_counts == 1].index.tolist()
print("Constant Features:", constant_features)

# Constant columns carry no information for the model, so remove them in place.
df.drop(constant_features, axis='columns', inplace=True)

Constant Features: []


In [8]:
cols = df.columns.tolist()

# Parse the timestamp straight from the raw column. Casting it to `category`
# first (as every other column is) would only build a large categorical that
# is thrown away as soon as the column is parsed and dropped.
timestamp = pd.to_datetime(df['Timestamp'])

# Every other column, including the target, is treated as categorical.
categorical_cols = [col for col in cols if col != 'Timestamp']
df[categorical_cols] = df[categorical_cols].astype('category')

# Expand the timestamp into its calendar/clock components; the new columns are
# appended in this order: Year, Month, Day, Hour, Minute, Second.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part.capitalize()] = getattr(timestamp.dt, part)

# The raw timestamp is no longer needed once its components are extracted.
df.drop(columns=['Timestamp'], inplace=True)

# Keep `cols` as the list of original feature columns (no target, no timestamp).
cols.remove('IncidentGrade')
cols.remove('Timestamp')

In [9]:
# Inspect the dtype of every column in the current frame.
df.dtypes

Id                    category
OrgId                 category
IncidentId            category
AlertId               category
DetectorId            category
AlertTitle            category
Category              category
IncidentGrade         category
EntityType            category
EvidenceRole          category
DeviceId              category
Sha256                category
IpAddress             category
AccountSid            category
AccountUpn            category
AccountObjectId       category
AccountName           category
DeviceName            category
NetworkMessageId      category
RegistryKey           category
RegistryValueName     category
RegistryValueData     category
ApplicationId         category
ApplicationName       category
OAuthApplicationId    category
FileName              category
FolderPath            category
ResourceIdName        category
OSFamily              category
OSVersion             category
CountryCode           category
State                 category
City    

In [10]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(3922695, 39)

### Encoding Categorical Variables


One-hot Encoding for all classes

In [11]:
import pickle

def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE(review): pickle can execute arbitrary code on load; only use files
    # from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-column top values stored as pickles under ../Resources.
top_1_value_dict = _read_pickle('../Resources/top_1_value_dict.pkl')
top_3_values_dict = _read_pickle('../Resources/top_3_values_dict.pkl')

# Column names covered by each dictionary.
top_1_values = [*top_1_value_dict]
top_3_values = [*top_3_values_dict]

In [12]:
def _keep_observed_values(value_dict, frame):
    """Restrict each column's top values to those that occur in `frame`.

    Parameters
    ----------
    value_dict : dict
        Maps a column name to an iterable of its top values.
    frame : pandas.DataFrame
        Dataset whose columns are checked for the presence of each value.

    Returns
    -------
    dict
        Column name -> list of top values found in `frame[column]`, in their
        original order. Columns missing from `frame`, or with no top value
        present, are omitted.
    """
    kept = {}
    for col, top_values in value_dict.items():
        if col not in frame.columns:  # Ensure the column exists in the dataset
            continue
        # Compute the distinct values once per column rather than once per
        # candidate value; the frame has millions of rows.
        observed = frame[col].unique()
        present = [val for val in top_values if val in observed]
        if present:  # Only keep columns with at least one valid top value
            kept[col] = present
    return kept


# Filter top unique values based on their presence in the test dataset
filtered_top_1_value_dict = _keep_observed_values(top_1_value_dict, df)
filtered_top_3_values_dict = _keep_observed_values(top_3_values_dict, df)

# Output the filtered dictionaries
print("Filtered top_1_value_dict:", filtered_top_1_value_dict)
print("Filtered top_3_values_dict:", filtered_top_3_values_dict)

Filtered top_1_value_dict: {'EvidenceRole': ['Related'], 'DeviceId': [98799], 'Sha256': [138268], 'IpAddress': [360606], 'AccountSid': [441377], 'AccountUpn': [673934], 'AccountObjectId': [425863], 'AccountName': [453297], 'DeviceName': [153085], 'NetworkMessageId': [529644], 'RegistryKey': [1631], 'RegistryValueName': [635], 'RegistryValueData': [860], 'OAuthApplicationId': [881], 'FileName': [289573], 'ResourceIdName': [3586], 'OSVersion': [66]}
Filtered top_3_values_dict: {'Id': [1, 1783], 'OrgId': [0, 2, 1], 'IncidentId': [9], 'AlertId': [0, 2], 'DetectorId': [0, 2, 3], 'AlertTitle': [0, 2, 1], 'Category': ['InitialAccess', 'Exfiltration', 'SuspiciousActivity'], 'EntityType': ['Ip', 'User', 'MailMessage'], 'ApplicationId': [2251, 0, 1], 'ApplicationName': [3421, 0, 1], 'FolderPath': [117668, 0, 1], 'OSFamily': [5, 0, 1], 'CountryCode': [242, 0, 1], 'State': [1445, 0, 1], 'City': [10630, 0, 1]}


In [13]:
# Remove these three columns from both the encoding plan and the frame itself.
# NOTE(review): presumably excluded because they are row/incident identifiers — confirm.
id_like_columns = ['Id', 'IncidentId', 'AlertId']
for id_column in id_like_columns:
    del filtered_top_3_values_dict[id_column]
df.drop(id_like_columns, axis='columns', inplace=True)

In [14]:
# One DataFrame of dummy columns per encoded source column.
encoded_dataframes = []

# Encode the top-1 columns first, then the top-3 columns, so the resulting
# column order matches processing each dictionary in turn.
for value_dict in (filtered_top_1_value_dict, filtered_top_3_values_dict):
    for col, top_values in value_dict.items():
        if col not in df.columns:
            continue
        # Collapse every value outside the retained top values into "Others".
        collapsed = df[col].apply(
            lambda value: value if value in top_values else "Others"
        )
        encoded_dataframes.append(pd.get_dummies(collapsed, prefix=col))

# Place the dummy columns next to the original columns (aligned on the index).
df_encoded = pd.concat([df, *encoded_dataframes], axis=1)

# The source columns are now represented by their dummies, so drop them.
encoded_source_columns = [*filtered_top_1_value_dict, *filtered_top_3_values_dict]
df_encoded = df_encoded.drop(columns=encoded_source_columns)

In [15]:
# (rows, columns) of the frame after one-hot encoding.
df_encoded.shape

(3922695, 89)

Custom mapping for Target class

In [16]:
# Class distribution of the target column before it is mapped to integer codes.
df_encoded['IncidentGrade'].value_counts()

IncidentGrade
BenignPositive    1630942
TruePositive      1422856
FalsePositive      868897
Name: count, dtype: int64

In [17]:
# Integer code assigned to each target label.
custom_mapping = dict(BenignPositive=0, TruePositive=1, FalsePositive=2)

# Replace the label strings in the target column with their integer codes.
target_codes = df_encoded['IncidentGrade'].map(custom_mapping)
df_encoded['IncidentGrade'] = target_codes

# Load the model and test on the GUIDE_TEST dataset

In [20]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report

# Load the model from the file. A context manager closes the handle promptly
# instead of leaving it open until garbage collection.
# NOTE(review): pickle can execute arbitrary code on load; only load trusted files.
with open('../Resources/best_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

X = df_encoded.drop(columns=['IncidentGrade'])    # defining the features
y = df_encoded['IncidentGrade']    # defining the target variable

# The dummy columns above are derived from the values present in this dataset,
# so their set and order are not guaranteed to match what the model was fitted
# on. When the model records its training feature names, align to them:
# missing columns are filled with 0 and columns are put in the fitted order.
expected_features = getattr(model, 'feature_names_in_', None)
if expected_features is not None:
    X = X.reindex(columns=list(expected_features), fill_value=0)

# Predict the target variable
y_pred = model.predict(X)

# Evaluate performance (macro averaging weights the three classes equally)
print("Model Performance on Test Set:")
print(f"Macro-F1 Score: {f1_score(y, y_pred, average='macro'):.4f}")
print(f"Precision: {precision_score(y, y_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y, y_pred, average='macro'):.4f}")
print(f"Accuracy: {accuracy_score(y, y_pred):.4f}")
print("Classification R eport:")
print(classification_report(y, y_pred))

Model Performance on Test Set:
Macro-F1 Score: 0.6107
Precision: 0.6140
Recall: 0.6095
Accuracy: 0.6328
Classification R eport:
              precision    recall  f1-score   support

           0       0.65      0.69      0.67   1630942
           1       0.74      0.66      0.70   1422856
           2       0.46      0.48      0.47    868897

    accuracy                           0.63   3922695
   macro avg       0.61      0.61      0.61   3922695
weighted avg       0.64      0.63      0.63   3922695

