# **XGBoost**

## Base Model

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

### Without feature extraction

In [3]:
import pandas as pd
pd.set_option('display.max_columns', None)
training_set = pd.read_csv('cybersecurity_training.csv',sep="|")
testing_set = pd.read_csv('cybersecurity_test.csv',sep="|")


training_set = training_set.set_index('alert_ids')
testing_set = testing_set.set_index('alert_ids')

In [4]:
import pandas as pd

with open('cybersecurity_test_targets.txt', 'r') as f:
  test_targets = f.readlines()
test_targets = [int(target.strip()) for target in test_targets]

testing_set['notified'] = test_targets

In [5]:
training_set = training_set.drop("client_code",axis=1)
testing_set = testing_set.drop("client_code",axis=1)

In [6]:
training_set = training_set.fillna(-1)
testing_set = testing_set.fillna(-1)

In [7]:
import pandas as pd

# List of columns to convert to categorical
categorical_columns = ["categoryname", "ipcategory_name","ip", "ipcategory_scope", "parent_category",
                       "grandparent_category", "weekday", "isiptrusted", "enforcementscore",
                       "dstipcategory_dominate", "srcipcategory_dominate", "dstportcategory_dominate",
                       "srcportcategory_dominate","notified"]

# Convert specified columns to categorical type in training set
for col in categorical_columns:
  if col in training_set.columns:
    training_set[col] = training_set[col].astype('category')

# Convert specified columns to categorical type in testing set
for col in categorical_columns:
  if col in testing_set.columns:
    testing_set[col] = testing_set[col].astype('category')

In [8]:
X = training_set.drop("notified", axis=1)
Y = training_set["notified"]

In [None]:
X_train_full = X
y_train_full = Y
X_train_full

In [10]:
test_X = testing_set.drop("notified", axis=1)
test_y = testing_set["notified"]

In [11]:
clf = xgb.XGBClassifier(
    tree_method="auto",
    enable_categorical=True,
    max_cat_to_onehot=1,
    device="cuda",
    booster="gbtree",
    sampling_method="uniform",
    subsample=1,
    reg_lambda=1,
    alpha=10,
    scale_pos_weight=17,  # sum(negative instances) / sum(positive instances)
    process_type="default",
    objective="binary:logistic",
    eval_metric="auc",
    seed=42,
    colsample_bytree=0.6,
    learning_rate=0.001,
    max_depth=6,
    n_estimators=2000,
)


In [12]:
clf.fit(X, Y)
probs_test = clf.predict_proba(test_X)
auc_score = roc_auc_score(test_y, probs_test[:, 1])
auc_score

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




0.8257024795294661

### With feature extraction

In [13]:


import pandas as pd
pd.set_option('display.max_columns', None)
training_set = pd.read_csv('train_onehot.csv')
testing_set = pd.read_csv('test_onehot.csv')

training_set = training_set.set_index('alert_ids')
testing_set = testing_set.set_index('alert_ids')





In [14]:
X = training_set.drop("notified", axis=1)
Y = training_set["notified"]

In [None]:
X

In [None]:
X_train_full = X
y_train_full = Y
X_train_full

In [17]:
test_X = testing_set.drop("notified", axis=1)
test_y = testing_set["notified"]

In [18]:
clf = xgb.XGBClassifier(
    tree_method="auto",
    enable_categorical=True,
    max_cat_to_onehot=1,
    device="cuda",
    booster="gbtree",
    sampling_method="uniform",
    subsample=1,
    reg_lambda=1,
    alpha=10,
    scale_pos_weight=17,  # sum(negative instances) / sum(positive instances)
    process_type="default",
    objective="binary:logistic",
    eval_metric="auc",
    seed=42,
    colsample_bytree=0.6,
    learning_rate=0.001,
    max_depth=6,
    n_estimators=2000,
)


In [19]:
clf.fit(X, Y)
probs_test = clf.predict_proba(test_X)
auc_score = roc_auc_score(test_y, probs_test[:, 1])
auc_score

0.8809424518509976

# **Active learning**

## **Least confident**

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# List of random states for reproducibility
random_state = [6, 15, 10, 17, 42, 20, 67, 49, 12, 25]
auc_scores_total = {}
num_samples_per_iter = 1

# Loop through each random state
for rs in random_state:
    # Split the dataset into training and pool sets with a small initial training size
    X_train, X_pool, y_train, y_pool = train_test_split(X, Y, train_size=0.001, random_state=rs)
    auc_scores = []
    print("random state =", rs)
    
    # Active learning loop
    for i in range(200):
        # Fit the classifier on the current training set
        clf.fit(X_train, y_train)
        
        # Predict probabilities on the pool set and test set
        probs = clf.predict_proba(X_pool)
        probs_test = clf.predict_proba(test_X)
        
        # Calculate the AUC score and append to the list of scores
        auc_score = roc_auc_score(test_y, probs_test[:, 1])
        auc_scores.append(auc_score)
        
        print("X size:", len(X_train))
        print("AUC Score:", auc_score)
        
        # Calculate uncertainty of each sample in the pool set
        uncertainty = 1 - np.max(probs, axis=1)
        
        # Select the samples with the highest uncertainty
        query_indices = np.argsort(uncertainty)[-num_samples_per_iter:]
        query_samples = X_pool.iloc[query_indices]
        y_query_samples = y_pool.iloc[query_indices]
        
        # Add the selected samples to the training set and remove them from the pool set
        X_train = pd.concat([X_train, query_samples])
        y_train = pd.concat([y_train, y_query_samples])
        X_pool = X_pool.drop(query_samples.index)
        y_pool = y_pool.drop(query_samples.index)
    
    # Store the AUC scores for each random state
    auc_scores_total[rs] = auc_scores[:]

print("Finished")


In [20]:
auc_df = pd.DataFrame(auc_scores_total)
auc_df.to_csv('lc_scores.csv', index=False)

## **Margin Sampling**

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# List of random states for reproducibility
random_state = [6, 15, 10, 17, 42, 20, 67, 49, 12, 25]
num_samples_per_iter = 1
auc_scores_total_margin = {}

# Loop through each random state
for rs in random_state:
    # Split the dataset into training and pool sets with a small initial training size
    X_train, X_pool, y_train, y_pool = train_test_split(X, Y, train_size=0.001, random_state=rs)
    auc_scores = []
    print("random state =", rs)
    
    # Active learning loop
    for i in range(200):
        # Fit the classifier on the current training set
        clf.fit(X_train, y_train)
        
        # Predict probabilities on the pool set and test set
        probs = clf.predict_proba(X_pool)
        probs_test = clf.predict_proba(test_X)
        
        # Calculate the AUC score and append to the list of scores
        auc_score = roc_auc_score(test_y, probs_test[:, 1])
        auc_scores.append(auc_score)
        
        print("X size:", len(X_train))
        print("AUC Score:", auc_score)
        
        # Sort probabilities to calculate margins
        sorted_probs = np.sort(probs, axis=1)
        margins = sorted_probs[:, -1] - sorted_probs[:, -2]
        
        # Select the samples with the smallest margins
        query_indices = np.argsort(margins)[:num_samples_per_iter]
        query_samples = X_pool.iloc[query_indices]
        y_query_samples = y_pool.iloc[query_indices]
        
        # Add the selected samples to the training set and remove them from the pool set
        X_train = pd.concat([X_train, query_samples])
        y_train = pd.concat([y_train, y_query_samples])
        X_pool = X_pool.drop(query_samples.index)
        y_pool = y_pool.drop(query_samples.index)
    
    # Store the AUC scores for each random state
    auc_scores_total_margin[rs] = auc_scores[:]

print("Finished")


In [None]:
auc_df = pd.DataFrame(auc_scores_total_margin)
auc_df.to_csv('ms_scores.csv', index=False)

## **entropy**

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from scipy.stats import entropy

# List of random states for reproducibility
random_state = [6, 15, 10, 17, 42, 20, 67, 49, 12, 25]
auc_scores_total_entropy = {}
num_samples_per_iter = 1

# Loop through each random state
for rs in random_state:
    # Split the dataset into training and pool sets with a small initial training size
    X_train, X_pool, y_train, y_pool = train_test_split(X, Y, train_size=0.001, random_state=rs)
    auc_scores = []
    
    # Active learning loop
    for i in range(200):
        # Fit the classifier on the current training set
        clf.fit(X_train, y_train)
        
        # Predict probabilities on the pool set and test set
        probs = clf.predict_proba(X_pool)
        probs_test = clf.predict_proba(test_X)
        
        # Calculate the AUC score and append to the list of scores
        auc_score = roc_auc_score(test_y, probs_test[:, 1])
        auc_scores.append(auc_score)
        
        print("AUC Score:", auc_score)
        
        # Normalize probabilities
        probs_normalized = probs / probs.sum(axis=1, keepdims=True)
        
        # Calculate entropies (avoid log(0))
        entropies = -np.sum(probs_normalized * np.log(probs_normalized + 1e-10), axis=1)
        
        # Select the samples with the highest entropy
        query_indices = np.argsort(entropies)[-num_samples_per_iter:]
        query_samples = X_pool.iloc[query_indices]
        y_query_samples = y_pool.iloc[query_indices]
        
        # Add the selected samples to the training set and remove them from the pool set
        X_train = pd.concat([X_train, query_samples])
        y_train = pd.concat([y_train, y_query_samples])
        X_pool = X_pool.drop(query_samples.index)
        y_pool = y_pool.drop(query_samples.index)
    
    # Store the AUC scores for each random state
    auc_scores_total_entropy[rs] = auc_scores[:]

print("Finished")


In [None]:
auc_df = pd.DataFrame(auc_scores_total_entropy)
auc_df.to_csv('entropy_scores.csv', index=False)

# **Density-Weighted**

## **Density-entropy**

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# List of random states for reproducibility
random_state = [6, 15, 10, 17, 42, 20, 67, 49, 12, 25]
auc_scores_total_density_entropy = {}
num_samples_per_iter = 1

# Function to calculate density
def calculate_density(X):
    nbrs = NearestNeighbors(n_neighbors=5).fit(X)
    distances, _ = nbrs.kneighbors(X)
    density = np.mean(distances, axis=1)
    return density

# Loop through each random state
for rs in random_state:
    # Split the dataset into training and pool sets with a small initial training size
    X_train, X_pool, y_train, y_pool = train_test_split(X, Y, train_size=0.001, random_state=rs)
    auc_scores = []
    densities = calculate_density(X_pool)
    valid_mask = np.ones(len(X_pool), dtype=bool)
    
    # Active learning loop
    for i in range(200):
        # Fit the classifier on the current training set
        clf.fit(X_train, y_train)
        
        # Predict probabilities on the pool set and test set
        probs = clf.predict_proba(X_pool)
        probs_test = clf.predict_proba(test_X)
        
        # Calculate the AUC score and append to the list of scores
        auc_score = roc_auc_score(test_y, probs_test[:, 1])
        auc_scores.append(auc_score)
        
        print("AUC Score:", auc_score)
        
        # Normalize probabilities
        probs_normalized = probs / probs.sum(axis=1, keepdims=True)
        
        # Calculate entropies (avoid log(0))
        entropies = -np.sum(probs_normalized * np.log(probs_normalized + 1e-10), axis=1)
        
        # Calculate weighted entropies
        weighted_entropies = entropies[valid_mask] / densities[valid_mask]
        
        # Select the samples with the highest weighted entropies
        query_indices = np.argsort(weighted_entropies)[-num_samples_per_iter:]
        queried_indices = np.where(valid_mask)[0][query_indices]
        query_samples = X_pool.iloc[queried_indices]
        y_query_samples = y_pool.iloc[queried_indices]
        
        # Add the selected samples to the training set and remove them from the pool set
        X_train = pd.concat([X_train, query_samples])
        y_train = pd.concat([y_train, y_query_samples])
        valid_mask[queried_indices] = False
    
    X_pool = X_pool[valid_mask]
    y_pool = y_pool[valid_mask]
    # Store the AUC scores for each random state
    auc_scores_total_density_entropy[rs] = auc_scores[:]

print("Finished")


In [None]:
auc_df = pd.DataFrame(auc_scores_total_density_entropy)
auc_df.to_csv('de_scores.csv', index=False)

## **Density-lc**

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# List of random states for reproducibility
random_state = [6, 15, 10, 17, 42, 20, 67, 49, 12, 25]
auc_scores_total_density_least_confident = {}
num_samples_per_iter = 1

# Function to calculate density
def calculate_density(X):
    nbrs = NearestNeighbors(n_neighbors=5).fit(X)
    distances, _ = nbrs.kneighbors(X)
    density = np.mean(distances, axis=1)
    return density

# Loop through each random state
for rs in random_state:
    # Split the dataset into training and pool sets with a small initial training size
    X_train, X_pool, y_train, y_pool = train_test_split(X, Y, train_size=0.001, random_state=rs)
    auc_scores = []
    print("random state =", rs)
    densities = calculate_density(X_pool)
    valid_mask = np.ones(len(X_pool), dtype=bool)
    
    # Active learning loop
    for i in range(200):
        # Fit the classifier on the current training set
        clf.fit(X_train, y_train)
        
        # Predict probabilities on the pool set and test set
        probs = clf.predict_proba(X_pool)
        probs_test = clf.predict_proba(test_X)
        
        # Calculate the AUC score and append to the list of scores
        auc_score = roc_auc_score(test_y, probs_test[:, 1])
        auc_scores.append(auc_score)
        print("AUC Score:", auc_score)
        
        # Calculate least confident scores
        least_confident = 1 - np.max(probs, axis=1)
        
        # Calculate weighted least confident scores
        weighted_least_confident = least_confident[valid_mask] / densities[valid_mask]
        
        # Select the samples with the highest weighted least confident scores
        query_indices = np.argsort(weighted_least_confident)[-num_samples_per_iter:]
        queried_indices = np.where(valid_mask)[0][query_indices]
        query_samples = X_pool.iloc[queried_indices]
        y_query_samples = y_pool.iloc[queried_indices]
        
        # Add the selected samples to the training set and remove them from the pool set
        X_train = pd.concat([X_train, query_samples])
        y_train = pd.concat([y_train, y_query_samples])
        valid_mask[queried_indices] = False
    
    # Update the pool set with valid samples
    X_pool = X_pool[valid_mask]
    y_pool = y_pool[valid_mask]
    
    # Store the AUC scores for each random state
    auc_scores_total_density_least_confident[rs] = auc_scores[:]

print("Finished")


In [None]:
auc_df = pd.DataFrame(auc_scores_total_density_lest_confident)
auc_df.to_csv('dlc_scores.csv', index=False)

## **Density-margin**

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# List of random states for reproducibility
random_state = [6, 15, 10, 17, 42, 20, 67, 49, 12, 25]
auc_scores_total_density_margin = {}
num_samples_per_iter = 1

# Function to calculate density
def calculate_density(X):
    nbrs = NearestNeighbors(n_neighbors=5).fit(X)
    distances, _ = nbrs.kneighbors(X)
    density = np.mean(distances, axis=1)
    return density

# Loop through each random state
for rs in random_state:
    # Split the dataset into training and pool sets with a small initial training size
    X_train, X_pool, y_train, y_pool = train_test_split(X, Y, train_size=0.001, random_state=rs)
    print("random state =", rs)
    auc_scores = []
    densities = calculate_density(X_pool)
    valid_mask = np.ones(len(X_pool), dtype=bool)
    
    # Active learning loop
    for i in range(200):
        # Fit the classifier on the current training set
        clf.fit(X_train, y_train)
        
        # Predict probabilities on the pool set and test set
        probs = clf.predict_proba(X_pool)
        probs_test = clf.predict_proba(test_X)
        
        # Calculate the AUC score and append to the list of scores
        auc_score = roc_auc_score(test_y, probs_test[:, 1])
        auc_scores.append(auc_score)
        print("AUC Score:", auc_score)
        
        # Sort probabilities to calculate margins
        sorted_probs = np.sort(probs, axis=1)
        margin = sorted_probs[:, -1] - sorted_probs[:, -2]
        
        # Calculate weighted margins
        weighted_margin = margin[valid_mask] / densities[valid_mask]
        
        # Select the samples with the smallest margins
        query_indices = np.argsort(weighted_margin)[:num_samples_per_iter]
        queried_indices = np.where(valid_mask)[0][query_indices]
        query_samples = X_pool.iloc[queried_indices]
        y_query_samples = y_pool.iloc[queried_indices]
        
        # Add the selected samples to the training set and remove them from the pool set
        X_train = pd.concat([X_train, query_samples])
        y_train = pd.concat([y_train, y_query_samples])
        valid_mask[queried_indices] = False
    
    # Update the pool set with valid samples
    X_pool = X_pool[valid_mask]
    y_pool = y_pool[valid_mask]
    
    # Store the AUC scores for each random state
    auc_scores_total_density_margin[rs] = auc_scores[:]

print("Finished")


In [None]:
auc_df = pd.DataFrame(auc_scores_total_density_margin)
auc_df.to_csv('dm_scores.csv', index=False)