# XGBoost

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight

# Cleaned per-day capture files that together make up the training set
file_names = [
    "../train/02-14-2018_clean.csv",
    "../train/02-15-2018_clean.csv",
    "../train/02-16-2018_clean.csv",
    "../train/02-20-2018_clean.csv",
    "../train/02-21-2018_clean.csv",
    "../train/02-22-2018_clean.csv",
    "../train/02-23-2018_clean.csv",
    "../train/02-28-2018_clean.csv",
    "../train/03-01-2018_clean.csv",
    "../train/03-02-2018_clean.csv"
]

# Read every daily file into its own DataFrame, in list order
dfs = [pd.read_csv(path) for path in file_names]

# Stack the daily frames row-wise under a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Report (rows, columns) of the combined training frame
print(df.shape)

(12639793, 71)


In [2]:
# Load the held-out split and report its (rows, columns).
# NOTE(review): the recorded output shows 72 columns here versus 71 for the
# combined training frame — confirm which extra column the test file carries
# before the two frames are passed to the same model.
test = pd.read_csv("../test/test.csv")
print(test.shape)

(3159955, 72)


## Dropping the irrelevant features

In [3]:
# Columns retained as model inputs; every other column except 'Label' is
# discarded in the next step. Order is the order in which they were selected.
# NOTE(review): "Date" and "Time" are in this list — presumably capture
# timestamps; confirm they are numeric and are really meant to be predictors.
features = [
    "PSH Flag Cnt", "Dst Port", "Init Fwd Win Byts", "RST Flag Cnt",
    "Fwd PSH Flags", "Bwd IAT Max", "Date", "Fwd IAT Tot",
    "Fwd IAT Max", "Fwd IAT Min", "Fwd Seg Size Min", "Bwd Seg Size Avg",
    "Flow Duration", "Init Bwd Win Byts", "Fwd IAT Mean", "Pkt Len Std",
    "Bwd Pkts/s", "Time", "Bwd Pkt Len Mean", "Fwd Pkt Len Max",
    "Fwd Pkts/s", "Bwd Header Len", "Bwd IAT Tot", "Pkt Len Max",
    "Bwd Pkt Len Min", "Bwd Pkt Len Max", "Subflow Fwd Byts", "Flow IAT Max",
    "ACK Flag Cnt", "Flow IAT Mean", "TotLen Fwd Pkts", "Fwd Header Len",
    "Protocol", "ECE Flag Cnt", "Pkt Len Var", "SYN Flag Cnt",
]
len(features)

36

In [5]:
# Restrict both splits to the selected features plus the target column.
#
# The column list is derived once from the training frame and applied to both
# frames by *selection* rather than by dropping. The loaded test file has a
# different column count from the training data (72 vs 71), so dropping only
# the training frame's surplus columns would leave any test-only column in
# place and the two feature matrices would no longer line up.
cols_to_keep = [col for col in df.columns if col in features or col == 'Label']
# Still exposed under its original name: the training columns being discarded.
cols_to_drop = [col for col in df.columns if col not in cols_to_keep]

# Fail early, with the offending names, if the test file lacks a needed column.
missing_in_test = [col for col in cols_to_keep if col not in test.columns]
if missing_in_test:
    raise KeyError(f"test set is missing expected columns: {missing_in_test}")

# Same columns, same order, for both splits.
df = df[cols_to_keep]
test = test[cols_to_keep]

## Training the model

In [9]:
from xgboost import XGBClassifier

# Separate the predictors from the 'Label' target for each split.
X_train, y_train = df.drop(columns='Label'), df['Label']
X_test, y_test = test.drop(columns='Label'), test['Label']

In [None]:
# Baseline model: XGBClassifier with only the thread count set (n_jobs=8);
# no other constructor argument is passed. Fitted on the full training split.
model1 = XGBClassifier(n_jobs=8)
model1.fit(X_train, y_train)

In [8]:
# NOTE(review): stale cell. Its code is identical to the following cell, and
# its recorded output is a NameError because it was executed (In [8]) before
# the cell that imports XGBClassifier (In [9]). Candidate for deletion so the
# notebook runs cleanly top to bottom without training model2 twice.
model2 = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=8)
model2.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)])

NameError: name 'XGBClassifier' is not defined

In [None]:
# Model 2: up to 1000 boosting rounds at learning rate 0.05, stopped early
# once the monitored metric has not improved for 5 consecutive rounds.
#
# Early stopping is monitored on a slice held out from the *training* data.
# Monitoring on (X_test, y_test) would choose the number of trees using the
# same rows that are later used to report accuracy/precision/recall/F1,
# which makes those scores optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,       # 10% of the training rows reserved for monitoring
    stratify=y_train,    # keep the class proportions the same in both parts
    random_state=42,     # reproducible split
)

model2 = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=8)
# NOTE: passing `early_stopping_rounds` to fit() is deprecated since
# xgboost 1.6 and removed in later releases; on those versions pass it to the
# XGBClassifier constructor instead.
model2.fit(X_fit, y_fit, early_stopping_rounds=5, eval_set=[(X_val, y_val)])

In [None]:
# Model 3: same configuration as model 2, with class imbalance handled
# through per-row weights.
#
# `scale_pos_weight` is a binary-classification parameter (ratio of negative
# to positive rows). The labels here are multi-class — the ratio below is
# taken between classes 0 and 5 — so it cannot rebalance the classes.
# Balanced sample weights (inversely proportional to class frequency) are
# passed to fit() instead.
class_counts = y_train.value_counts()
# Retained under its original name for reference only; no longer passed to the model.
scale_pos_weight = class_counts[0] / class_counts[5]

# Monitor early stopping on a slice of the training data, not on the test
# set, so that the test metrics stay an unbiased estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,       # 10% of the training rows reserved for monitoring
    stratify=y_train,    # keep the class proportions the same in both parts
    random_state=42,     # reproducible split
)

# One weight per training row: rare classes get proportionally larger weights.
sample_weight = compute_sample_weight(class_weight='balanced', y=y_fit)

# Create and train the model
model3 = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=8)
# NOTE: passing `early_stopping_rounds` to fit() is deprecated since
# xgboost 1.6 and removed in later releases; on those versions pass it to the
# XGBClassifier constructor instead.
model3.fit(
    X_fit,
    y_fit,
    sample_weight=sample_weight,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
)

In [None]:
# Class predictions on the test predictors, model 1 first, then model 2.
# NOTE(review): no prediction is made for model3 in the cells visible here.
y_pred_model1, y_pred_model2 = (
    fitted.predict(X_test) for fitted in (model1, model2)
)

## Model Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Model 1: overall accuracy, then precision / recall / F1 averaged across
# classes with average='weighted'.
accuracy_model1 = accuracy_score(y_test, y_pred_model1)
precision_model1, recall_model1, f1_model1 = (
    scorer(y_test, y_pred_model1, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Model 1 Accuracy: ", accuracy_model1)
print("Model 1 Precision: ", precision_model1)
print("Model 1 Recall: ", recall_model1)
print("Model 1 F1 Score: ", f1_model1)

In [None]:
# Model 2: overall accuracy, then precision / recall / F1 averaged across
# classes with average='weighted'.
accuracy_model2 = accuracy_score(y_test, y_pred_model2)
precision_model2, recall_model2, f1_model2 = (
    scorer(y_test, y_pred_model2, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Model 2 Accuracy: ", accuracy_model2)
print("Model 2 Precision: ", precision_model2)
print("Model 2 Recall: ", recall_model2)
print("Model 2 F1 Score: ", f1_model2)

### Confusion matrices

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

def _show_confusion_matrix(cm, title):
    """Draw one confusion matrix as an annotated heatmap and display it.

    Parameters
    ----------
    cm : 2-D array of integer counts, as returned by ``confusion_matrix``
        (drawn with true classes on the y axis, predicted on the x axis).
    title : str
        Text placed above the heatmap.
    """
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cbar=False, cmap="YlGnBu", fmt="d")
    plt.title(title)
    plt.ylabel("True Class")
    plt.xlabel("Predicted Class")
    # Compute the layout only after the title and axis labels exist, so they
    # are taken into account and not clipped.
    plt.tight_layout()
    plt.show()


# Compute and plot the confusion matrix for model1
cm_model1 = confusion_matrix(y_test, y_pred_model1)
_show_confusion_matrix(cm_model1, "Confusion Matrix for Model 1")

# Compute and plot the confusion matrix for model2
cm_model2 = confusion_matrix(y_test, y_pred_model2)
_show_confusion_matrix(cm_model2, "Confusion Matrix for Model 2")