In [48]:
# load dataset 
import pandas as pd
data=pd.read_csv('../data/train.csv')
data.head()

Unnamed: 0,service,flag,src_bytes,dst_bytes,logged_in,count,serror_rate,srv_serror_rate,same_srv_rate,diff_srv_rate,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,level,attack
0,44,9,146,0,0,13,0.0,0.0,0.08,0.15,...,255,1,0.0,0.6,0.88,0.0,0.0,0.0,15,1
1,49,5,0,0,0,123,1.0,1.0,0.05,0.07,...,255,26,0.1,0.05,0.0,0.0,1.0,1.0,19,0
2,24,9,232,8153,1,5,0.2,0.2,1.0,0.0,...,30,255,1.0,0.0,0.03,0.04,0.03,0.01,21,1
3,24,9,199,420,1,30,0.0,0.0,1.0,0.0,...,255,255,1.0,0.0,0.0,0.0,0.0,0.0,21,1
4,49,1,0,0,0,121,0.0,0.0,0.16,0.06,...,255,19,0.07,0.07,0.0,0.0,0.0,0.0,21,0


In [49]:
# train test split
from sklearn.model_selection import train_test_split

X = data.drop(["attack"], axis=1)
y = data["attack"]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1,random_state=43) 

In [50]:
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
# model build
import lightgbm as lgb

# Create LightGBM datasets for training and testing
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define hyperparameters
params = {
    "objective": "binary",
    "boosting_type": "rf",
    "num_leaves": 30,
    "max_depth": -1,
    "force_row_wise": True,
    "learning_rate": 0.01,
    "metric": "binary_logloss",
    "bagging_fraction": 0.6,
    "bagging_freq": 5,
    "feature_fraction": 0.6,
    "n_estimators": 1000,
    "min_data_in_leaf": 30,
    "max_bin": 1024,
    "lambda_l1": 0.1,
    "lambda_l2": 0.2,
}

# Train the LightGBM model
num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])

[LightGBM] [Info] Number of positive: 60572, number of negative: 52802
[LightGBM] [Info] Total Bins 4132
[LightGBM] [Info] Number of data points in the train set: 113374, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.534267 -> initscore=0.137284
[LightGBM] [Info] Start training from score 0.137284


In [55]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Make predictions
y_pred = bst.predict(X_test)

# Convert probabilities to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1score = f1_score(y_test, y_pred_binary)
# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1score:.4f}")

Accuracy: 0.9966
Precision: 0.9990
Recall: 0.9947
F1-Score: 0.9968


In [56]:
from sklearn.metrics import classification_report
# Generate classification report
report = classification_report(y_test, y_pred_binary)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      5828
           1       1.00      0.99      1.00      6770

    accuracy                           1.00     12598
   macro avg       1.00      1.00      1.00     12598
weighted avg       1.00      1.00      1.00     12598

