In [30]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import pickle

# Read the Trojan-detection flow records from CSV
# (the path is relative to the notebook; adjust it for your layout).
data = pd.read_csv(filepath_or_buffer='../datasets/Trojan_Detection.csv')

# Preview the top five rows as plain text to see which columns are present.
print(data.head(n=5))

   Unnamed: 0                                Flow ID    Source IP  \
0       73217    10.42.0.42-121.14.255.84-49975-80-6   10.42.0.42   
1       72089  172.217.6.226-10.42.0.42-443-49169-17   10.42.0.42   
2       96676       10.42.0.1-10.42.0.42-53-37749-17   10.42.0.42   
3       42891       10.42.0.1-10.42.0.42-53-41352-17   10.42.0.42   
4      169326  10.42.0.151-107.22.241.77-44353-443-6  10.42.0.151   

    Source Port  Destination IP   Destination Port   Protocol  \
0         49975   121.14.255.84                 80          6   
1         49169   172.217.6.226                443         17   
2         37749       10.42.0.1                 53         17   
3         41352       10.42.0.1                 53         17   
4         44353   107.22.241.77                443          6   

             Timestamp   Flow Duration   Total Fwd Packets  ...  \
0  17/07/2017 01:18:33        10743584                   4  ...   
1  17/07/2017 10:25:25          254217                   6  

In [31]:
# Tally the null entries per column and print the result under a heading.
print("Missing values in each column:")
null_counts = data.isna().sum()
print(null_counts)

Missing values in each column:
Unnamed: 0         0
Flow ID            0
 Source IP         0
 Source Port       0
 Destination IP    0
                  ..
Idle Mean          0
 Idle Std          0
 Idle Max          0
 Idle Min          0
Class              0
Length: 86, dtype: int64


In [32]:
# Several CSV headers carry leading spaces (e.g. ' Source IP' in the null
# report), so normalise the names by trimming surrounding whitespace before
# any column is referenced by name.
stripped_names = data.columns.str.strip()
data.columns = stripped_names

In [33]:
# Discard the unnamed leading column (presumably a saved row index) together
# with the flow ID, endpoint IPs and timestamp, so that none of them ends up
# in the feature matrix built from `data` later on.
# NOTE: relies on the column names having been whitespace-stripped already.
identifier_columns = ['Unnamed: 0', 'Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
data = data.drop(labels=identifier_columns, axis='columns')


In [34]:
# Replace the 'Class' labels with integer codes. LabelEncoder numbers the
# distinct labels in sorted order, so Benign -> 0 / Trojan -> 1 holds provided
# those are the only two labels -- inspect `label_encoder.classes_` to confirm.
label_encoder = LabelEncoder().fit(data['Class'])
encoded_class = label_encoder.transform(data['Class'])
data['Class'] = encoded_class

In [35]:
# Fill any remaining missing values with the median of their column (mean or
# mode are alternatives, depending on the feature).
# `numeric_only=True` is passed explicitly: a median only exists for numeric
# columns, and on pandas 2.x a bare `data.median()` raises TypeError as soon as
# one non-numeric column is still present. With the flag, such columns are
# simply left unfilled instead of aborting the cell.
# NOTE(review): the medians are computed over the full dataset, i.e. before the
# train/test split. The null check above reports no missing values, so this is
# currently a no-op; if real gaps appear, derive the medians from the training
# rows only to avoid leaking test-set statistics.
data = data.fillna(data.median(numeric_only=True))

In [36]:
# Feature matrix: every remaining column except the encoded target 'Class'.
X = data.drop(labels=['Class'], axis=1)

In [37]:
# Target vector: the integer-encoded 'Class' column.
y = data.loc[:, 'Class']

In [38]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
# Baseline LightGBM classifier: gradient-boosted trees with a binary
# objective, reporting binary error as the metric. All other
# hyperparameters are left at the library defaults.
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
}
lgbm_model = lgb.LGBMClassifier(**lgbm_params)


In [40]:
# Fit the baseline classifier on the training split.
lgbm_model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 72623, number of negative: 69362
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14573
[LightGBM] [Info] Number of data points in the train set: 141985, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511484 -> initscore=0.045943
[LightGBM] [Info] Start training from score 0.045943


In [41]:
# Serialise the fitted baseline classifier to 'lgbm_model.pkl' in the current
# working directory. Only unpickle this file from a trusted source: loading a
# pickle can execute arbitrary code.
with open('lgbm_model.pkl', mode='wb') as model_file:
    pickle.dump(lgbm_model, model_file)

print("Model saved as 'lgbm_model.pkl'")

Model saved as 'lgbm_model.pkl'


In [42]:
# Record the training feature names, one per line, in 'features.txt'.
# A DataFrame supplies its column labels; any other 2-D input falls back to
# generated placeholder names Feature_0 .. Feature_{n-1}.
if isinstance(X_train, pd.DataFrame):
    feature_names = X_train.columns
else:
    feature_names = [f"Feature_{idx}" for idx in range(X_train.shape[1])]

with open('features.txt', 'w') as features_file:
    features_file.writelines(feature + '\n' for feature in feature_names)

print("Features saved as 'features.txt'")

Features saved as 'features.txt'


In [43]:
# Predict class labels for the held-out test split with the baseline model.
y_pred = lgbm_model.predict(X=X_test)

In [44]:
# Score the baseline predictions against the held-out labels: overall
# accuracy, the per-class precision/recall/F1 report, and the confusion matrix.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:", report_text, sep="\n")
print("\nConfusion Matrix:", matrix, sep="\n")

Accuracy: 0.6989

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.72      0.70     17437
           1       0.72      0.67      0.70     18060

    accuracy                           0.70     35497
   macro avg       0.70      0.70      0.70     35497
weighted avg       0.70      0.70      0.70     35497


Confusion Matrix:
[[12631  4806]
 [ 5881 12179]]


In [None]:
# Hyperparameter grid for the GridSearchCV run: 2 x 3 x 2 x 3 = 36 candidate
# combinations. This cell only defines the grid.
# NOTE(review): max_depth=-1 is presumably LightGBM's "no depth limit"
# setting -- confirm against the LightGBM parameter docs.
param_grid = dict(
    num_leaves=[31, 50],
    max_depth=[10, 20, -1],
    learning_rate=[0.05, 0.1],
    n_estimators=[20, 50, 100],
)


In [46]:
# Exhaustively evaluate every combination in `param_grid` with 3-fold
# cross-validation on the training split, starting from the baseline
# estimator's settings. verbose=1 prints the fit count summary.
grid_search = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid,
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y=y_train)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
[LightGBM] [Info] Number of positive: 48415, number of negative: 46241
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14533
[LightGBM] [Info] Number of data points in the train set: 94656, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511484 -> initscore=0.045943
[LightGBM] [Info] Start training from score 0.045943
[LightGBM] [Info] Number of positive: 48416, number of negative: 46241
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14539
[LightGBM] [Info] Number of data points in the train set: 94657, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511489 -> initscore=0.045963
[Ligh

In [48]:
# Serialise the whole fitted GridSearchCV object (not just the best estimator)
# to 'lgbm_model_GS.pkl'. Only unpickle this file from a trusted source:
# loading a pickle can execute arbitrary code.
with open('lgbm_model_GS.pkl', mode='wb') as search_file:
    pickle.dump(grid_search, search_file)

print("Model saved as 'lgbm_model_GS.pkl'")

Model saved as 'lgbm_model_GS.pkl'


In [49]:
# Show the hyperparameter combination that the grid search selected.
print("\nBest Parameters from GridSearchCV:", grid_search.best_params_, sep="\n")


Best Parameters from GridSearchCV:
{'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 50}


In [50]:
# Take the estimator the grid search selected and measure its accuracy on the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"\nAccuracy with Best Model: {best_accuracy:.4f}")


Accuracy with Best Model: 0.7115
