In [None]:
# Install necessary libraries
!pip install -q scikit-learn pandas xgboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# -------------------------------------
# Step 1: Upload and Load the Server Suspicious Logs CSV File
# google.colab is only importable inside a Google Colab runtime.
from google.colab import files
# Interactive step: the notebook waits here for a file to be chosen.
# The next cell reads the data by the fixed name "server_logs_suspicious.csv",
# so the uploaded file must carry exactly that name.
uploaded = files.upload()  # Upload your "server_logs_suspicious.csv" file


[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 324, in run
    session = self.get_default_session(options)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/index_command.py", line 71, in get_default_session
    self._session = self.enter_context(self._build_session(options))
                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/index_command.py", line 100, in _build_sess

KeyboardInterrupt: 

In [None]:
# Load the uploaded CSV into a DataFrame and take a first look at it.
df = pd.read_csv("server_logs_suspicious.csv")
print("First few rows of the dataset:")
print(df.head())
print("\nData Info:")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
df.info()

First few rows of the dataset:
           Date first seen   Duration  Proto    Src IP Addr  Src Pt  \
0  2017-03-14 17:43:57.172  81412.697  TCP       EXT_SERVER    8082   
1  2017-03-14 17:43:57.172  81412.697  TCP    OPENSTACK_NET   56978   
2  2017-03-14 17:43:26.135  81504.787  TCP       EXT_SERVER    8082   
3  2017-03-14 17:43:26.135  81504.787  TCP    OPENSTACK_NET   56979   
4  2017-03-14 18:17:09.005  82100.692  TCP       EXT_SERVER    8082   

     Dst IP Addr   Dst Pt  Packets     Bytes  Flows   Flags  Tos   class  \
0  OPENSTACK_NET  56978.0     3057     2.1 M      1  .AP...    0  normal   
1     EXT_SERVER   8082.0     4748     2.5 M      1  .AP...    0  normal   
2  OPENSTACK_NET  56979.0     8639     9.1 M      1  .AP...    0  normal   
3     EXT_SERVER   8082.0    12024    10.3 M      1  .AP...    0  normal   
4  OPENSTACK_NET  51649.0    11012    27.2 M      1  .AP.S.    0  normal   

  attackType attackID attackDescription  
0        ---      ---               ---  
1

In [None]:
# Custom function to convert the "Bytes" column from strings like "2.1 M" or "500K" to a numeric value.
def convert_bytes(val):
    """Convert a byte-count value such as "2.1 M" or "500K" to a float.

    Parameters
    ----------
    val : object
        Raw cell value. It is stringified and stripped first, so plain
        numbers (``4748``) and numeric strings (``"4748"``) are accepted too.

    Returns
    -------
    float
        The parsed number, with a trailing ``K``/``M``/``G``/``T`` suffix
        (case-insensitive) expanded as a decimal multiplier
        (1e3 / 1e6 / 1e9 / 1e12). Anything that cannot be parsed, including
        an empty string or a placeholder such as "Unknown", yields ``0.0``.
    """
    # Decimal multipliers keyed by upper-cased unit suffix.
    multipliers = {'K': 1e3, 'M': 1e6, 'G': 1e9, 'T': 1e12}

    try:
        text = str(val).strip()
    except Exception:
        # A value whose __str__ fails is treated like any other unparseable value.
        return 0.0

    # Guard against "" (or whitespace only): indexing text[-1] would raise IndexError.
    if not text:
        return 0.0

    suffix = text[-1].upper()
    try:
        if suffix in multipliers:
            # float() tolerates the space left between number and suffix ("2.1 ").
            return float(text[:-1]) * multipliers[suffix]
        return float(text)
    except ValueError:
        return 0.0

# Fill missing values
# Replaces every missing value in every column with the string "Unknown", in place.
# NOTE(review): this applies to numeric columns as well; the later
# pd.to_numeric(..., errors='coerce') calls turn "Unknown" back into NaN before
# filling with 0. A missing `class` value becomes "Unknown" and is therefore
# labelled suspicious (1) in Step 3 — confirm that is intended.
df.fillna("Unknown", inplace=True)



In [None]:
# Coerce each numeric column to a number: entries that cannot be parsed become
# NaN and are then replaced by that column's fill value.
numeric_fill_values = {
    'Duration': 0.0,
    'Packets': 0,
    'Flows': 0,
    'Src Pt': 0,
    'Dst Pt': 0,
}
for column_name, fill_value in numeric_fill_values.items():
    parsed_column = pd.to_numeric(df[column_name], errors='coerce')
    df[column_name] = parsed_column.fillna(fill_value)

# "Bytes" can carry a unit suffix (e.g. "2.1 M"), so it goes through
# convert_bytes and is stored in a separate numeric column.
df['Bytes_num'] = df['Bytes'].apply(convert_bytes)


In [None]:
# Parse "Date first seen" into datetimes (unparseable values become NaT), then
# derive the hour of day from it; rows without a valid timestamp get hour 0.
first_seen = pd.to_datetime(df['Date first seen'], errors='coerce')
df['Date first seen'] = first_seen
df['hour'] = first_seen.dt.hour.fillna(0).astype(int)


In [None]:
# Step 3: Create a Target Label
# The binary target comes from the "class" column: a value equal to "normal"
# (ignoring case and surrounding whitespace) maps to 0, anything else maps to 1.
def _label_from_class(raw_class):
    """Return 0 for a "normal" class value and 1 for every other value."""
    if str(raw_class).strip().lower() == 'normal':
        return 0
    return 1


df['suspicious'] = df['class'].map(_label_from_class)

print("\nLabel distribution:")
print(df['suspicious'].value_counts())


Label distribution:
suspicious
1    123232
0     49606
Name: count, dtype: int64


In [None]:
# Step 4: Feature Selection
# Numeric model inputs (scaled later in the pipeline).
num_features = ['Duration', 'Packets', 'Flows', 'Src Pt', 'Dst Pt', 'Bytes_num', 'hour']
# Categorical model input (one-hot encoded later in the pipeline).
cat_features = ['Proto']

# Feature matrix: numeric columns first, then the categorical one.
feature_columns = [*num_features, *cat_features]
X = df[feature_columns]
# Target vector: the binary label built in Step 3.
y = df['suspicious']

In [None]:
# Step 5: Split Data into Training and Testing Sets
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Step 6: Build a Pipeline with Feature Processing and XGBoost Classification
# Use a ColumnTransformer to scale numeric features and one-hot encode the categorical feature.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    # handle_unknown='ignore': a category not seen during fit is encoded as all
    # zeros instead of raising an error at prediction time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# `use_label_encoder` is deliberately not passed: the installed XGBoost does not
# use it and warns 'Parameters: { "use_label_encoder" } are not used.' on fit.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=100,
        eval_metric='logloss',
        random_state=42))
])


In [None]:
# Step 7: Train the Model
# Fits the preprocessing steps and the classifier together, using the training
# split only.
pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Step 8: Make Predictions on the Test Set
# X_test passes through the same pipeline object that was fitted in Step 7, so
# it is preprocessed before being classified.
y_pred = pipeline.predict(X_test)

In [None]:
# Step 9: Evaluate the Model
# Compute every metric on the held-out test split first, then print them.
report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("\nClassification Report:")
print(report_text)
print("Accuracy:", test_accuracy)
print("Confusion Matrix:")
print(confusion)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14831
           1       1.00      1.00      1.00     37021

    accuracy                           1.00     51852
   macro avg       1.00      1.00      1.00     51852
weighted avg       1.00      1.00      1.00     51852

Accuracy: 0.9999614286816323
Confusion Matrix:
[[14829     2]
 [    0 37021]]
