In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# -------------------------------------
# Step 1: Upload and Load the WEB Logs CSV File
from google.colab import files
uploaded = files.upload()

Saving weblog.csv to weblog.csv


In [None]:
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Since the file may have no header, we assign column names:
# Columns: IP, Timestamp, Request, Status
df = pd.read_csv("weblog.csv", header=None, names=["IP", "Timestamp", "Request", "Status"])
print("First few rows of the dataset:")
print(df.head())
print("\nData Info:")
print(df.info())

First few rows of the dataset:
           IP              Timestamp                                Request  \
0          IP                   Time                                    URL   
1  10.128.2.1  [29/Nov/2017:06:58:55                GET /login.php HTTP/1.1   
2  10.128.2.1  [29/Nov/2017:06:59:02             POST /process.php HTTP/1.1   
3  10.128.2.1  [29/Nov/2017:06:59:03                 GET /home.php HTTP/1.1   
4  10.131.2.1  [29/Nov/2017:06:59:04  GET /js/vendor/moment.min.js HTTP/1.1   

  Status  
0  Staus  
1    200  
2    302  
3    200  
4    200  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16008 entries, 0 to 16007
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   IP         16008 non-null  object
 1   Timestamp  16008 non-null  object
 2   Request    16008 non-null  object
 3   Status     16008 non-null  object
dtypes: object(4)
memory usage: 500.4+ KB
None


In [None]:
# Step 2: Data Cleaning & Preprocessing

# Remove any extraneous characters from the Timestamp (e.g. the leading '[')
df['Timestamp'] = df['Timestamp'].str.replace(r'^\[', '', regex=True)


In [None]:
# Convert Timestamp to datetime using format: day/Mon/year:hour:minute:second
# Example: "29/Nov/2017:06:58:55"
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format="%d/%b/%Y:%H:%M:%S", errors='coerce')

# Extract time features: hour and minute
df['hour'] = df['Timestamp'].dt.hour
df['minute'] = df['Timestamp'].dt.minute

# Convert Status to numeric (if not already)
df['Status'] = pd.to_numeric(df['Status'], errors='coerce').fillna(0).astype(int)


In [None]:
# Create target label:
# If status is 200, label as normal (0); else, label as suspicious (1)
df['issue'] = df['Status'].apply(lambda x: 0 if x == 200 else 1)

print("\nLabel distribution:")
print(df['issue'].value_counts())

# Extract HTTP method from the Request column (assume the method is the first token)
df['Method'] = df['Request'].apply(lambda x: str(x).split()[0] if pd.notnull(x) and len(str(x).split())>0 else "Unknown")



Label distribution:
issue
0    11330
1     4678
Name: count, dtype: int64


In [None]:
# Step 3: Feature Selection
# Define features:
# - "Request": full HTTP request text, to be vectorized.
# - "Method": HTTP method (categorical).
# - "hour" and "minute": numeric features.
feature_columns = ["Request", "Method", "hour", "minute"]
X = df[feature_columns]
y = df['issue']

In [None]:
# Step 4: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Step 5: Build a Pipeline with Feature Processing and XGBoost Classification
# We use a ColumnTransformer to:
# - Apply TfidfVectorizer to the "Request" column.
# - One-hot encode the "Method" column.
# - Scale the numeric features "hour" and "minute".
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(), 'Request'),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Method']),
    ('num', StandardScaler(), ['hour', 'minute'])
])


In [None]:
# Build the pipeline with XGBoost as the classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=100,
                                 use_label_encoder=False,
                                 eval_metric='logloss',
                                 random_state=42))
])

In [None]:
# Step 6: Train the Model
pipeline.fit(X_train, y_train)

# -------------------------------------
# Step 7: Make Predictions on the Test Set
y_pred = pipeline.predict(X_test)


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Step 8: Evaluate the Model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3359
           1       0.95      0.90      0.93      1444

    accuracy                           0.96      4803
   macro avg       0.95      0.94      0.95      4803
weighted avg       0.96      0.96      0.96      4803

Accuracy: 0.9566937330834895
Confusion Matrix:
[[3289   70]
 [ 138 1306]]
