In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from urllib.parse import urlparse

# Load the cleaned dataset from disk
df = pd.read_csv('data/fully_cleaned_dataset_no_anomaly.csv')

# Step 1: Report the relative frequency of each class label
class_shares = df['classification'].value_counts(normalize=True)
print("Class distribution:")
print(class_shares)

# Step 2: Enhanced feature engineering
# Each feature is stored as an integer column; Step 6 selects the model
# inputs by numeric dtype, so the boolean flags are encoded as 0/1 ints.

# Substrings (matched against the lower-cased URL) that flag it as suspicious.
SUSPICIOUS_TOKENS = ('<', '>', 'script', 'union', '..')


def _text_or_empty(value):
    """Return ``str(value)``, or ``''`` when the cell is missing (NaN/None).

    ``str(float('nan'))`` is the truthy three-character string ``'nan'``, so
    converting a missing header directly would make it look like real content
    (a cookie that is "present", a User-Agent of length 3).
    """
    return '' if pd.isna(value) else str(value)


def _path_depth(url):
    """Return the number of '/' separators in the path component of *url*."""
    return len(urlparse(url).path.split('/')) - 1


def _count_query_params(url):
    """Return the number of '&'-separated query parameters (0 if no query)."""
    query = urlparse(url).query  # parse once instead of once per branch
    return len(query.split('&')) if query else 0


def _has_suspicious_chars(url):
    """Return 1 if *url* contains any of SUSPICIOUS_TOKENS, else 0."""
    lowered = url.lower()
    return 1 if any(token in lowered for token in SUSPICIOUS_TOKENS) else 0


# URL-based features (the URL is converted to text once and reused)
url_text = df['URL'].apply(str)
df['url_length'] = url_text.apply(len)
df['query_present'] = url_text.apply(lambda url: 1 if '?' in url else 0)
df['path_depth'] = url_text.apply(_path_depth)
df['num_params'] = url_text.apply(_count_query_params)
df['has_suspicious_chars'] = url_text.apply(_has_suspicious_chars)

# Method-based features (one-hot: GET / POST / anything else)
method_upper = df['Method'].apply(lambda method: str(method).upper())
df['method_is_get'] = method_upper.apply(lambda method: 1 if method == 'GET' else 0)
df['method_is_post'] = method_upper.apply(lambda method: 1 if method == 'POST' else 0)
df['method_is_other'] = method_upper.apply(lambda method: 1 if method not in ('GET', 'POST') else 0)

# Header-based features; missing headers count as empty rather than as 'nan'
df['user_agent_length'] = df['User-Agent'].apply(lambda agent: len(_text_or_empty(agent)))
df['has_cookie'] = df['cookie'].apply(lambda cookie: 1 if _text_or_empty(cookie).strip() else 0)
df['connection_closed'] = df['connection'].apply(lambda conn: 1 if 'close' in _text_or_empty(conn).lower() else 0)

# Step 3: Split the distinct URLs, so the split is made per URL, not per row
unique_urls = df['URL'].unique()
train_urls, test_urls = train_test_split(
    unique_urls,
    test_size=0.2,
    random_state=42,
)

# Step 4: Route each row to the partition that holds its URL
in_train = df['URL'].isin(train_urls)
in_test = df['URL'].isin(test_urls)
train_df = df[in_train]
test_df = df[in_test]

# Step 5: Separate the target column from the remaining columns
X_train, y_train = train_df.drop(columns=['classification']), train_df['classification']
X_test, y_test = test_df.drop(columns=['classification']), test_df['classification']

# Step 6: Keep only the int64/float64 columns as model inputs
numeric_view = X_train.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_view.columns
print(f"\nNumeric features in X_train: {list(numeric_cols)}")
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# Step 7: Fit a random forest on the training rows, then score the test rows
train_features = X_train[numeric_cols]
test_features = X_test[numeric_cols]
model = RandomForestClassifier(random_state=42)
model.fit(train_features, y_train)
y_pred = model.predict(test_features)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("\nClassification report:")
print(report)

# Step 8: Label the fitted model's importances with the feature names and
# list them from most to least important
importances = pd.Series(data=model.feature_importances_, index=numeric_cols)
ranked_importances = importances.sort_values(ascending=False)
print("\nFeature importances:")
print(ranked_importances)

# Step 9: Count the URLs that appear in both the train and test partitions
train_urls_set = set(X_train['URL'])
test_urls_set = set(X_test['URL'])
url_overlap = train_urls_set & test_urls_set
overlap_count = len(url_overlap)
print(f"Number of overlapping URLs: {overlap_count}")

Class distribution:
classification
0    0.589536
1    0.410464
Name: proportion, dtype: float64

Numeric features in X_train: ['url_length', 'query_present', 'path_depth', 'num_params', 'has_suspicious_chars', 'method_is_get', 'method_is_post', 'method_is_other', 'user_agent_length', 'has_cookie', 'connection_closed']
Train size: 46996, Test size: 14069
Model accuracy on test set: 0.52

Classification report:
              precision    recall  f1-score   support

           0       0.85      0.30      0.44      9029
           1       0.42      0.91      0.57      5040

    accuracy                           0.52     14069
   macro avg       0.64      0.60      0.51     14069
weighted avg       0.70      0.52      0.49     14069


Feature importances:
url_length              0.538624
path_depth              0.166412
method_is_get           0.112105
method_is_post          0.066068
num_params              0.056902
has_suspicious_chars    0.025600
query_present           0.023064
method_