# FROM FLOWS TO DATAFRAME

In [12]:
import pandas as pd

# Load the flow records. low_memory=False turns off chunked parsing, which
# avoids the mixed-type column inference that chunking can cause.
df = pd.read_csv("../data/Friday-02-03-2018_Traffic.csv", low_memory=False)

# Binary target: 1 for every flow whose Label is anything other than 'Benign',
# 0 otherwise. The comparison is vectorized (no per-row Python call); the
# explicit int64 cast keeps the column an integer 0/1 flag on every platform.
df['is_bot'] = df['Label'].ne('Benign').astype('int64')

# Show the class balance of the derived target.
print(df['is_bot'].value_counts())

is_bot
0    762384
1    286191
Name: count, dtype: int64


# Data Cleaning

In [13]:
import numpy as np

# Turn +/-inf into NaN first so a single dropna() removes both kinds of bad
# value; rows containing any NaN are discarded (filling with 0 would be the
# alternative).
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Sanity checks: nothing missing anywhere, nothing infinite in numeric columns.
numeric_part = df.select_dtypes(include=[np.number])
print("Any NaNs:", df.isna().values.any())
print("Any Infs:", np.isinf(numeric_part).values.any())


Any NaNs: False
Any Infs: False


# Shuffling and splitting the data

In [14]:
from sklearn.model_selection import train_test_split

# Target vector: the binary is_bot flag.
y = df['is_bot']

# Feature matrix: everything except the raw label, the derived target and the
# timestamp column; any missing values that remain are replaced with 0.
X = df.drop(columns=['Label', 'is_bot', 'Timestamp']).fillna(0)

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.3,
)

# Training the classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split (fit() returns the estimator itself, so the calls can be chained).
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Per-class precision/recall/F1, followed by the raw confusion matrix.
# NOTE(review): if the scores come out (near-)perfect, check the feature set
# for columns that leak the label before trusting the result.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    227501
           1       1.00      1.00      1.00     85857

    accuracy                           1.00    313358
   macro avg       1.00      1.00      1.00    313358
weighted avg       1.00      1.00      1.00    313358

[[227499      2]
 [     7  85850]]


# Checking feature importance

In [16]:
import matplotlib.pyplot as plt

# Importance scores from the fitted forest, aligned with the feature columns
# the model was trained on.
importances = model.feature_importances_
features = X.columns

# Rank by importance before slicing: taking the first 20 entries would just
# plot the first 20 columns in frame order, not the 20 most important features.
top_n = 20
order = importances.argsort()[::-1][:top_n]  # indices of the largest scores, descending
top_features = features[order]
top_importances = importances[order]

fig, ax = plt.subplots(figsize=(12, 8))
# barh draws its first entry at the bottom, so reverse the ranking to put the
# most important feature at the top of the chart.
ax.barh(top_features[::-1], top_importances[::-1])
ax.set_title("Top 20 Features for Bot Detection")
ax.set_xlabel("Feature importance")
fig.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'matplotlib'