In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
from data_loader import load_folder
import pandas as pd

In [16]:
# Root folder of the raw recordings. This is the only machine-specific value in
# the cell: change it here when running the notebook from another checkout.
RAW_DATA_DIR = "D:/Files/GitHub/DL-BotGuard/data/raw"

# Human recordings are tagged label=0 and bot recordings label=1. The second
# value returned by load_folder is not used in this notebook and is discarded.
human_df, _ = load_folder(f"{RAW_DATA_DIR}/our/v1/mouse", label=0)
bot_df, _ = load_folder(f"{RAW_DATA_DIR}/our_bot/v1/mouse", label=1)

In [17]:
# Column dtypes plus the true in-memory size (object columns measured deeply).
human_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1772654 entries, 0 to 1772653
Data columns (total 15 columns):
 #   Column        Dtype  
---  ------        -----  
 0   timestamp_ms  float64
 1   session_id    object 
 2   event_type    object 
 3   pid           float64
 4   process_name  object 
 5   source        object 
 6   x             float64
 7   y             float64
 8   key           object 
 9   type          object 
 10  label         int64  
 11  dt            float64
 12  dx            float64
 13  dy            float64
 14  speed         float64
dtypes: float64(8), int64(1), object(6)
memory usage: 692.6 MB


In [18]:
# Same summary for the bot recordings: dtypes, non-null counts, deep memory size.
bot_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp_ms  8154 non-null   float64
 1   session_id    8154 non-null   object 
 2   event_type    8154 non-null   object 
 3   pid           8151 non-null   float64
 4   process_name  8151 non-null   object 
 5   source        8154 non-null   object 
 6   x             8129 non-null   float64
 7   y             8129 non-null   float64
 8   key           0 non-null      object 
 9   type          8154 non-null   object 
 10  label         8154 non-null   int64  
 11  dt            8154 non-null   float64
 12  dx            8154 non-null   float64
 13  dy            8154 non-null   float64
 14  speed         8154 non-null   float64
dtypes: float64(8), int64(1), object(6)
memory usage: 3.2 MB


In [19]:
# Keep only the first 10,000 human events as an independent copy. The human
# frame is far larger than the bot frame (about 1.77M rows vs about 8k rows);
# presumably this keeps the two classes comparable in size - TODO confirm.
human_df = human_df.head(10_000).copy()

In [None]:
def build_windows(df, window_size=1000):
    """Aggregate raw events into fixed-size, non-overlapping per-session windows.

    Events are ordered by ``session_id`` and ``timestamp_ms``; every session is
    then cut into consecutive chunks of exactly ``window_size`` events, and one
    feature row is produced per chunk. A trailing chunk shorter than
    ``window_size`` is dropped, so sessions with fewer than ``window_size``
    events contribute no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``session_id``, ``timestamp_ms``, ``label``,
        ``speed``, ``dt``, ``dx``, ``dy``, ``event_type``, ``type`` and
        ``process_name``. The caller's frame is not modified.
    window_size : int, default 1000
        Number of events per window; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        One row per full window with the columns ``session_id``, ``label``,
        ``n_events``, ``mean_speed``, ``std_speed``, ``max_speed``, ``mean_dt``,
        ``std_dt``, ``mean_dx``, ``mean_dy``, ``n_clicks``, ``n_keys`` and
        ``main_proc``. These columns are present even when no window could be
        built, so downstream column access keeps working on an empty result.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(
            f"window_size must be a positive integer, got {window_size!r}"
        )

    # Fixed output schema: guarantees the same columns for an empty result.
    columns = [
        "session_id", "label", "n_events",
        "mean_speed", "std_speed", "max_speed",
        "mean_dt", "std_dt",
        "mean_dx", "mean_dy",
        "n_clicks", "n_keys",
        "main_proc",
    ]
    rows = []

    df = df.sort_values(["session_id", "timestamp_ms"]).reset_index(drop=True)

    for session_id, d in df.groupby("session_id"):
        # The label of the session's first event is used for all its windows.
        label = d["label"].iloc[0]

        # Stepping only over starts that leave room for a full window drops the
        # short trailing chunk without building it first.
        for start in range(0, len(d) - window_size + 1, window_size):
            chunk = d.iloc[start:start + window_size]

            speed = chunk["speed"]
            dt = chunk["dt"]
            # mode() ignores missing values and may be empty; compute it once.
            proc_mode = chunk["process_name"].mode()

            rows.append({
                "session_id": session_id,
                "label": label,

                "n_events": len(chunk),
                "mean_speed": speed.mean(),
                # Sample standard deviation: NaN when window_size == 1.
                "std_speed": speed.std(),
                "max_speed": speed.max(),

                "mean_dt": dt.mean(),
                "std_dt": dt.std(),

                "mean_dx": chunk["dx"].mean(),
                "mean_dy": chunk["dy"].mean(),

                "n_clicks": (chunk["event_type"] == "click").sum(),
                "n_keys": (chunk["type"] == "key").sum(),

                # Most frequent process name in the window; ties resolve to the
                # first entry returned by mode(), "none" when all are missing.
                "main_proc": proc_mode.iloc[0] if not proc_mode.empty else "none",
            })

    return pd.DataFrame(rows, columns=columns)


In [21]:
# Stack the human and bot events into a single frame with a fresh 0..n-1 index.
frames = [human_df, bot_df]
df_all = pd.concat(frames, ignore_index=True)

In [22]:
# Cut the combined events into 100-event windows and preview the first rows.
windows = build_windows(df=df_all, window_size=100)
windows.head(5)

Unnamed: 0,session_id,label,n_events,mean_speed,std_speed,max_speed,mean_dt,std_dt,mean_dx,mean_dy,n_clicks,n_keys,main_proc
0,7cdf9a1d-0738-40d1-acd7-fd3254f3a004,1,100,0.784957,0.619197,2.236068,3.45647,5.204117,0.47,0.05,1,0,explorer.exe
1,7cdf9a1d-0738-40d1-acd7-fd3254f3a004,1,100,2.321762,0.543171,3.605551,1.01967,0.040272,1.77,-1.5,0,0,explorer.exe
2,7cdf9a1d-0738-40d1-acd7-fd3254f3a004,1,100,1.503278,0.402793,2.236068,1.02011,0.044471,0.81,-1.22,0,0,explorer.exe
3,7cdf9a1d-0738-40d1-acd7-fd3254f3a004,1,100,3.837563,2.774243,9.219544,9.4425,43.819955,-2.93,2.37,1,0,explorer.exe
4,7cdf9a1d-0738-40d1-acd7-fd3254f3a004,1,100,0.918832,0.319295,1.414214,1.97564,5.275621,-0.57,0.08,0,0,explorer.exe


In [23]:
# Number of windows per class (0 = human, 1 = bot) to check the class balance.
label_counts = windows["label"].value_counts()
label_counts

label
0    100
1     80
Name: count, dtype: int64

In [24]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Encode the dominant process name as an integer code.
# NOTE(review): LabelEncoder assigns arbitrary ordinal codes, which a linear
# model reads as magnitudes; one-hot encoding may be more appropriate - TODO confirm.
proc_le = LabelEncoder()
windows["main_proc_enc"] = proc_le.fit_transform(windows["main_proc"])

X = windows.drop(["session_id", "label", "main_proc"], axis=1)
y = windows["label"]

# Split BEFORE scaling so that the scaler's mean/std are estimated from the
# training rows only; fitting it on the full matrix leaks test-set statistics
# into training and makes the reported scores optimistic.
# NOTE(review): rows are split per window, so windows from one session can land
# in both train and test; a group-aware split on session_id would be stricter,
# but needs several sessions per class - TODO confirm how many sessions exist.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Scale numeric features using training-set statistics only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the same train-only statistics, kept under its
# original name in case a later cell expects it.
X_scaled = scaler.transform(X)

clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.92      0.94        25
           1       0.90      0.95      0.93        20

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.93      0.93      0.93        45

