## Find the basic `**df.log_message**` pattens for RegEX

In [None]:
import numpy as np
import pandas as pd

# pd.set_option("display.max_rows", None)      # show all rows
# pd.set_option("display.max_columns", None)   # show all columns
# pd.set_option("display.max_colwidth", None)  # don't truncate column text
# pd.set_option("display.expand_frame_repr", False)  # don't wrap output


df = pd.read_csv("datasets\synthetic_logs.csv")
df.head()

Unnamed: 0,timestamp,source,log_message,target_label
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [3]:
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = encoder.encode(df['log_message'].tolist())

In [4]:
# Save the model for future reference

encoder.save("../models/transformer_encoder")

In [5]:
print(embeddings.shape)

embeddings[0]

(2410, 384)


array([-1.02939636e-01,  3.35459523e-02, -2.20260546e-02,  1.55100389e-03,
       -9.86921228e-03, -1.78956240e-01, -6.34410754e-02, -6.01762086e-02,
        2.81108841e-02,  5.99619858e-02, -1.72618460e-02,  1.43366517e-03,
       -1.49560049e-01,  3.15286848e-03, -5.66030517e-02,  2.71685906e-02,
       -1.49889914e-02, -3.54037695e-02, -3.62935811e-02, -1.45410020e-02,
       -5.61488094e-03,  8.75538737e-02,  4.55121286e-02,  2.50962824e-02,
        1.00187296e-02,  1.24266446e-02, -1.39923573e-01,  7.68696517e-02,
        3.14094983e-02, -4.15249960e-03,  4.36903723e-02,  1.71250124e-02,
       -8.00951347e-02,  5.74005879e-02,  1.89092755e-02,  8.55261460e-02,
        3.96399647e-02, -1.34371817e-01, -1.44359958e-03,  3.06711020e-03,
        1.76854119e-01,  4.44889208e-03, -1.69274341e-02,  2.24267263e-02,
       -4.35049385e-02,  6.09024614e-03, -9.98167042e-03, -6.23972081e-02,
        1.07372217e-02, -6.04892010e-03, -7.14660808e-02, -8.45806301e-03,
       -3.18020806e-02, -

## clustering to find the `RegEx` model pattern

In [6]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import itertools 


eps_values = [0.1, 0.2, 0.3]
min_samples_values = [1, 2, 3]
metrics = ["euclidean", "manhattan", "cosine"]

for eps, ms, m in itertools.product(eps_values, min_samples_values, metrics):
    model = DBSCAN(eps=eps, min_samples=ms, metric=m)
    labels = model.fit_predict(embeddings)
    print(f"params={eps, ms, m}, score :{silhouette_score(embeddings, labels)}")

params=(0.1, 1, 'euclidean'), score :0.11310874670743942
params=(0.1, 1, 'manhattan'), score :0.06680496782064438
params=(0.1, 1, 'cosine'), score :0.38429829478263855
params=(0.1, 2, 'euclidean'), score :-0.10708826035261154
params=(0.1, 2, 'manhattan'), score :-0.016362475231289864
params=(0.1, 2, 'cosine'), score :0.3475953936576843
params=(0.1, 3, 'euclidean'), score :-0.036138638854026794
params=(0.1, 3, 'manhattan'), score :0.08554428070783615
params=(0.1, 3, 'cosine'), score :0.32906919717788696
params=(0.2, 1, 'euclidean'), score :0.2678360939025879
params=(0.2, 1, 'manhattan'), score :0.06680496782064438
params=(0.2, 1, 'cosine'), score :0.45236268639564514
params=(0.2, 2, 'euclidean'), score :0.1817118376493454
params=(0.2, 2, 'manhattan'), score :-0.016362475231289864
params=(0.2, 2, 'cosine'), score :0.4567190408706665
params=(0.2, 3, 'euclidean'), score :0.21452811360359192
params=(0.2, 3, 'manhattan'), score :0.08554428070783615
params=(0.2, 3, 'cosine'), score :0.4547082

- I want my circle could be too tight so `params=(0.2, 1, 'cosine'), score :0.45236268639564514`

In [7]:
model = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
cluster = model.fit_predict(embeddings)

df['cluster'] = cluster
df['cluster'].unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135], dtype=int64)

In [8]:
df[df['cluster'] == 0][:5]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
5,09-10-2025 10:30,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,0
9,30-03-2025 04:01,ModernHR,nova.osapi_compute.wsgi.server [req-2bf7cfee-a...,HTTP Status,0


In [9]:
cluster_count = df['cluster'].value_counts()

large_clusters = [cluster for cluster, count in cluster_count.items() if count > 10]

for c in large_clusters:
    logs = df[df['cluster'] == c]['log_message']
    print(f"\n=== Cluster {c} ===")
    print(logs.head().to_string(index=False))


=== Cluster 0 ===
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

=== Cluster 5 ===
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...

=== Cluster 11 ===
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

=== Cluster 13 ===
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

=== Cluster 7 ===
Multiple bad login attempts detected on user 85...
Multiple login failure

## `Stage: 1` - **Regex** Classification

In [10]:
import re

def classify_with_regex(message:str) -> str:
    regex_pattens = {
        r"User User\d+ logged (in|out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for patten, label in regex_pattens.items():
        if re.search(patten, message):
            return label
    return None

In [11]:
classify_with_regex("Backup started at 2025-05-14 07:06:55.")

'System Notification'

In [12]:
df['regex_label'] = df['log_message'].apply(lambda x : classify_with_regex(x))

In [13]:
df[df['regex_label'].isnull()].head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,


In [14]:
df[df['regex_label'].notnull()].head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10-11-2025 08:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
14,01-04-2025 01:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
15,05-01-2025 09:41,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,11,User Action


In [15]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1910, 6)

In [None]:
df_non_regex.to_csv("datasets/bert_df.csv")