In [32]:
import pandas as pd
import numpy as np

# Read data

In [33]:
# Load the synthetic log dataset and preview its first five rows
df = pd.read_csv('synthetic_logs.csv')
df.head(n=5)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [34]:
# (rows, columns) of the loaded frame
df.shape

(2410, 5)

In [35]:
# Distinct values found in the `source` column
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [36]:
# Distinct values found in the `target_label` column
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

### Clustering
Cluster the log messages with DBSCAN, using a sentence transformer to generate the embeddings.

In [37]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

# Encode every log message into a sentence embedding with the all-MiniLM-L6-v2 model
model = SentenceTransformer('all-MiniLM-L6-v2')
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages, show_progress_bar=True)



Batches: 100%|██████████| 76/76 [00:03<00:00, 23.18it/s]


In [38]:
# Peek at the first two embedding vectors
embeddings[:2]

array([[-1.02939673e-01,  3.35459784e-02, -2.20260806e-02,
         1.55102601e-03, -9.86915827e-03, -1.78956300e-01,
        -6.34410828e-02, -6.01761751e-02,  2.81108413e-02,
         5.99619783e-02, -1.72618497e-02,  1.43372838e-03,
        -1.49560034e-01,  3.15285241e-03, -5.66031151e-02,
         2.71685794e-02, -1.49890231e-02, -3.54038030e-02,
        -3.62936929e-02, -1.45410579e-02, -5.61500154e-03,
         8.75538066e-02,  4.55120578e-02,  2.50964463e-02,
         1.00187277e-02,  1.24266697e-02, -1.39923573e-01,
         7.68695846e-02,  3.14095095e-02, -4.15249076e-03,
         4.36902270e-02,  1.71250384e-02, -8.00950825e-02,
         5.74005730e-02,  1.89091824e-02,  8.55261683e-02,
         3.96399498e-02, -1.34371817e-01, -1.44371425e-03,
         3.06706317e-03,  1.76854134e-01,  4.44883108e-03,
        -1.69274695e-02,  2.24266555e-02, -4.35050204e-02,
         6.09027082e-03, -9.98169463e-03, -6.23972826e-02,
         1.07371965e-02, -6.04894804e-03, -7.14660361e-0

In [39]:
# Cluster the embeddings with DBSCAN on cosine distance and store each row's cluster id
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
cluster_ids = dbscan.fit_predict(embeddings)
df['cluster'] = cluster_ids

df.loc[:, ['log_message', 'cluster']].head()

  ret = a @ b
  ret = a @ b
  ret = a @ b


Unnamed: 0,log_message,cluster
0,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,0
1,Email service experiencing issues with sending,1
2,Unauthorized access to data was attempted,2
3,nova.osapi_compute.wsgi.server [req-4895c258-b...,0
4,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,0


In [40]:
# First ten rows assigned to cluster 5
df[df['cluster'] == 5].head(10)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5
26,2025-03-03 17:11:11,ModernCRM,nova.compute.claims [req-d6986b54-3735-4a42-90...,Resource Usage,bert,5
40,2025-06-19 21:42:34,ThirdPartyAPI,nova.compute.claims [req-72b4858f-049e-49e1-b3...,Resource Usage,bert,5
58,2025-09-13 14:45:14,AnalyticsEngine,nova.compute.claims [req-5c8f52bd-8e3c-41f0-95...,Resource Usage,bert,5
61,2025-04-27 11:18:18,ThirdPartyAPI,nova.compute.claims [req-d38f479d-9bb9-4276-96...,Resource Usage,bert,5
64,2025-06-20 23:40:51,ModernHR,nova.compute.claims [req-d82fab16-60f8-4c9f-bd...,Resource Usage,bert,5
109,2025-02-03 06:35:20,AnalyticsEngine,nova.compute.claims [req-868a5460-dbb6-416b-b4...,Resource Usage,bert,5
138,2025-06-26 02:46:29,AnalyticsEngine,nova.compute.claims [req-2d658d2c-7eff-414e-a6...,Resource Usage,bert,5
152,2025-03-29 08:14:30,ThirdPartyAPI,nova.compute.claims [req-29a09cdb-3169-4c40-8b...,Resource Usage,bert,5
177,2025-11-03 04:49:47,ModernCRM,nova.compute.claims [req-9118475d-6e72-48fa-9d...,Resource Usage,bert,5


In [41]:
cluster_counts = df['cluster'].value_counts().sort_values(ascending=False)

# For every cluster holding more than 10 records, print its first 5 log messages
large_clusters = cluster_counts[cluster_counts > 10]
for cluster_id, count in large_clusters.items():
    print(f"\nCluster {cluster_id} (count: {count}):")
    in_cluster = df['cluster'] == cluster_id
    sample_logs = df.loc[in_cluster, 'log_message'].head(5)
    for log in sample_logs:
        print(f"- {log}")


Cluster 0 (count: 1017):
- nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
- nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
- nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
- nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fa

In [42]:
# Positions 70-99 of the cluster-0 messages (positional slice, not index labels)
df.loc[df['cluster'] == 0, 'log_message'].iloc[70:100]

162    nova.osapi_compute.wsgi.server [req-36f6bb06-1...
164    nova.osapi_compute.wsgi.server [req-964322ee-2...
165    nova.metadata.wsgi.server [-] 10.11.21.136,10....
166    nova.osapi_compute.wsgi.server [req-63dbb4ce-0...
167    nova.osapi_compute.wsgi.server [req-3f659609-f...
170    nova.osapi_compute.wsgi.server [req-f8409217-a...
171    nova.osapi_compute.wsgi.server [req-60ef3ade-e...
172    nova.metadata.wsgi.server [req-e5e061d6-f2aa-4...
174    nova.osapi_compute.wsgi.server [req-a6b9779d-b...
178    nova.osapi_compute.wsgi.server [req-ab030d68-6...
179    nova.osapi_compute.wsgi.server [req-0f3adb60-b...
180    nova.metadata.wsgi.server [req-81a97e64-19c0-4...
182    nova.metadata.wsgi.server [req-35ea282d-a35a-4...
184    nova.metadata.wsgi.server [req-c22c0e7f-ff55-4...
188    nova.metadata.wsgi.server [-] 10.11.21.129,10....
189    nova.osapi_compute.wsgi.server [req-95cb0030-6...
199    nova.osapi_compute.wsgi.server [req-a6d2fe15-c...
201    nova.osapi_compute.wsgi.

### Classification Stage 1: Regex


In [43]:
import re
# Ordered (compiled pattern, label) rules, built once instead of on every call.
# Order matters: the first rule that matches decides the label.
# The trailing "." in several patterns is an unescaped regex wildcard; it is kept
# as-is so existing matches do not change.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (out|in).", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        # Fixed: the pattern used to start with a stray "/", so it could never
        # match a message beginning with "Elevation of admin privileges ...".
        # NOTE(review): confirm "System Notification" is the intended label for
        # privilege-elevation messages.
        (r"Elevation of admin privileges detected for user \d+", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Return the label of the first regex rule that matches ``log_message``.

    Matching is case-insensitive and un-anchored (``search``), so a rule may
    match anywhere inside the message.

    Args:
        log_message: The raw log line. Non-string values (e.g. ``None`` or a
            NaN coming from a missing CSV cell) are treated as "no match".

    Returns:
        The label string of the first matching rule, or ``None`` when no rule
        matches.
    """
    # Guard against missing values so Series.apply does not raise TypeError.
    if not isinstance(log_message, str):
        return None
    for compiled_pattern, label in _REGEX_RULES:
        if compiled_pattern.search(log_message):
            return label
    return None

In [44]:
# Quick manual check: this message should hit the "Account with ID ... created by ..." rule
classify_with_regex("Account with ID A3837737 created by hgf")

'User Action'

In [45]:
# Re-inspect the frame, which now also carries the `cluster` column
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [46]:
# Stage 1: run the regex rules over every message, then show the rows that received a label
regex_labels = df['log_message'].apply(classify_with_regex)
df['regex_label'] = regex_labels
df[df['regex_label'].notna()]


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,4,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
30,4/26/2025 7:54,AnalyticsEngine,Backup started at 2025-05-14 07:06:55.,System Notification,regex,13,System Notification
...,...,...,...,...,...,...,...
2368,12/13/2025 20:04,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification


In [47]:
# Shape of the rows the regex stage left unlabeled
df[df['regex_label'].isna()].shape

(1970, 7)

### Classification Stage 2: Classification Using Embeddings


In [48]:
# Keep only the rows the regex stage could not label; these feed the embedding-based classifier
unlabeled_mask = df['regex_label'].isna()
df_non_regex = df[unlabeled_mask].copy()
print(df_non_regex.shape)
df_non_regex

(1970, 7)


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [49]:
# List the target labels that have 5 or fewer samples among the regex-unlabeled rows
label_counts = df_non_regex['target_label'].value_counts()
print(label_counts[label_counts <= 5].index.to_list())



In [50]:
# Exclude the LegacyCRM rows, then confirm which sources remain
is_legacy = df_non_regex['source'] == 'LegacyCRM'
df_non_legacy = df_non_regex[~is_legacy]
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [51]:
# Embed only the messages left after the regex and LegacyCRM filters, then peek at two vectors
remaining_messages = df_non_legacy['log_message'].tolist()
filtered_embeddings = model.encode(remaining_messages, show_progress_bar=True)
filtered_embeddings[:2]



Batches: 100%|██████████| 62/62 [00:03<00:00, 20.57it/s]


array([[-1.02939673e-01,  3.35459784e-02, -2.20260806e-02,
         1.55102601e-03, -9.86915827e-03, -1.78956300e-01,
        -6.34410828e-02, -6.01761751e-02,  2.81108413e-02,
         5.99619783e-02, -1.72618497e-02,  1.43372838e-03,
        -1.49560034e-01,  3.15285241e-03, -5.66031151e-02,
         2.71685794e-02, -1.49890231e-02, -3.54038030e-02,
        -3.62936929e-02, -1.45410579e-02, -5.61500154e-03,
         8.75538066e-02,  4.55120578e-02,  2.50964463e-02,
         1.00187277e-02,  1.24266697e-02, -1.39923573e-01,
         7.68695846e-02,  3.14095095e-02, -4.15249076e-03,
         4.36902270e-02,  1.71250384e-02, -8.00950825e-02,
         5.74005730e-02,  1.89091824e-02,  8.55261683e-02,
         3.96399498e-02, -1.34371817e-01, -1.44371425e-03,
         3.06706317e-03,  1.76854134e-01,  4.44883108e-03,
        -1.69274695e-02,  2.24266555e-02, -4.35050204e-02,
         6.09027082e-03, -9.98169463e-03, -6.23972826e-02,
         1.07371965e-02, -6.04894804e-03, -7.14660361e-0

In [52]:
# Features are the embeddings; targets are the labels of the same filtered rows
X, y = filtered_embeddings, df_non_legacy['target_label']

In [53]:
# Shape of the feature matrix built from the filtered embeddings
X.shape

(1963, 384)

In [54]:
# Shape of the label vector; its length should match the first dimension of X
y.shape

(1963,)

In [55]:
# Peek at the first ten feature vectors
X[:10]

array([[-0.10293967,  0.03354598, -0.02202608, ...,  0.00457782,
        -0.04259717,  0.00322625],
       [ 0.00804574, -0.03573923,  0.04938735, ...,  0.0153832 ,
        -0.06230948, -0.02774666],
       [-0.00908222,  0.13003926, -0.05275569, ...,  0.02014105,
        -0.05117101, -0.02930291],
       ...,
       [-0.06878446,  0.04965053, -0.05144547, ..., -0.02104377,
        -0.0047827 , -0.03082291],
       [-0.10485216,  0.0480139 , -0.03708714, ...,  0.01767948,
        -0.03316486, -0.02589289],
       [ 0.04371327, -0.00678536,  0.08758418, ...,  0.05661778,
        -0.04497545, -0.05268194]], dtype=float32)

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Hold out 30% of the samples (fixed seed), fit logistic regression on the rest,
# and print per-class precision/recall/F1 for the held-out set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

     Critical Error       0.95      0.98      0.97        58
              Error       0.98      0.93      0.95        44
        HTTP Status       1.00      1.00      1.00       305
     Resource Usage       1.00      1.00      1.00        50
     Security Alert       0.99      0.99      0.99       111
System Notification       1.00      1.00      1.00        21

           accuracy                           0.99       589
          macro avg       0.99      0.98      0.99       589
       weighted avg       0.99      0.99      0.99       589



  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [57]:
import joblib

In [59]:
import os

# Make sure the output directory exists, then persist the trained classifier with joblib
model_dir = '../models'
model_path = '../models/log_classifier.joblib'
os.makedirs(model_dir, exist_ok=True)
joblib.dump(clf, model_path)

['../models/log_classifier.joblib']