In [1]:
import pandas as pd

df = pd.read_csv('dataset\synthetic_logs.csv')
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [2]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [3]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [5]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import numpy as np

#Load pre-trained SentenceTransformer Model
model =  SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for log messages
embeddings = model.encode(df['log_message'].tolist())
embeddings[:2]

array([[-1.02939673e-01,  3.35458741e-02, -2.20260676e-02,
         1.55104266e-03, -9.86922532e-03, -1.78956285e-01,
        -6.34410158e-02, -6.01762086e-02,  2.81108543e-02,
         5.99619895e-02, -1.72618385e-02,  1.43369357e-03,
        -1.49560049e-01,  3.15287942e-03, -5.66030405e-02,
         2.71685142e-02, -1.49890827e-02, -3.54037359e-02,
        -3.62936184e-02, -1.45410867e-02, -5.61493495e-03,
         8.75538886e-02,  4.55120392e-02,  2.50963718e-02,
         1.00187194e-02,  1.24266790e-02, -1.39923587e-01,
         7.68696144e-02,  3.14095393e-02, -4.15245071e-03,
         4.36902530e-02,  1.71249788e-02, -8.00950751e-02,
         5.74005730e-02,  1.89092103e-02,  8.55261907e-02,
         3.96399423e-02, -1.34371817e-01, -1.44361309e-03,
         3.06702894e-03,  1.76854089e-01,  4.44887718e-03,
        -1.69274695e-02,  2.24266443e-02, -4.35050502e-02,
         6.09023077e-03, -9.98169743e-03, -6.23972639e-02,
         1.07372375e-02, -6.04898110e-03, -7.14660957e-0

In [6]:
#Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.2,min_samples=1,metric='cosine')
clusters = dbscan.fit_predict(embeddings)

# Add cluster labels to the DataFrame
df['cluster'] = clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [7]:
df[df.cluster==1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,bert,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,bert,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,bert,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,bert,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,bert,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,bert,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,bert,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,bert,1


In [8]:
cluster_counts = df['cluster'].value_counts()
large_clusters = cluster_counts[cluster_counts > 10].index

for cluster in large_clusters:
    print(f"Cluster {cluster}:")
    print(df[df['cluster'] == cluster]['log_message'].head(5).to_string(index=False))
    print()

Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 5:
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...

Cluster 11:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 13:
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

Cluster 7:
Multiple bad login attempts detected on user 85...
Multiple login failures occurred on user 9052 a...
  User 

In [9]:
import re
def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out)." : "User Action",
        r"Backup (started|ended) at .*" : "System Notification",
        r"Backup completed successfully." : "System Notification",
        r"System updated to version .*" : "System Notification",
        r"File .* uploaded successfully by user .*" : "System Notification",
        r"Disk cleanup completed successfully." : "System Notification",
        r"System reboot initiated by user .*" : "System Notification",
        r"Account with ID .* created by .*" : "User Action"
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    return None

In [10]:
classify_with_regex("User User800 logged OUT.")

'User Action'

In [11]:
classify_with_regex("Hello bro")

In [12]:
df['regex_label'] = df['log_message'].apply(classify_with_regex)
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,


In [13]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1910, 7)

In [14]:
df.shape

(2410, 7)

**Filter Out Less Samples**

In [15]:
df_non_legacy = df[df.source != 'LegacyCMR']

In [16]:
# Generate embeddings for log messages
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist())
filtered_embeddings[:2]

array([[-1.02939673e-01,  3.35458741e-02, -2.20260676e-02,
         1.55104266e-03, -9.86922532e-03, -1.78956285e-01,
        -6.34410158e-02, -6.01762086e-02,  2.81108543e-02,
         5.99619895e-02, -1.72618385e-02,  1.43369357e-03,
        -1.49560049e-01,  3.15287942e-03, -5.66030405e-02,
         2.71685142e-02, -1.49890827e-02, -3.54037359e-02,
        -3.62936184e-02, -1.45410867e-02, -5.61493495e-03,
         8.75538886e-02,  4.55120392e-02,  2.50963718e-02,
         1.00187194e-02,  1.24266790e-02, -1.39923587e-01,
         7.68696144e-02,  3.14095393e-02, -4.15245071e-03,
         4.36902530e-02,  1.71249788e-02, -8.00950751e-02,
         5.74005730e-02,  1.89092103e-02,  8.55261907e-02,
         3.96399423e-02, -1.34371817e-01, -1.44361309e-03,
         3.06702894e-03,  1.76854089e-01,  4.44887718e-03,
        -1.69274695e-02,  2.24266443e-02, -4.35050502e-02,
         6.09023077e-03, -9.98169743e-03, -6.23972639e-02,
         1.07372375e-02, -6.04898110e-03, -7.14660957e-0

In [17]:
X = filtered_embeddings
y = df_non_legacy['target_label']

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split into test and train
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
clf = LogisticRegression(max_iter = 1000)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

                     precision    recall  f1-score   support

     Critical Error       0.90      1.00      0.95        45
              Error       0.96      0.91      0.93        54
        HTTP Status       1.00      1.00      1.00       295
     Resource Usage       1.00      1.00      1.00        56
     Security Alert       0.99      0.99      0.99       112
System Notification       1.00      1.00      1.00       111
        User Action       1.00      1.00      1.00        48
     Workflow Error       0.00      0.00      0.00         1

           accuracy                           0.99       723
          macro avg       0.76      0.77      0.76       723
       weighted avg       0.99      0.99      0.99       723



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
clf.score(X_test,y_test)

0.9889349930843707

In [24]:
import joblib

joblib.dump(clf,'../model/log_classfier.joblib')

['../model/log_classfier.joblib']