Data Load

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the synthetic log dataset (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="dataset/synthetic_logs.csv")
df.head(n=5)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [6]:
# Distinct class labels present in the dataset.
df["target_label"].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

Create Word Embeddings

In [7]:
# Distinct systems that produced the log lines.
df["source"].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [8]:
# To discover regex patterns in the log messages, we cluster their sentence embeddings with DBSCAN
from sentence_transformers import SentenceTransformer

# Pretrained sentence-transformer checkpoint; the same `model` object is reused
# later in the notebook to embed the filtered subset.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message into an embedding vector (one row per message).
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages)


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Wrap the full embedding matrix in a DataFrame with one "Dim_<i>" column per
# embedding dimension, then print only the first two rows as a preview.
n_dims = embeddings.shape[1]
dim_columns = [f"Dim_{i}" for i in range(n_dims)]
embeddings_df = pd.DataFrame(embeddings, columns=dim_columns)
print(embeddings_df.head(2))

      Dim_0     Dim_1     Dim_2     Dim_3     Dim_4     Dim_5     Dim_6  \
0 -0.102940  0.033546 -0.022026  0.001551 -0.009869 -0.178956 -0.063441   
1  0.008046 -0.035739  0.049387 -0.007192  0.000618 -0.090568  0.057559   

      Dim_7     Dim_8     Dim_9  ...   Dim_374   Dim_375   Dim_376   Dim_377  \
0 -0.060176  0.028111  0.059962  ... -0.074033  0.018422 -0.008649 -0.088356   
1 -0.043945  0.017475  0.052120  ...  0.014534 -0.046097  0.050255 -0.072703   

    Dim_378   Dim_379   Dim_380   Dim_381   Dim_382   Dim_383  
0 -0.059198 -0.000101  0.112914  0.004578 -0.042597  0.003226  
1  0.029278  0.056266 -0.038635  0.015383 -0.062309 -0.027747  

[2 rows x 384 columns]


Perform DBSCAN Clustering

In [10]:

from sklearn.cluster import DBSCAN 
# Cluster the message embeddings with DBSCAN on cosine distance.
# A small eps keeps each cluster tight (only near-identical messages are grouped);
# with min_samples=1 every point counts as a core sample, so no row gets the noise label.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
clusters = dbscan.fit_predict(embeddings)

# Attach the cluster id of each message to the main DataFrame (adds a column to df in place).
df['cluster'] = clusters
df.head(n=2)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1


In [11]:
# Inspect one cluster by eye: the goal is to spot message shapes that a regex could capture.
df.loc[df['cluster'] == 1].head(10)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,bert,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,bert,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,bert,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,bert,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,bert,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,bert,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,bert,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,bert,1


In [12]:
# Print up to five sample messages from every cluster with more than 10 members;
# clusters are visited in descending size order (the order value_counts returns).
cluster_counts = df['cluster'].value_counts()
larger_clusters = cluster_counts.loc[cluster_counts > 10].index

for cluster in larger_clusters:
    sample_messages = df.loc[df['cluster'] == cluster, 'log_message'].head(5)
    print(f"Cluster {cluster}: ")
    print(sample_messages.to_string(index=False))
    print()

Cluster 0: 
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 5: 
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...

Cluster 11: 
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 13: 
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

Cluster 7: 
Multiple bad login attempts detected on user 85...
Multiple login failures occurred on user 9052 a...
  

In [13]:
# Now from these clusters we can make regex
# Library for Regular Expressions
import re
# (pattern, label) rules, compiled once when this cell runs instead of on every call.
# Rules are tried in order and the first match wins. Sentence-ending periods are
# escaped (\.) so they match a literal "." rather than any character.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using hand-written regex rules.

    Parameters
    ----------
    log_message : str
        Raw log line. Matching is case-insensitive and uses ``search``, so a
        rule may match anywhere in the message (leading whitespace is fine).

    Returns
    -------
    str or None
        The label of the first rule that matches, or ``None`` when no rule
        matches or when ``log_message`` is not a string (e.g. NaN/None from a
        DataFrame column with missing values).
    """
    # Missing values reach this function as float NaN / None when it is applied
    # to a DataFrame column; treat them as "not classified" instead of raising.
    if not isinstance(log_message, str):
        return None

    for compiled_pattern, label in _REGEX_RULES:
        if compiled_pattern.search(log_message):
            return label

    return None

In [14]:
# Run the regex classifier over every message; rows no rule matches get a null label.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

# Shape of the subset that the regex rules managed to label.
regex_labelled = df.loc[df['regex_label'].notna()]
print(regex_labelled.shape)

(500, 7)


In [15]:
# Rows the regex rules could not classify are kept separately in df_non_regex
# (an independent copy, so later edits do not touch df).
regex_missed = df['regex_label'].isna()
df_non_regex = df.loc[regex_missed].copy()
print(df_non_regex.shape)

(1910, 7)


In [16]:
# List the target labels that have at most 5 samples among the non-regex rows.
# These labels have too few samples to train on, so an LLM can be used for them.
# From inspecting the data: Workflow Error and Deprecation Warning come only from the LegacyCRM source.
# Samples from the non-legacy sources are embedded with BERT again further down.
label_counts = df_non_regex['target_label'].value_counts()
rare_labels = label_counts[label_counts <= 5].index.to_list()
print(rare_labels)



In [17]:
# Keep only rows that do not come from LegacyCRM; this subset is what gets
# embedded with BERT and fed to logistic regression below.
is_not_legacy = df_non_regex['source'] != "LegacyCRM"
df_non_legacy = df_non_regex[is_not_legacy]
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [18]:
# Embed the log messages of the non-legacy subset with the same sentence-transformer model.
non_legacy_messages = df_non_legacy['log_message'].tolist()
filtered_embeddings = model.encode(non_legacy_messages)
print(filtered_embeddings)

[[-0.10293964  0.03354589 -0.02202601 ...  0.00457782 -0.04259718
   0.00322626]
 [ 0.00804574 -0.0357392   0.04938737 ...  0.01538317 -0.06230947
  -0.02774666]
 [-0.00908221  0.13003926 -0.0527557  ...  0.02014106 -0.05117102
  -0.02930291]
 ...
 [-0.04022278  0.0422435  -0.06610423 ...  0.02363656 -0.0053088
   0.0204446 ]
 [-0.03603455  0.01960893  0.10052752 ...  0.03668107 -0.02487844
  -0.00578846]
 [ 0.01457425  0.04911837 -0.00301353 ...  0.01029741 -0.00068495
   0.00708861]]


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features are the sentence embeddings; the target is the label column of the same rows.
X, y = filtered_embeddings, df_non_legacy["target_label"]

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the classifier (max_iter is raised to 1000) and report per-class metrics on the held-out rows.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [23]:
import joblib
# Persist the trained classifier to disk.
# NOTE(review): assumes the ../models directory already exists — confirm before a fresh run.
joblib.dump(value=clf, filename="../models/log-classifier.joblib")

['../models/log-classifier.joblib']