In [58]:
import pandas as pd

df = pd.read_csv("dataset/synthetic_logs.csv")
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [59]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [60]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [61]:
df[df.target_label=='System Notification'].sample(10)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
1121,10/9/2025 20:33,AnalyticsEngine,System updated to version 3.0.7.,System Notification,regex
618,9/2/2025 18:41,BillingSystem,Backup ended at 2025-07-31 13:19:58.,System Notification,regex
2088,9/25/2025 3:50,BillingSystem,Backup ended at 2025-04-27 00:46:45.,System Notification,regex
1651,6/2/2025 23:20,ModernCRM,Backup completed successfully.,System Notification,regex
2126,5/13/2025 5:04,ThirdPartyAPI,File data_2777.csv uploaded successfully by us...,System Notification,regex
1624,12/14/2025 5:14,AnalyticsEngine,System reboot initiated by user User268.,System Notification,regex
191,5/6/2025 23:58,ThirdPartyAPI,Backup started at 2025-12-09 10:19:11.,System Notification,regex
1419,1/14/2025 3:37,AnalyticsEngine,System updated to version 2.1.3.,System Notification,regex
1099,12/8/2025 10:48,BillingSystem,Backup ended at 2025-07-31 09:29:01.,System Notification,regex
634,1/11/2025 12:26,ModernCRM,Backup started at 2025-10-31 23:48:54.,System Notification,regex


In [62]:
df[df.log_message.str.startswith("System reboot initiated by user")]

Unnamed: 0,timestamp,source,log_message,target_label,complexity
36,11/19/2025 13:14,BillingSystem,System reboot initiated by user User243.,System Notification,regex
92,12/4/2025 21:20,BillingSystem,System reboot initiated by user User471.,System Notification,regex
139,5/8/2025 16:34,ModernHR,System reboot initiated by user User216.,System Notification,regex
140,9/11/2025 8:49,AnalyticsEngine,System reboot initiated by user User639.,System Notification,regex
161,3/31/2025 19:40,BillingSystem,System reboot initiated by user User819.,System Notification,regex
163,6/6/2025 15:29,BillingSystem,System reboot initiated by user User938.,System Notification,regex
307,4/12/2025 0:41,BillingSystem,System reboot initiated by user User929.,System Notification,regex
365,10/20/2025 22:32,ModernHR,System reboot initiated by user User533.,System Notification,regex
508,4/15/2025 2:04,ThirdPartyAPI,System reboot initiated by user User591.,System Notification,regex
552,9/22/2025 20:54,ModernHR,System reboot initiated by user User421.,System Notification,regex


In [63]:
###Clustering

In [64]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

In [65]:
from sentence_transformers import SentenceTransformer

# Load the newer and better model
model = SentenceTransformer('intfloat/e5-small-v2')

# IMPORTANT: E5 expects special formatting for inputs (prefix with 'passage:')
log_messages = ["passage: " + str(msg) for msg in df['log_message'].tolist()]

# Get dense vector embeddings
embeddings = model.encode(log_messages, convert_to_numpy=True)


In [66]:
embeddings[:5]


array([[-0.07121054, -0.00226594,  0.01295996, ...,  0.05175097,
        -0.04592197,  0.00479105],
       [-0.08592297,  0.0431099 ,  0.01785462, ...,  0.02057916,
        -0.02006268,  0.05841121],
       [-0.05587291,  0.0297737 , -0.02856353, ...,  0.02190062,
        -0.01744094,  0.04758003],
       [-0.06956097,  0.00144228,  0.00954679, ...,  0.04422173,
        -0.05030501,  0.00933755],
       [-0.07640073, -0.01084113,  0.01425744, ...,  0.0538137 ,
        -0.03154893,  0.00687141]], dtype=float32)

In [67]:
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
df['cluster'] = clustering.labels_


In [68]:
df.head()


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,0
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,0
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [69]:
#Group by cluster to inspect patterns
clusters=df.groupby('cluster')['log_message'].apply(list)
sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False)

In [70]:
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    if len(messages)>10:
        print(f"Cluster{cluster_id}:")
        for msg in messages[:5]:
            print(f"  {msg}")

Clustered Patterns:
Cluster0:
  nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
  Email service experiencing issues with sending
  Unauthorized access to data was attempted
  nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
  nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791


In [71]:
###Classificstion stage 1: Regex

In [72]:
import re
def classify_with_regex(log_message):
    regex_patterns = { 
        r"User User\d+ logged (in | out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message):
            return label
    return None
    

In [73]:
classify_with_regex("User User123 logged in.")

In [74]:
classify_with_regex("System reboot initiated by user User179.")

'System Notification'

In [75]:
classify_with_regex("Hey you, chill bro")

In [76]:
# Apply regex classification
df['regex_label'] = df['log_message'].apply(lambda x: classify_with_regex(x))
df[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,0,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,0,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,0,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,0,User Action
30,4/26/2025 7:54,AnalyticsEngine,Backup started at 2025-05-14 07:06:55.,System Notification,regex,0,System Notification
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,0,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,0,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,0,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,0,System Notification


In [77]:
df[df['regex_label'].isnull()].head(5)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,0,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,0,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,


In [78]:
###Classification stage 2: Classification using embedings

In [79]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(2010, 7)

In [81]:
df_legacy = df_non_regex[df_non_regex.source == "LegacyCRM"]
df_legacy.head()


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,llm,0,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,0,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,llm,0,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,llm,0,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,0,


In [None]:
df_non_legacy = df_non_regex[df_non_regex.source!="LegacyCRM"]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,0,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,0,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,0,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,0,


In [82]:
df_non_legacy.shape

(2003, 7)

In [83]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the E5-small-v2 model
model = SentenceTransformer('intfloat/e5-small-v2')


In [84]:
# Filter logs that are not from "LegacyCRM"
df_non_legacy = df_non_regex[df_non_regex.source != "LegacyCRM"].copy()

# ✅ Prefix with "passage:" as required by E5 model
log_messages = ["passage: " + str(msg) for msg in df_non_legacy['log_message'].tolist()]


In [85]:
# Get dense vector embeddings
embeddings_filtered = model.encode(log_messages, convert_to_numpy=True)

# Prepare features and labels
X = embeddings_filtered
y = df_non_legacy['target_label'].values

In [86]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [87]:
# Predict and evaluate
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.90      1.00      0.95        46
         Error       0.98      0.87      0.92        47
   HTTP Status       1.00      1.00      1.00       316
Resource Usage       1.00      1.00      1.00        46
Security Alert       0.99      0.99      0.99       114
   User Action       1.00      1.00      1.00        32

      accuracy                           0.99       601
     macro avg       0.98      0.98      0.98       601
  weighted avg       0.99      0.99      0.99       601



In [88]:
import joblib
joblib.dump(clf, '../models/log_classifier.joblib')

['../models/log_classifier.joblib']