In [1]:
import pandas as pd


In [2]:
# Load the log dataset from CSV (the path is relative to the working directory).
# Later cells read its 'log_message', 'source' and 'target_label' columns.
df=pd.read_csv('dataset/synthetic_logs.csv')

In [3]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model by checkpoint name; `model.encode` is
# used below to turn log messages into vectors.
# NOTE(review): presumably fetches the weights on first use — confirm for offline runs.
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Encode every log message, in DataFrame row order, into an embedding vector.
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages)

In [5]:
# Add embeddings as a new column: `.tolist()` stores one plain Python list per row
# (presumably `embeddings` is a 2-D array with one row per message — confirm).
df['embeddings'] = embeddings.tolist()


In [6]:
from sklearn.cluster import DBSCAN
# Group messages by the cosine distance between their embeddings (eps=0.2 is the
# neighbourhood radius).
# NOTE(review): with min_samples=1 every point should count as a core point, so no
# message is expected to receive the noise label -1 — confirm this is intended.
dbscan=DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters=dbscan.fit_predict(embeddings)
# Keep each message's cluster id next to it for inspection.
df['clusters']=clusters

In [7]:
import re

# Ordered (compiled pattern, label) rules. Built once when the cell runs instead of
# on every call; the first rule that matches wins, so order matters.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        # The original `[User\d]+` was a character class (any mix of the letters
        # U/s/e/r and digits). The intent is an optional "User" prefix followed by
        # a numeric id, e.g. "User User123 logged in." or "User 123 logged out.".
        (r"User (?:User)?\d+ logged (in|out).*", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully.*", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"FILE .* uploaded successfully by user .*", "System Notification"),
        (r"DISK cleanup completed successfully.*", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"ACCOUNT with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using a fixed set of case-insensitive regex rules.

    Args:
        log_message: The raw log line. Anything that is not a ``str`` (for
            example ``None`` or a NaN read from an empty CSV cell) is treated
            as unclassifiable instead of raising ``TypeError``.

    Returns:
        ``"User Action"`` or ``"System Notification"`` for the first rule whose
        pattern is found anywhere in the message, or ``None`` when no rule
        matches.
    """
    if not isinstance(log_message, str):
        return None

    for compiled_pattern, label in _REGEX_RULES:
        # `search` (not `match`): a rule may hit anywhere inside the message.
        if compiled_pattern.search(log_message):
            return label

    return None

In [8]:
# Label each message with the regex rules; rows that no rule matches get None.
df['regex-label']=df['log_message'].apply(classify_with_regex)

In [9]:
# Rows the regex rules could not label (their 'regex-label' is missing), taken as
# an independent copy of the matching slice of `df`.
df_non_regex = df.loc[df['regex-label'].isna()].copy()

In [10]:
# Unlabeled rows whose source is LegacyCRM; displayed as the cell output.
df_legacy = df_non_regex.loc[df_non_regex['source'].eq("LegacyCRM")]
df_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,embeddings,clusters,regex-label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,llm,"[-0.04226111248135567, 0.04626970738172531, -0...",24,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,"[-0.021058598533272743, -0.011281420476734638,...",48,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,llm,"[-0.08893539756536484, -0.00770153570920229, -...",62,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,llm,"[-0.02439083717763424, 0.01563330739736557, -0...",105,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,"[-0.038602638989686966, -0.06171542778611183, ...",118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,llm,"[-0.0886654257774353, 0.0008167490595951676, -...",122,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,llm,"[-0.0633983165025711, -0.02702908031642437, -0...",133,


In [11]:
# Unlabeled rows from every source other than LegacyCRM; displayed as the cell output.
df_non_legacy = df_non_regex.loc[df_non_regex['source'].ne("LegacyCRM")]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,embeddings,clusters,regex-label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,"[-0.10293962061405182, 0.03354594111442566, -0...",0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,"[0.008045717142522335, -0.0357392281293869, 0....",1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,"[-0.009082237258553505, 0.130039244890213, -0....",2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,"[-0.0975104570388794, 0.04911298677325249, -0....",0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,"[-0.10468337684869766, 0.059260375797748566, -...",0,
...,...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,"[-0.1001005470752716, 0.05426649749279022, -0....",0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,"[0.07716462016105652, -0.013951756991446018, -...",7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,"[-0.04022269695997238, 0.04224354401230812, -0...",0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,"[-0.03603454679250717, 0.019608931615948677, 0...",1,


In [12]:
# Embed only the non-legacy messages that the regex rules left unlabeled.
# NOTE(review): these rows already carry vectors in df_non_legacy['embeddings'];
# reusing them would avoid a second encoding pass.
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())

In [13]:
# Features: one embedding per remaining message. Target: that row's `target_label` value.
X = embeddings_filtered
y = df_non_legacy['target_label'].values

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the class supports in the report
# differ widely — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# NOTE(review): max_iter=1000 presumably gives the solver room to converge — confirm.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [15]:
import os

import joblib

# Create the target directory first: the dump below fails if ../models does not
# exist, which is the case on a fresh checkout.
os.makedirs('../models', exist_ok=True)
# Persist the trained classifier; the returned list of written paths is the cell output.
joblib.dump(clf, '../models/log_classifier.joblib')


['../models/log_classifier.joblib']