# Import Libraries

In [2]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Load and Explore Data

In [4]:
# Load the labelled log dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# saved index — consider index_col=0 if nothing downstream relies on it.
df = pd.read_csv("DATASET/synthetic_logs.csv")
df.head()

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [5]:
# How many log lines each source system contributed.
df['source'].value_counts()

source
ThirdPartyAPI      496
ModernHR           492
BillingSystem      479
AnalyticsEngine    471
ModernCRM          465
LegacyCRM            7
Name: count, dtype: int64

In [6]:
# Class balance of the ground-truth labels.
df['target_label'].value_counts()

target_label
HTTP Status            1017
Security Alert          371
System Notification     356
Error                   177
Resource Usage          177
Critical Error          161
User Action             144
Workflow Error            4
Name: count, dtype: int64

# Clustering

In [7]:
# Turn every log message into a dense sentence embedding.
model = SentenceTransformer('all-MiniLM-L6-v2')
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages)

In [8]:
# Sanity check: the matrix is (number of encoded messages, embedding width).
print("Embeddings shape:", embeddings.shape)
print("Each vector length:", embeddings.shape[1])

Embeddings shape: (2410, 384)
Each vector length: 384


In [9]:
# Peek at the first five embedding vectors rather than dumping the whole matrix.
embeddings[:5]

array([[-0.10293962,  0.03354594, -0.02202607, ...,  0.00457793,
        -0.04259717,  0.00322621],
       [ 0.00804572, -0.03573923,  0.04938739, ...,  0.01538319,
        -0.06230947, -0.02774666],
       [-0.00908224,  0.13003924, -0.05275568, ...,  0.02014104,
        -0.05117098, -0.02930294],
       [-0.09751046,  0.04911299, -0.03977424, ...,  0.02477502,
        -0.03546079, -0.00018598],
       [-0.10468338,  0.05926038, -0.02488499, ...,  0.02502055,
        -0.037193  , -0.0256891 ]], shape=(5, 384), dtype=float32)

In [10]:
# Group similar messages by cosine distance between their embeddings.
# With min_samples=1 every point is its own core sample, so nothing is marked as noise.
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
df['cluster'] = clustering.labels_

# DBSCAN labels noise points -1; leave that label out so the count stays
# correct if min_samples is ever raised above 1.
n_clusters = len(set(clustering.labels_) - {-1})
print("Number of clusters:", n_clusters)

Number of clusters: 136


In [11]:
# Size of every cluster, largest first.
cluster_counts = df['cluster'].value_counts()

# Only clusters with more than 10 members are worth inspecting by eye.
big_clusters = cluster_counts.loc[cluster_counts.gt(10)].index

for cluster_id in big_clusters:
    in_cluster = df['cluster'].eq(cluster_id)
    # Show at most five sample messages per cluster.
    logs = df.loc[in_cluster, 'log_message'].head(5).tolist()
    print(f"\nCluster {cluster_id} (size: {cluster_counts[cluster_id]})")
    for log in logs:
        print("   ", log)


Cluster 0 (size: 1017)
    nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
    nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
    nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
    nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v

# 1. Classification with Regex

In [12]:
import re

# Ordered (compiled pattern, label) rules; the first pattern that matches wins.
# Built once at definition time instead of being recreated on every call.
# NOTE(review): the trailing "." in several patterns is unescaped, so it matches
# any single character rather than only a literal full stop. Kept as-is so the
# set of matched messages does not change.
_REGEX_RULES = [
    (re.compile(pattern), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out).", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
]


def classify_with_regex(log_message):
    """Return the label of the first regex rule found in ``log_message``.

    Parameters
    ----------
    log_message : str
        Raw log line. Patterns are searched anywhere in the text (``search``,
        not ``match``) and are case-sensitive.

    Returns
    -------
    str or None
        ``"User Action"`` or ``"System Notification"`` when a rule matches;
        ``None`` when no rule matches or when ``log_message`` is not a string
        (for example a NaN coming from a missing DataFrame cell).
    """
    # Missing values reach this function as float NaN / None when applied over a
    # DataFrame column; treat them as "unclassified" instead of raising TypeError.
    if not isinstance(log_message, str):
        return None

    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

### Test the function classify_with_regex

In [13]:
# A message that follows the "User ... logged in/out" pattern should be labelled.
classify_with_regex("User User123 logged in.")

'User Action'

In [14]:
# For an unseen message that matches no pattern, the function returns None
# (so the cell shows no output).
classify_with_regex("unseen log message")

### Add New Column and Classify with Regex

In [None]:
# Label every log line with the regex classifier; lines no pattern matches get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

### Logs Classified with regex

In [25]:
# Rows for which some regex pattern produced a label.
df_regex = df[df['regex_label'].notna()]

print(f"Logs classified with Regex: {len(df_regex)}")
df_regex.head()

Logs classified with Regex: 500


Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,11,User Action


In [31]:
# Accuracy for regex predictions: share of regex-labelled rows whose label
# agrees with the ground-truth target_label.

is_match = df_regex['target_label'].eq(df_regex['regex_label'])
re_correct_pred = int(is_match.sum())
regex_acc = re_correct_pred / len(df_regex)

print(f"Regex Accuracy: {regex_acc}")

Regex Accuracy: 1.0


### Logs Not classified with regex

In [33]:
# Rows the regex stage left unlabelled; these go on to the ML classifier.
df_non_regex = df[df['regex_label'].isna()]

print(f"Logs not classified with Regex: {len(df_non_regex)}")
df_non_regex.head()

Logs not classified with Regex: 1910


Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,


# 2. Classification Using ML

Training an ML classifier needs at least a few samples per class, so let's drop the rows whose `target_label` or `source` occurs only a handful of times.

In [37]:
# Label distribution among the rows the regex stage could not classify.
df_non_regex['target_label'].value_counts()

target_label
HTTP Status            1017
Security Alert          371
Error                   177
Resource Usage          177
Critical Error          161
Workflow Error            4
Name: count, dtype: int64

In [38]:
# Source distribution among the rows the regex stage could not classify.
df_non_regex['source'].value_counts()

source
ModernHR           402
ThirdPartyAPI      386
ModernCRM          373
AnalyticsEngine    371
BillingSystem      371
LegacyCRM            7
Name: count, dtype: int64

Safely filter out `Workflow Error`, `Deprecation Warning` and `LegacyCRM`.

In [48]:
# Drop the under-represented source and labels before training.
rare_labels = ['Workflow Error', 'Deprecation Warning']
keep_rows = (
    df_non_regex['source'].ne('LegacyCRM')
    & ~df_non_regex['target_label'].isin(rare_labels)
)
df_ml = df_non_regex[keep_rows]
df_ml.shape

(1903, 6)

In [49]:
# Reuse the SentenceTransformer already loaded in the Clustering section instead
# of loading the same 'all-MiniLM-L6-v2' checkpoint a second time.
embeddings2 = model.encode(df_ml['log_message'].tolist())

In [52]:
# Features are the sentence embeddings; targets are the ground-truth labels.
X, y = embeddings2, df_ml['target_label'].values

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [54]:
import os

import joblib

# Persist the trained classifier for reuse outside this notebook.
# Create the target directory first so the dump does not fail on a checkout
# where 'models/' does not exist yet.
MODEL_PATH = 'models/log_classifier.joblib'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(clf, MODEL_PATH)

['models/log_classifier.joblib']