# Import Libraries

In [2]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from sklearn.preprocessing import StandardScaler

# Load and Explore Data

In [3]:
# Load the synthetic log dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="DATASET/synthetic_logs.csv")
# Preview the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [4]:
# Number of log rows per originating system.
df["source"].value_counts()

source
ThirdPartyAPI      496
ModernHR           492
BillingSystem      479
AnalyticsEngine    471
ModernCRM          465
LegacyCRM            7
Name: count, dtype: int64

In [5]:
# Number of log rows per target label, to see how balanced the classes are.
df["target_label"].value_counts()

target_label
HTTP Status            1017
Security Alert          371
System Notification     356
Error                   177
Resource Usage          177
Critical Error          161
User Action             144
Workflow Error            4
Name: count, dtype: int64

# Clustering

In [6]:
# Load the pretrained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message into a vector; row order follows df.
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [10]:
# Sanity check: one row per log message, one column per embedding dimension.
print("Embeddings shape:", embeddings.shape)
print("Each vector length:", embeddings.shape[1])

Embeddings shape: (2410, 384)
Each vector length: 384


In [12]:
# Peek at the first five embedding vectors.
embeddings[0:5]

array([[-0.10293962,  0.03354594, -0.02202607, ...,  0.00457793,
        -0.04259717,  0.00322621],
       [ 0.00804572, -0.03573923,  0.04938739, ...,  0.01538319,
        -0.06230947, -0.02774666],
       [-0.00908224,  0.13003924, -0.05275568, ...,  0.02014104,
        -0.05117098, -0.02930294],
       [-0.09751046,  0.04911299, -0.03977424, ...,  0.02477502,
        -0.03546079, -0.00018598],
       [-0.10468338,  0.05926038, -0.02488499, ...,  0.02502055,
        -0.037193  , -0.0256891 ]], shape=(5, 384), dtype=float32)

In [None]:
# Group log messages whose embeddings lie close together in cosine distance.
# eps=0.2 is the neighbourhood radius (in cosine distance). min_samples=1 makes
# every point a core point, so with this setting DBSCAN produces no noise points.
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
df['cluster'] = clustering.labels_

# DBSCAN marks noise points with the label -1, which is not a cluster. Exclude it
# so the count stays correct if min_samples is later raised above 1.
n_clusters = len(set(clustering.labels_) - {-1})
print("Number of clusters:", n_clusters)

Number of clusters: 136
