In [4]:
import pandas as pd

# Load the synthetic log dataset and discard the `complexity` column in a
# single chained expression, so re-running the cell always starts from the file.
df = pd.read_csv('dataset/synthetic_logs (1).csv').drop(columns=['complexity'])

In [5]:
# Display the loaded DataFrame (pandas renders a truncated view of the rows).
df

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status
...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error


In [6]:
# List the distinct values found in the `source` column.
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [7]:
# List the distinct values found in the `target_label` column.
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

The user wants to cluster log messages from a DataFrame named `df` using the DBSCAN algorithm. To achieve this, I will:
1.  Install and import the `sentence-transformers` library to generate vector embeddings for the text.
2.  Use a pre-trained model (e.g., `'all-MiniLM-L6-v2'`) to transform the `log_message` column into embeddings.
3.  Apply the `DBSCAN` algorithm from `scikit-learn` to cluster these embeddings.
4.  Store the resulting cluster labels back into the DataFrame.



In [8]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.9.1-cp312-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-1.2.3-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Using cached hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (4.9 kB)
Collecting typer-slim (from huggingface-hub>=0.20.0->sentence-transformers)
  Using cached typer_slim-0.21.0-py3-none-any.whl.metadata (16 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sentence-transformers)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting huggingface-hub>=0.20.0 (f

In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np

# Load the pre-trained sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message into a vector; a progress bar is shown while
# the batches are processed.
embeddings = model.encode(
    df.log_message.to_list(),
    show_progress_bar=True,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/76 [00:00<?, ?it/s]

In [10]:
# Cluster the embeddings with DBSCAN using cosine distance.
# NOTE: eps and min_samples depend on how dense the data is and may need tuning.
dbscan = DBSCAN(metric='cosine', min_samples=5, eps=0.5).fit(embeddings)
clusters = dbscan.labels_

# Attach each row's cluster id to the dataframe
df['cluster'] = clusters

# Show how many rows landed in each cluster (-1 is the noise label)
print(df.cluster.value_counts())

cluster
 0    1194
 1     902
 3     197
 4      58
 2      53
-1       6
Name: count, dtype: int64


In [11]:
# Preview the first rows, which now carry the `cluster` column.
df.head()


Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,1
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [12]:
# Inspect the first two embedding vectors.
embeddings[:2]

array([[-1.02939673e-01,  3.35459784e-02, -2.20260806e-02,
         1.55102601e-03, -9.86915827e-03, -1.78956300e-01,
        -6.34410828e-02, -6.01761751e-02,  2.81108413e-02,
         5.99619783e-02, -1.72618497e-02,  1.43372838e-03,
        -1.49560034e-01,  3.15285241e-03, -5.66031151e-02,
         2.71685794e-02, -1.49890231e-02, -3.54038030e-02,
        -3.62936929e-02, -1.45410579e-02, -5.61500154e-03,
         8.75538066e-02,  4.55120578e-02,  2.50964463e-02,
         1.00187277e-02,  1.24266697e-02, -1.39923573e-01,
         7.68695846e-02,  3.14095095e-02, -4.15249076e-03,
         4.36902270e-02,  1.71250384e-02, -8.00950825e-02,
         5.74005730e-02,  1.89091824e-02,  8.55261683e-02,
         3.96399498e-02, -1.34371817e-01, -1.44371425e-03,
         3.06706317e-03,  1.76854134e-01,  4.44883108e-03,
        -1.69274695e-02,  2.24266555e-02, -4.35050204e-02,
         6.09027082e-03, -9.98169463e-03, -6.23972826e-02,
         1.07371965e-02, -6.04894804e-03, -7.14660361e-0

In [13]:
# Show the rows assigned to cluster 1 to judge how coherent the cluster is.
df[df.cluster == 1]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,1
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,1
11,6/15/2025 11:44,ModernHR,Critical system unit error: unit ID Component55,Critical Error,1
...,...,...,...,...,...
2398,3/31/2025 4:11,ModernHR,Potential vulnerability exploit detected from ...,Security Alert,1
2402,3/13/2025 9:44,BillingSystem,Replication error occurred for shard 10,Error,1
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,1
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1


Cluster 1 is the one I inspected here, but its messages do not look like a single coherent group. So let's change the `eps` and `min_samples` values to tighten the cluster boundaries and see whether we get somewhat better clustering results.

In [14]:
# Re-cluster with a much tighter neighbourhood radius (eps=0.2).
# With min_samples=1 every point counts as a core point, so no row is
# labelled as noise (-1); isolated messages become single-row clusters.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
clusters = dbscan.fit(embeddings).labels_

# Overwrite the previous cluster assignment and show the new distribution.
df['cluster'] = clusters
print(df.cluster.value_counts())

cluster
0      1017
5       147
11      100
13       86
7        60
       ... 
102       1
103       1
105       1
106       1
135       1
Name: count, Length: 136, dtype: int64


In [17]:
# Re-inspect cluster 1 after re-clustering with the tighter parameters.
df[df.cluster == 1]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,1


In [18]:
# Now the messages in this cluster are actually similar to each other - better than before!

I will now summarize the task: I'll identify the clusters with more than 10 records, sort them by size in descending order, and then print 5 sample log messages for each of these significant clusters.



<llm-snippet-file>training.ipynb</llm-snippet-file>


In [19]:
# Count rows per cluster (value_counts sorts by size, largest first) and
# keep only the clusters holding more than 10 records.
cluster_counts = df['cluster'].value_counts()
is_large = cluster_counts > 10
large_clusters = cluster_counts[is_large].index

# For each large cluster, in descending size order, print a header followed
# by its first 5 log messages.
for cluster_id, size in cluster_counts[is_large].items():
    print(f"--- Cluster {cluster_id} (Size: {size}) ---")
    samples = df.loc[df['cluster'] == cluster_id, 'log_message'].head(5)
    for msg in samples:
        print(f"- {msg}")
    print("\n")


--- Cluster 0 (Size: 1017) ---
- nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
- nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
- nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
- nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

Now I can see patterns in some of the clusters: the messages share a fixed template and differ only in their dynamic values. So I can write regular expressions to classify them.

Based on the samples observed in the clusters, here is a function that uses regular expressions to classify those log messages.



In [21]:
import re
# Ordered (compiled pattern, label) rules; the first rule that matches wins.
# Patterns are compiled once, case-insensitively, instead of on every call.
# Sentence-ending periods are escaped so they match a literal "." only.
_REGEX_RULES = [
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
]


def classify_with_regex(log_message):
    """Label a log message using a fixed set of regular-expression rules.

    Args:
        log_message: The raw log text. Anything that is not a ``str``
            (for example ``None`` or a NaN from a missing value) is treated
            as unclassifiable instead of raising ``TypeError``.

    Returns:
        The label of the first rule whose pattern is found anywhere in the
        message (case-insensitive), or ``None`` when no rule matches.
    """
    # Missing values reach this function as None/NaN when it is applied to a
    # pandas column; the regex engine only accepts strings.
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

In [22]:
# Spot check: the mixed-case "USer" still matches because matching ignores case.
classify_with_regex("USer User123 logged in.")

'User Action'

In [23]:
# Spot check: a message that fits no pattern returns None, so the cell shows no output.
classify_with_regex("Hey you, chill bro")

In [26]:
# Run the regex classifier over every log message; rows that match no
# pattern get a missing value in `regex_label`.
df['regex_label'] = df['log_message'].map(classify_with_regex)

# Show the rows the regex rules could not label.
df.loc[df['regex_label'].isna()]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,
...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1,


In [27]:
# Show the rows that a regex rule did label.
df[df.regex_label.notna()]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,11,User Action
...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,13,System Notification


In [28]:
# Take an independent copy of the rows the regex rules left unlabelled,
# then report its (rows, columns) shape.
unlabelled = df['regex_label'].isna()
df_non_regex = df.loc[unlabelled].copy()
df_non_regex.shape

(1910, 6)

## So, as we just saw, regex alone let us classify about 500 of the log messages
## For the remaining ones we will go with BERT or an LLM