In [1]:
import pandas as pd

In [2]:
# Load the labelled log dataset that every later cell works from.
df = pd.read_csv('dataset/synthetic_logs.csv')

In [3]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [4]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [5]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
# !pip install torch sentence-transformers scikit-learn




In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

# Sentence-embedding model used to turn each log message into a vector.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message in the dataset, then peek at the first two vectors.
all_log_messages = df['log_message'].tolist()
embeddings = model.encode(all_log_messages)
embeddings[:2]


  from .autonotebook import tqdm as notebook_tqdm


array([[-1.02939621e-01,  3.35459411e-02, -2.20260732e-02,
         1.55101740e-03, -9.86917876e-03, -1.78956270e-01,
        -6.34409785e-02, -6.01761639e-02,  2.81109158e-02,
         5.99620491e-02, -1.72618348e-02,  1.43363548e-03,
        -1.49560034e-01,  3.15287686e-03, -5.66030927e-02,
         2.71685235e-02, -1.49891041e-02, -3.54037657e-02,
        -3.62936445e-02, -1.45410765e-02, -5.61491773e-03,
         8.75539035e-02,  4.55120578e-02,  2.50963885e-02,
         1.00187510e-02,  1.24267349e-02, -1.39923573e-01,
         7.68696293e-02,  3.14095505e-02, -4.15247958e-03,
         4.36902344e-02,  1.71250012e-02, -8.00951198e-02,
         5.74006326e-02,  1.89091656e-02,  8.55262503e-02,
         3.96398641e-02, -1.34371817e-01, -1.44360063e-03,
         3.06704035e-03,  1.76854044e-01,  4.44885530e-03,
        -1.69274509e-02,  2.24266481e-02, -4.35049310e-02,
         6.09034160e-03, -9.98169929e-03, -6.23972900e-02,
         1.07372422e-02, -6.04895083e-03, -7.14660808e-0

In [8]:
# Cluster the message embeddings so similar log lines share a cluster id.
dbscan = DBSCAN(
    eps=0.35,
    min_samples=1,
    metric='euclidean',
)
clusters = dbscan.fit(embeddings).labels_

# Attach the cluster id of each row to the frame (adds/overwrites the 'cluster' column).
df['cluster'] = clusters

In [9]:
df.head(10)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0
5,2025-10-09 10:30:31,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,bert,0
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,bert,3
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5
9,2025-03-30 04:01:45,ModernHR,nova.osapi_compute.wsgi.server [req-2bf7cfee-a...,HTTP Status,bert,0


In [10]:
df[df.cluster==1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,bert,1
1901,9/22/2025 2:52,ThirdPartyAPI,Email service experienced a mail sending issue,Error,bert,1


In [11]:
# Clusters with more than 10 members are the ones worth inspecting for regex rules.
cluster_counts = df['cluster'].value_counts()
large_clusters = cluster_counts.loc[cluster_counts > 10].index

# Console preview: up to five messages per large cluster. This runs before the
# column-width option below is changed, so long messages may be shown truncated.
for cluster in large_clusters:
    messages = df.loc[df['cluster'] == cluster, 'log_message'].head(5).to_string(index=False)
    print(f"Cluster {cluster}:")
    print(messages, end="\n\n")

# Remove the column-width cap so the exported text holds complete messages.
# NOTE: this is a global pandas option and stays in effect for later cells.
pd.set_option('display.max_colwidth', None)

# Write the same per-cluster samples to clusters.txt.
with open("clusters.txt", "w", encoding="utf-8") as f:
    for cluster in large_clusters:
        messages = df.loc[df['cluster'] == cluster, 'log_message'].head(5).to_string(index=False)
        f.write(f"Cluster {cluster}:\n{messages}\n\n")
    


Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 16:
nova.metadata.wsgi.server [-] 10.11.21.138,10.1...
nova.metadata.wsgi.server [req-61196723-e034-48...
nova.metadata.wsgi.server [req-7d3eeb2d-3948-43...
nova.metadata.wsgi.server [-] 10.11.21.137,10.1...
nova.metadata.wsgi.server [req-e0d4ce94-e0cb-41...

Cluster 9:
Backup completed successfully.
Backup completed successfully.
Backup completed successfully.
Backup completed successfully.
Backup completed successfully.

Cluster 57:
Backup ended at 2025-08-08 13:06:23.
Backup ended at 2025-06-01 06:27:38.
Backup ended at 2025-11-16 08:17:13.
Backup ended at 2025-07-18 17:06:54.
Backup ended at 2025-08-18 14:29:26.

Cluster 48:
Disk cleanup completed successfully.
Disk cleanup completed successfully.
Disk cleanup c

In [22]:
import re

# Ordered (compiled pattern, label) rules. Order matters: the first rule whose
# pattern is found anywhere in the message decides the label. All patterns are
# compiled once, case-insensitively, instead of being rebuilt on every call.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"nova.osapi_compute.wsgi.server \[req-([a-f0-9\- \].\"]+)GET \/v2\/54fadb412c4e40cdbaed9335e4c35a9e\/servers\/detail HTTP\/1.1\" ([a-z:0-9\- \].\" A-Z]*)", "HTTP Status"),
        (r"nova.metadata.wsgi.server \[([-a-z\] 0-9.,]+)\"GET \/openstack\/201([0-9])-([0-9]+)-([0-9]+)([a-z_. \/]+)HTTP\/1.1\" ([A-Za-z 0-9:.]*)", "HTTP Status"),
        (r"Backup completed successfully\.?", "System Notification"),
        (r"Backup ended at 2025-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?", "System Notification"),
        (r"Disk cleanup completed successfully\.?", "System Notification"),
        (r"User User([0-9]+) logged out\.?", "User Action"),
        (r"File data_([0-9][0-9][0-9][0-9]).csv uploaded successfully by user User([0-9][0-9][0-9])\.?", "System Notification"),
        # Trailing class previously used the range `A-z` (a typo for `A-Z`); the
        # group is optional and trailing, so the set of matched messages is unchanged.
        (r"nova.osapi_compute.wsgi.server \[req-([a-zA-Z: 0-9.-]*)] 10.11.10.1 \"POST \/v2\/e9746973ac574cf56b8a9e88576a7608\/os-server-external-events HTTP\/1.1\"([a-zA-Z0-9 -:]*)", "HTTP Status"),
        # Generic claims rule: listed before the specific claims rules below, so it wins when both match.
        (r"nova.compute.claims \[req-([a-zA-Z: 0-9.-]*)] \[instance: ([a-zA-Z: 0-9.-]*)] ([a-zA-Z: 0-9.-]*), ([a-zA-Z: 0-9.-]*)", "Resource Usage"),
        # Now a raw string: the original plain string contained the invalid escape `\.`.
        (r"System reboot initiated by user User([0-9][0-9][0-9])\.?", "System Notification"),
        (r"Backup started at 2025-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.?", "System Notification"),
        (r"nova.metadata.wsgi.server \[([-a-z0-9 ]*)] 10.11.21.1([0-9,.]*) \"GET \/latest\/meta-data\/block-device-mapping\/([ a-z]*)HTTP\/1.1\" ([sStatus codeHTTP:-]*)200 len: 124 time: 0.([0-9]*)", "HTTP Status"),
        (r"nova.metadata.wsgi.server \[([-a-z0-9 ]*)] 10.11.21.1([0-9,.]*) \"GET \/latest\/meta-data\/([placement \/]*)HTTP\/1.1\" ([ status HTTP cdoeRCODE:-]*)200 len: ([-a-z0-9 ]*): 0.([0-9]*)", "HTTP Status"),
        (r"nova.compute.claims \[req-([0-9-a-f]*) ([0-9a-f]*) ([0-9a-f]*) - - -\] \[instance: ([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)\] Attempting claim: memory 2048 MB, disk 20 GB, vcpus 1 CPU", "Resource Usage"),
        (r"nova.compute.claims \[req-([0-9-a-f]*) ([0-9a-f]*) ([0-9a-f]*) - - -\] \[instance: ([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)\] Total disk: 15 GB, used: 0.00 GB", "Resource Usage"),
        # The original pattern was wrapped in JavaScript-style `/.../gm` delimiters,
        # which Python treats as literal characters, so the rule could never match.
        (r"nova.compute.claims \[req-([0-9-a-f]*) ([0-9a-f]*) ([0-9a-f]*) - - -\] \[instance: ([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)\] Total vcpu: 16 VCPU, used: 0.00 VCPU", "Resource Usage"),
        (r"nova.compute.claims \[req-([0-9-a-f]*) ([0-9a-f]*) ([0-9a-f]*) - - -\] \[instance: ([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)\] vcpu limit not specified, defaulting to unlimited", "Resource Usage"),
        (r"nova.compute.claims \[req-([0-9-a-f]*) ([0-9a-f]*) ([0-9a-f]*) - - -\] \[instance: ([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)-([0-9a-f]*)\] disk limit not specified, defaulting to unlimited", "Resource Usage"),
        (r"([A-Za-z ]*)health check([A-Za-z ]*)\.?", "Error"),
        (r"nova.metadata.wsgi.server \[([a-z0-9 -]*)\] 10.11.21.1([0-9,.]*) \"GET \/openstack\/2013-10-17\/user_data HTTP\/1.1\" ([RCODE status:codeHTTP-]*)404 len: 176 time: 0.([0-9]*)", "HTTP Status"),
        (r"nova.compute.resource_tracker \[req-([0-9-a-f]*) - - - - -\] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=[0-9]*MB phys_disk=15GB used_disk=[0-9]*GB total_vcpus=16 used_vcpus=[0-9] pci_stats=\[\]", "Resource Usage"),
        (r"nova.compute.resource_tracker \[req-([0-9-a-f]*) - - - - -\] Total usable vcpus: 16, total allocated vcpus: [0-9]", "Resource Usage"),
        (r"Account with ID .* created by.*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using the hand-written regex rules.

    The rules are tried in order and the label of the first pattern found
    anywhere in the message (case-insensitive search) is returned.

    Args:
        log_message: The raw log line. Non-string values (for example None or
            a NaN from a pandas column) are treated as unclassifiable.

    Returns:
        The matching label string, or None when no rule matches or the input
        is not a string.
    """
    # re.search raises TypeError on non-strings; report "no label" instead so a
    # column containing missing values can still be processed with .apply().
    if not isinstance(log_message, str):
        return None
    for compiled_pattern, label in _REGEX_RULES:
        if compiled_pattern.search(log_message):
            return label
    return None
    


In [23]:
classify_with_regex("Account with ID A0098234 created by Dhaval")

'User Action'

In [27]:
# Run the regex classifier over every message; rows no rule matches get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

# (rows, columns) of the subset that received a regex label.
df.loc[df['regex_label'].notna()].shape

(1466, 7)

In [28]:
# Rows the regex rules could not label; copied so it is independent of df.
df_non_regex = df.loc[df['regex_label'].isna()].copy()
df_non_regex.shape

(944, 7)

In [29]:
# Labels with at most 5 examples among the rows the regex rules did not cover.
unmatched_label_counts = df_non_regex['target_label'].value_counts()
print(unmatched_label_counts[unmatched_label_counts <= 5].index.tolist())



In [30]:
df

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0,HTTP Status
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,"nova.osapi_compute.wsgi.server [req-96c3ec98-21a0-4af2-84a8-d4989512413e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" Return code: 200 len: 1916 time: 0.2677610",HTTP Status,bert,0,HTTP Status
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed logins,Security Alert,bert,748,
2407,2025-08-03 03:07:47,ThirdPartyAPI,"nova.metadata.wsgi.server [req-b6d4a270-accb-4c3a-8179-9611e52e1768 - - - - -] 10.11.21.124,10.11.10.1 ""GET /openstack/2013-10-17 HTTP/1.1"" RCODE 200 len: 157 time: 0.2249990",HTTP Status,bert,16,HTTP Status
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,749,


In [32]:

# Drop LegacyCRM rows from the regex-unmatched set, then confirm which sources remain.
df_non_legacy = df_non_regex.loc[df_non_regex['source'] != "LegacyCRM"]
df_non_legacy['source'].unique()


array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'ThirdPartyAPI',
       'BillingSystem'], dtype=object)

In [33]:
# Embed only the non-legacy, regex-unmatched messages for classifier training.
# NOTE(review): this rebinds `embeddings`, replacing the full-dataset vectors
# computed earlier; re-running the clustering cell after this one would see
# these vectors instead.
non_legacy_messages = df_non_legacy['log_message'].tolist()
embeddings = model.encode(non_legacy_messages)
embeddings[:2]

array([[ 8.04571714e-03, -3.57392281e-02,  4.93873917e-02,
        -7.19184149e-03,  6.17644750e-04, -9.05685052e-02,
         5.75594865e-02, -4.39446270e-02,  1.74751636e-02,
         5.21203242e-02, -4.62332033e-02,  2.49039363e-02,
         4.80811782e-02,  6.86636567e-02,  3.56146656e-02,
         3.00089642e-02,  2.61449497e-02, -9.05241296e-02,
        -8.02173615e-02,  5.04851192e-02, -7.90870488e-02,
        -1.76855002e-03, -2.48927530e-02,  9.02152434e-02,
         5.83698181e-03, -6.91070855e-02, -6.67966083e-02,
        -6.99113635e-03, -1.04565986e-01, -2.66473852e-02,
         3.05640064e-02,  2.02934369e-02,  2.25454886e-02,
        -4.22687922e-03,  1.11173484e-02,  6.78260773e-02,
         1.06073590e-02,  7.56636113e-02, -8.22777823e-02,
         8.45735706e-03, -6.97612464e-02, -4.21974584e-02,
        -1.05674397e-02,  2.23673340e-02,  5.64496815e-02,
         8.16500373e-03,  1.09344080e-01,  3.16564441e-02,
        -1.76923554e-02, -3.21071148e-02, -3.39975841e-0

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
# Features are the message embeddings; targets are the human-assigned labels.
X = embeddings
y = df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the classifier and report per-class metrics on the held-out rows.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

     Critical Error       0.96      1.00      0.98        44
              Error       1.00      0.96      0.98        46
        HTTP Status       1.00      1.00      1.00        40
     Security Alert       1.00      1.00      1.00       114
System Notification       1.00      1.00      1.00        21
        User Action       1.00      1.00      1.00        17

           accuracy                           0.99       282
          macro avg       0.99      0.99      0.99       282
       weighted avg       0.99      0.99      0.99       282



In [37]:
import os

import joblib

# Persist the trained classifier. The original file name ended with a trailing
# space ('models/log_clf.joblib '), so code loading 'models/log_clf.joblib'
# would not find the file.
MODEL_PATH = 'models/log_clf.joblib'

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(clf, MODEL_PATH)

['models/log_clf.joblib ']