In [6]:
from webbrowser import open_new

import pandas as pd
from sklearn.linear_model import LogisticRegression
from transformers.agents.default_tools import custom_print

df = pd.read_csv("dataset/synthetic_logs.csv")
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [7]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [8]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [9]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import numpy as np

# Load pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for log messages
embeddings = model.encode(df['log_message'].values, convert_to_tensor=True)

embeddings[:2]

tensor([[-1.0294e-01,  3.3546e-02, -2.2026e-02,  1.5510e-03, -9.8692e-03,
         -1.7896e-01, -6.3441e-02, -6.0176e-02,  2.8111e-02,  5.9962e-02,
         -1.7262e-02,  1.4336e-03, -1.4956e-01,  3.1529e-03, -5.6603e-02,
          2.7169e-02, -1.4989e-02, -3.5404e-02, -3.6294e-02, -1.4541e-02,
         -5.6149e-03,  8.7554e-02,  4.5512e-02,  2.5096e-02,  1.0019e-02,
          1.2427e-02, -1.3992e-01,  7.6870e-02,  3.1410e-02, -4.1525e-03,
          4.3690e-02,  1.7125e-02, -8.0095e-02,  5.7401e-02,  1.8909e-02,
          8.5526e-02,  3.9640e-02, -1.3437e-01, -1.4436e-03,  3.0670e-03,
          1.7685e-01,  4.4489e-03, -1.6927e-02,  2.2427e-02, -4.3505e-02,
          6.0903e-03, -9.9817e-03, -6.2397e-02,  1.0737e-02, -6.0490e-03,
         -7.1466e-02, -8.4580e-03, -3.1802e-02, -1.0052e-01,  2.2587e-04,
         -1.8934e-02, -3.1779e-02,  4.0086e-02,  2.1918e-02, -2.5953e-02,
          5.8453e-02, -8.1251e-02,  2.1203e-02, -5.0269e-02,  1.3206e-02,
          1.3268e-03,  2.2706e-02, -3.

In [13]:
# Apply DBSCAN for clustering
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)

# Add cluster labels to the DataFrame
df['cluster_label'] = clusters

df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0


In [14]:
df[df['cluster_label'] == 1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,bert,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,bert,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,bert,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,bert,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,bert,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,bert,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,bert,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,bert,1


In [72]:
small_clusters = cluster_counts[cluster_counts < 5].index
for cluster in small_clusters:
    print(f"Cluster {cluster}:")
    print(df[df['cluster_label'] == cluster]['target_label'].unique())
    print()

Cluster 76:
['Security Alert']

Cluster 97:
['Security Alert']

Cluster 2:
['Security Alert']

Cluster 19:
['Security Alert']

Cluster 15:
['Security Alert']

Cluster 67:
['Critical Error']

Cluster 69:
['Security Alert']

Cluster 65:
['Critical Error']

Cluster 64:
['Security Alert']

Cluster 33:
['Security Alert']

Cluster 39:
['Security Alert']

Cluster 37:
['Error']

Cluster 35:
['Security Alert']

Cluster 54:
['Security Alert']

Cluster 50:
['Error']

Cluster 63:
['Security Alert']

Cluster 60:
['Security Alert']

Cluster 56:
['Error']

Cluster 46:
['Error']

Cluster 44:
['Security Alert']

Cluster 41:
['Security Alert']

Cluster 71:
['Error']

Cluster 58:
['Error']

Cluster 57:
['Error']

Cluster 88:
['Error']

Cluster 94:
['Security Alert']

Cluster 101:
['Security Alert']

Cluster 98:
['Security Alert']

Cluster 89:
['Error']

Cluster 72:
['Security Alert']

Cluster 78:
['Error']

Cluster 87:
['Security Alert']

Cluster 84:
['Security Alert']

Cluster 23:
['Error']

Cluster 22:

In [73]:
cluster_counts = df['cluster_label'].value_counts()
large_cluster = cluster_counts[cluster_counts > 10].index
for cluster in large_cluster:
    print(f"Cluster {cluster}:")
    print(df[df['cluster_label'] == cluster]['log_message'].head(5))
    print()


Cluster 0:
0               nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
3    nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
4                nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
5          nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -

In [74]:
import re
def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out)" : "User Action",
        r"Backup (started|ended) at .*" : "System Notification",
        r"Backup completed successfully" : "System Notification",
        r"System updated version .*" : "System Notification",
        r"File .* uploaded successfully" : "System Notification",
        r"Disk cleanup completed successfully" : "System Notification",
        r"System reboot initiated by user" : "System Notification",
        r"Account with ID .* created .*" : "User Action"
    }

    for pattern, label in regex_patterns.items():
        match = re.search(pattern, log_message, re.IGNORECASE)
        if match:
            return label
    return None



In [76]:
classify_with_regex("Account with ID fsdf created sdfsd")

'User Action'

In [80]:
df["regex_classify"] = df["log_message"].apply(classify_with_regex)
df_non_regex = df[df["regex_classify"].isnull()]

df_non_regex


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label,regex_classify
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,"nova.osapi_compute.wsgi.server [req-96c3ec98-21a0-4af2-84a8-d4989512413e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" Return code: 200 len: 1916 time: 0.2677610",HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed logins,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,"nova.metadata.wsgi.server [req-b6d4a270-accb-4c3a-8179-9611e52e1768 - - - - -] 10.11.21.124,10.11.10.1 ""GET /openstack/2013-10-17 HTTP/1.1"" RCODE 200 len: 157 time: 0.2249990",HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [70]:
df_not_regex = df[df["regex_classify"].isnull()]
df_not_regex

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label,regex_classify
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,"nova.osapi_compute.wsgi.server [req-96c3ec98-21a0-4af2-84a8-d4989512413e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" Return code: 200 len: 1916 time: 0.2677610",HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed logins,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,"nova.metadata.wsgi.server [req-b6d4a270-accb-4c3a-8179-9611e52e1768 - - - - -] 10.11.21.124,10.11.10.1 ""GET /openstack/2013-10-17 HTTP/1.1"" RCODE 200 len: 157 time: 0.2249990",HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [89]:
# classify the llm

llm_targets = df["target_label"].value_counts()[df["target_label"].value_counts() <= 5].index.tolist()
df["source"][df.target_label.isin(llm_targets)].unique()

array(['LegacyCRM'], dtype=object)

In [92]:
#In this Nan now filter out llm and bert df

df_non_legacy = df_non_regex[df_non_regex["source"] != "LegacyCRM"]
df_non_legacy.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [96]:
filtered_embeddings = model.encode(df_non_legacy["log_message"].tolist())
filtered_embeddings[:2]

array([[-1.02939621e-01,  3.35459411e-02, -2.20260732e-02,
         1.55101740e-03, -9.86917876e-03, -1.78956270e-01,
        -6.34409785e-02, -6.01761639e-02,  2.81109158e-02,
         5.99620491e-02, -1.72618348e-02,  1.43363548e-03,
        -1.49560034e-01,  3.15287686e-03, -5.66030927e-02,
         2.71685235e-02, -1.49891041e-02, -3.54037657e-02,
        -3.62936445e-02, -1.45410765e-02, -5.61491773e-03,
         8.75539035e-02,  4.55120578e-02,  2.50963885e-02,
         1.00187510e-02,  1.24267349e-02, -1.39923573e-01,
         7.68696293e-02,  3.14095505e-02, -4.15247958e-03,
         4.36902344e-02,  1.71250012e-02, -8.00951198e-02,
         5.74006326e-02,  1.89091656e-02,  8.55262503e-02,
         3.96398641e-02, -1.34371817e-01, -1.44360063e-03,
         3.06704035e-03,  1.76854044e-01,  4.44885530e-03,
        -1.69274509e-02,  2.24266481e-02, -4.35049310e-02,
         6.09034160e-03, -9.98169929e-03, -6.23972900e-02,
         1.07372422e-02, -6.04895083e-03, -7.14660808e-0

In [103]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
y = df_non_legacy["target_label"]

X_train, X_test, y_train, y_test = train_test_split(filtered_embeddings, y, test_size=0.2, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

clf_rpt = classification_report(y_test, y_pred)
print(clf_rpt)

                     precision    recall  f1-score   support

     Critical Error       0.93      1.00      0.96        25
              Error       0.95      0.91      0.93        23
        HTTP Status       1.00      1.00      1.00       217
     Resource Usage       1.00      1.00      1.00        33
     Security Alert       1.00      0.99      0.99        84
System Notification       1.00      1.00      1.00        11

           accuracy                           0.99       393
          macro avg       0.98      0.98      0.98       393
       weighted avg       0.99      0.99      0.99       393



In [118]:
# save the model
import joblib
import os

save_path = os.path.abspath("D:\\Tutorials ll\\python\\PythonProject\\classificationLogs\\models\\clf_model.joblib")

with open(save_path, "wb") as f:
    joblib.dump(clf, f)