In [26]:
import pandas as pd

df1 = pd.read_csv('datasets/logs.csv')
df1.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert


In [27]:
# unique values in each column
df1.nunique()

timestamp       2407
source             6
log_message     2265
target_label       9
complexity         3
dtype: int64

In [28]:
# all unique values in target label column
df1.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [None]:
#creating embeddings for the log messages
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

embeddings = model.encode(df1['log_message'].tolist())
print(embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print(similarities)


(2410, 384)
tensor([[ 1.0000,  0.1455,  0.1794,  ...,  0.7812,  0.0524,  0.1620],
        [ 0.1455,  1.0000,  0.1705,  ...,  0.0439,  0.6327,  0.1867],
        [ 0.1794,  0.1705,  1.0000,  ...,  0.1258,  0.1599,  0.4098],
        ...,
        [ 0.7812,  0.0439,  0.1258,  ...,  1.0000, -0.0101,  0.0595],
        [ 0.0524,  0.6327,  0.1599,  ..., -0.0101,  1.0000,  0.1970],
        [ 0.1620,  0.1867,  0.4098,  ...,  0.0595,  0.1970,  1.0000]])


In [30]:
# Clustering the embeddings using DBSCAN
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='euclidean')
cluster = dbscan.fit_predict(embeddings)
df1['cluster'] = cluster
df1.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,3


In [31]:
df1[df1.cluster==1].head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1


In [32]:
df1.cluster.unique()

array([   0,    1,    2, ..., 1061, 1062, 1063])

In [33]:
cluster_count = df1['cluster'].value_counts()
largest_cluster = cluster_count[cluster_count > 10].index

for cluster in largest_cluster:
    print(f"Cluster {cluster}:")
    pd.set_option('display.max_colwidth', None)
    print(df1[df1['cluster'] == cluster]['log_message'].tail(5).to_string(index=False))
    print("\n")


Cluster 0:
nova.osapi_compute.wsgi.server [req-1239a305-a3f9-4451-8fd8-fc1da207fd05 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 1759 time: 0.2832491
            nova.osapi_compute.wsgi.server [req-86058deb-b5e1-4cc2-96b2-4b1f0ca81306 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2519629
            nova.osapi_compute.wsgi.server [req-31a940b9-3604-4c3f-8aec-386824bc1e9d 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2664509
nova.osapi_compute.wsgi.server [req-410ed8a3-3cb0-47ca-920a-653be17f284e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/5

In [None]:
import re
def classify(log_msg):
    patterns = {
                r'Backup completed successfully.' : 'System Notification',
                r'Backup started at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}.' : 'System Notification',
                r'Disk cleanup completed successfully.' : 'System Notification',
                r'Backup ended at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}.' : 'System Notification'
            }
    
    for pattern, label in patterns.items():
        if re.search(pattern, log_msg):
            return label
        elif 'user' in log_msg.lower():
            return 'User Activity'
        elif 'email service' in log_msg.lower():
            return 'Email Service'
    return None

In [35]:
classify("Backup started at 2025-04-22 09:11:13.")

'System Notification'

In [36]:
df1['regex_label'] = df1['log_message'].apply(classify)

In [37]:
df1.regex_label.notnull().value_counts()

regex_label
False    1758
True      652
Name: count, dtype: int64

In [47]:
df_non_labeled=df1[df1['regex_label'].isnull()].copy()
df_non_labeled.drop(columns=['complexity'], inplace=True)
df_non_labeled.shape

(1758, 6)

In [60]:
df_non_labeled['target_label'].value_counts()[df_non_labeled.target_label.value_counts() <= 10]

target_label
Workflow Error         4
Name: count, dtype: int64

In [66]:
labels_with_10_or_more = df_non_labeled['target_label'].value_counts()
labels_with_10_or_more = labels_with_10_or_more[labels_with_10_or_more >= 10].index
df_bert = df_non_labeled[df_non_labeled['target_label'].isin(labels_with_10_or_more)]
df_bert['target_label'].value_counts()

target_label
HTTP Status            997
Security Alert         194
Resource Usage         177
Error                  171
Critical Error         154
System Notification     58
Name: count, dtype: int64

In [68]:
filtered_embeddings = model.encode(df_bert['log_message'].tolist())

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = filtered_embeddings
Y = df_bert['target_label']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                     precision    recall  f1-score   support

     Critical Error       0.98      0.96      0.97        45
              Error       0.96      0.98      0.97        51
        HTTP Status       1.00      1.00      1.00       290
     Resource Usage       1.00      1.00      1.00        55
     Security Alert       1.00      1.00      1.00        66
System Notification       1.00      1.00      1.00        19

           accuracy                           0.99       526
          macro avg       0.99      0.99      0.99       526
       weighted avg       0.99      0.99      0.99       526



In [74]:
import joblib
joblib.dump(clf, '../models/log_classifier.joblib')

['../models/log_classifier.joblib']

In [79]:
df1[df1['source'] == 'LegacyCRM']

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 due to missing contact information.,Workflow Error,llm,40,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecated and will be removed in version 3.2. Use 'fetchCustomerInfo' instead.,Deprecation Warning,llm,129,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 failed due to missing next action,Workflow Error,llm,190,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID 9807 - undefined escalation level.,Workflow Error,llm,606,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please migrate to 'ExportToXLSX' by the end of Q3.,Deprecation Warning,llm,778,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will be discontinued after 2025-06-01.,Deprecation Warning,llm,813,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not complete due to invalid priority level.,Workflow Error,llm,986,
