**Importing Required Libraries**

In [96]:
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

**Loading the Data**

In [48]:
# Read the log dataset from the CSV file into a DataFrame.
data = pd.read_csv('synthetic_logs.csv')

In [49]:
# Preview the first rows to check the columns and the message formats.
data.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [50]:
# (rows, columns) of the loaded frame.
data.shape

(2410, 5)

In [51]:
# Distinct systems that emitted the log messages.
data['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [52]:
# Distinct class labels present in the dataset.
data['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

**Loading the Sentence Transformer Model**

In this step, we load the all-MiniLM-L6-v2 model from the SentenceTransformers library. This model is a lightweight and efficient transformer-based embedding model that converts text into numerical vector representations.
It is particularly useful for semantic similarity tasks, clustering, retrieval, and NLP applications.

In [53]:
# Load the pretrained all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [54]:
# Encode every log message into an embedding vector (one per row of `data`).
embeddings = model.encode(data['log_message'].to_list())

**Clustering Sentence Embeddings using DBSCAN**

In [56]:
# Cluster the message embeddings with DBSCAN using cosine distance
# (eps=0.2, min_samples=1), then keep the per-message cluster labels.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
dbscan.fit(embeddings)
clusters = dbscan.labels_

In [57]:
# Attach each message's DBSCAN cluster label as a new column.
data['cluster'] = clusters

In [58]:
# Preview the frame with the new `cluster` column.
data.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [60]:
# Collect the log messages of each cluster into a list, then order the
# clusters from the largest to the smallest.
# The grouped Series gets its own name so that the `clusters` label array
# produced by the DBSCAN cell is not overwritten; otherwise re-running
# `data['cluster'] = clusters` afterwards would assign this Series instead
# of the per-message labels.
cluster_messages = data.groupby('cluster')['log_message'].apply(list)
sorted_clusters = cluster_messages.sort_values(key=lambda x: x.map(len), ascending=False)

**Displaying Clustered Log Patterns: only clusters with more than 10 messages are shown (first 5 messages of each) to focus on significant patterns**

In [61]:
# Walk the clusters from largest to smallest; clusters with 10 or fewer
# messages are skipped, and the first five messages of each remaining
# cluster are printed as a sample of its pattern.
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    if len(messages) <= 10:
        continue
    print(f"Cluster {cluster_id}:")
    for msg in messages[:5]:
        print(f"  {msg}")

Clustered Patterns:
Cluster 0:
  nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
  nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
  nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
  nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

**Classifying Log Messages Using Regular Expressions**

In [65]:
# Ordered (compiled pattern, label) rules; the first matching rule wins.
# Compiled once here instead of rebuilding the table on every call.
# NOTE: the unescaped "." in several patterns is a regex wildcard (any single
# character), not a literal period; the patterns are kept exactly as they were
# so existing labelling does not change.
_REGEX_RULES = [
    (re.compile(r"User User\d+ logged (in|out).", re.IGNORECASE), "User Action"),
    (re.compile(r"Backup (started|ended) at .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Backup completed successfully.", re.IGNORECASE), "System Notification"),
    (re.compile(r"System updated to version .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"File .* uploaded successfully by user .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Disk cleanup completed successfully.", re.IGNORECASE), "System Notification"),
    (re.compile(r"System reboot initiated by user .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Account with ID .* created by .*", re.IGNORECASE), "User Action"),
]


def classify_with_regex(log_message):
    """Label a log message using fixed regular-expression rules.

    Parameters
    ----------
    log_message : str
        Raw log text. Any non-string value (e.g. ``None`` or the ``NaN`` that
        pandas uses for a missing cell) is treated as unclassifiable.

    Returns
    -------
    str or None
        The label of the first rule whose pattern is found anywhere in the
        message (case-insensitive search), or ``None`` when no rule matches.
    """
    # re.search raises TypeError on non-string input; missing messages should
    # simply fall through to the downstream classifiers instead of crashing.
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

In [71]:
# Run the regex classifier over every message; unmatched messages get None.
data['regex_labels'] = data['log_message'].map(classify_with_regex)

In [76]:
# Preview the frame with the new `regex_labels` column.
data.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_labels
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,


In [75]:
# Shape of the subset of rows the regex rules were able to label.
data.loc[data['regex_labels'].notna()].shape

(500, 7)

In [77]:
# Rows the regex rules could not label, copied into an independent frame.
non_regex_df = data.loc[data['regex_labels'].isna()].copy()

In [78]:
# Inspect 10 random rows that the regex rules left unlabelled. The seed is
# fixed (same value as the other random_state settings in this notebook) so
# the displayed sample is identical on every run.
non_regex_df.sample(10, random_state=42)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_labels
409,8/25/2025 13:58,ThirdPartyAPI,Account Account8057 login attempt was not succ...,Security Alert,bert,17,
1049,2025-12-07 20:19:37,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-61ab7fb1-e...,HTTP Status,bert,0,
74,5/9/2025 21:18,BillingSystem,User 8300 made multiple incorrect login attempts,Security Alert,bert,7,
239,2025-08-17 21:29:57,ModernHR,nova.osapi_compute.wsgi.server [req-cfb8d696-c...,HTTP Status,bert,0,
1144,5/27/2025 21:25,AnalyticsEngine,User 3080 has been granted elevated admin priv...,Security Alert,bert,99,
1264,2025-10-31 12:24:20,ModernCRM,nova.osapi_compute.wsgi.server [req-79805ef8-d...,HTTP Status,bert,0,
695,6/30/2025 12:46,ThirdPartyAPI,Service outage due to email delivery problem,Critical Error,bert,1,
1220,11/17/2025 3:49,ModernCRM,Multiple login failures occurred on user 7102 ...,Security Alert,bert,7,
2151,2025-12-17 03:48:28,ModernCRM,nova.compute.claims [req-9118475d-6e72-48fa-9d...,Resource Usage,bert,5,
1279,2025-09-21 09:16:26,ModernHR,nova.compute.claims [req-5c8f52bd-8e3c-41f0-95...,Resource Usage,bert,5,


In [81]:
# Labels that occur at most 5 times among the rows without a regex label.
print(
    non_regex_df['target_label']
    .value_counts()
    .loc[lambda counts: counts <= 5]
    .index
    .tolist()
)



In [86]:
# Show every unlabelled row whose target is one of these two labels.
non_regex_df[non_regex_df['target_label'].isin(['Workflow Error', 'Deprecation Warning'])]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_labels
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,llm,24,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,48,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,llm,62,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,llm,105,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,llm,122,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,llm,133,


**Filtering and Embedding Log Messages**

In [89]:
# Unlabelled rows from every source except LegacyCRM; these are the rows
# that get embedded and used to train the classifiers below.
bert_df = non_regex_df.loc[non_regex_df['source'].ne('LegacyCRM')]

In [90]:
# Unlabelled rows that come from LegacyCRM, kept separately from bert_df.
llm_df = non_regex_df.loc[non_regex_df['source'].eq('LegacyCRM')]

In [91]:
# Embed only the messages in bert_df; these vectors are the classifier features.
filtered_embedings = model.encode(bert_df['log_message'].to_list())

**Training and Evaluating Classification Models**

In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB,MultinomialNB,GaussianNB

In [94]:
# Hold out 30% of the embedded messages for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    filtered_embedings,
    bert_df['target_label'],
    test_size=0.3,
    random_state=42,
)

print("Training Logistic Regression Model...")
# fit() returns the estimator itself, so construction and training are chained.
logistic_model = LogisticRegression(C=1.0, solver='liblinear', random_state=42).fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)

# Per-class precision/recall/F1 followed by overall accuracy on the hold-out set.
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))


Training Logistic Regression Model...
Logistic Regression Classification Report:
                precision    recall  f1-score   support

Critical Error       0.89      1.00      0.94        48
         Error       0.98      0.87      0.92        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.97      0.97      0.97       571
  weighted avg       0.99      0.99      0.99       571

Logistic Regression Accuracy: 0.9877408056042032


In [95]:
print("\nTraining Naive Bayes Models...")

# Gaussian NB is trained directly on the raw embedding vectors.
print("\nGaussian Naive Bayes")
gaussian_model = GaussianNB().fit(X_train, y_train)
y_pred_gaussian = gaussian_model.predict(X_test)
print("Gaussian Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_gaussian))
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_gaussian))

# Multinomial NB is trained on min-max scaled features. The scaler is fitted
# on the training split only and then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nMultinomial Naive Bayes")
multinomial_model = MultinomialNB().fit(X_train_scaled, y_train)
y_pred_multinomial = multinomial_model.predict(X_test_scaled)
print("Multinomial Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_multinomial))
print("Multinomial Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_multinomial))



Training Naive Bayes Models...

Gaussian Naive Bayes
Gaussian Naive Bayes Classification Report:
                precision    recall  f1-score   support

Critical Error       0.86      1.00      0.92        48
         Error       0.83      0.83      0.83        47
   HTTP Status       1.00      0.97      0.99       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       0.99      0.99      0.99       123

      accuracy                           0.97       571
     macro avg       0.94      0.96      0.95       571
  weighted avg       0.97      0.97      0.97       571

Gaussian Naive Bayes Accuracy: 0.9702276707530648

Multinomial Naive Bayes
Multinomial Naive Bayes Classification Report:
                precision    recall  f1-score   support

Critical Error       0.83      1.00      0.91        48
         Error       0.97      0.79      0.87        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00   

**Exporting the Best Model**

In [97]:
import joblib

In [98]:
# Save the fitted logistic-regression classifier to 'logistic_model.pkl'.
joblib.dump(logistic_model, 'logistic_model.pkl')

['logistic_model.pkl']

**Conclusion**

The Logistic Regression model achieved an accuracy of 98.77%, making it the best-performing classifier for log message categorization.  

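The model demonstrates high precision and recall across all five categories in the test set (the lowest scores are 0.89 precision for Critical Error and 0.87 recall for Error), ensuring reliable classification.  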

The trained model has been exported using joblib, making it ready for deployment or further fine-tuning.