In [None]:
import pandas as pd


In [None]:
df = pd.read_csv('synthetic_logs.csv')

In [None]:
df

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert
...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert


In [None]:
# `DataFrame.drop` returns a new frame, so the result must be assigned back for
# the column to actually be removed from `df`.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns='complexity', errors='ignore')
df

Unnamed: 0,timestamp,source,log_message,target_label,cluster_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1


In [None]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [None]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

#### How do you find out where you can use regex?
We need to look at the messages and see whether there are any fixed patterns that we can represent with a regex.
To make the task easier, we first need to group similar messages,
so we use a clustering technique to group all the similar messages and then try to find the patterns within each group.

### RegEx Strategy

1. We apply clustering to get clusters.
2. We find the clusters with more than 10 records.
3. We display 5 samples from each of those clusters.
4. We inspect the clusters.
5. We create regexes based on the patterns seen in those groups.


### Step 1: Install and Load Sentence Transformer Library

We need to install the `sentence-transformers` library to generate embeddings from text. After installation, we'll load a pre-trained model, such as `'all-MiniLM-L6-v2'`, which is a good balance of performance and speed.

In [None]:
!pip install -U sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Sentence Transformer model loaded successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence Transformer model loaded successfully.


### Step 2: Generate Embeddings for Log Messages

Now, we'll use the loaded model to convert each `log_message` into a dense vector (embedding). These embeddings capture the semantic meaning of the messages.

In [None]:
print("Generating embeddings...")
# Generate embeddings for the 'log_message' column
embeddings = model.encode(df['log_message'].tolist(), show_progress_bar=True)

print(f"Embeddings generated. Shape: {embeddings.shape}")

In [None]:
df

Unnamed: 0,timestamp,source,log_message,target_label,cluster_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1


### Step 3: Apply DBSCAN Clustering

With the numerical embeddings, we can now apply DBSCAN. When working with dense embeddings, `cosine` distance is typically a good metric. You might need to adjust `eps` and `min_samples` based on the characteristics of your dataset and desired cluster density.

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the sentence embeddings using cosine distance.
# With min_samples=1 every point counts as a core point, so each message ends
# up in some cluster (possibly a cluster of one) rather than being marked noise.
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')

# Fit on the embeddings; the per-message cluster ids live in `labels_`
clusters = dbscan.fit(embeddings).labels_

# Attach the cluster id of every message to the original DataFrame
df['cluster_label'] = clusters

print("DBSCAN clustering complete.")

DBSCAN clustering complete.


In [None]:
pd.DataFrame(clusters[:5])

Unnamed: 0,0
0,0
1,1
2,2
3,0
4,0


### Step 4: Analyze the Clustering Results

Let's examine the clusters found by DBSCAN. DBSCAN labels noise points as `-1`; note that with `min_samples=1` (as used above) every point can form its own cluster, so no noise points are expected here.

In [None]:
# Printing Cluster 1 to examine how well it was grouped.
# We find out that with the new parameters the clusters are better grouped
df[df.cluster_label==1]

Unnamed: 0,timestamp,source,log_message,target_label,cluster_label
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,1


In [None]:
# Number of clusters found (ignoring noise points labeled as -1)
n_clusters_ = len(set(clusters)) - (1 if -1 in clusters else 0)
n_noise_ = list(clusters).count(-1)

print(f"Estimated number of clusters: {n_clusters_}")
print(f"Estimated number of noise points: {n_noise_}")

# Display the count of points per cluster
print("\nCluster distribution:")
print(df['cluster_label'].value_counts())

# Fixed seed so the sampled messages are the same on every run of the notebook
SAMPLE_SEED = 42

# Display sample log messages for every cluster.
# groupby yields the groups in sorted label order and filters the frame only once.
print("\nSample log messages from different clusters:")
for cluster_id, cluster_messages in df.groupby('cluster_label')['log_message']:
    if cluster_id == -1:
        print("\n--- Noise Points (Cluster -1) ---")
    else:
        print(f"\n--- Cluster {cluster_id} ---")

    # Never ask for more samples than the cluster contains
    sample_messages = cluster_messages.sample(
        min(3, len(cluster_messages)), random_state=SAMPLE_SEED
    ).tolist()
    for msg in sample_messages:
        print(f"- {msg}")

Estimated number of clusters: 136
Estimated number of noise points: 0

Cluster distribution:
cluster_label
0      1017
5       147
11      100
13       86
7        60
       ... 
131       1
132       1
133       1
134       1
135       1
Name: count, Length: 136, dtype: int64

Sample log messages from different clusters:

--- Cluster 0 ---
- nova.metadata.wsgi.server [-] 10.11.21.135,10.11.10.1 "GET /latest/meta-data/block-device-mapping/ami HTTP/1.1" Status code -  200 len: 119 time: 0.0006430
- nova.osapi_compute.wsgi.server [req-de162460-0015-4ef5-ad34-36cdaf269be6 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" Return code: 200 len: 1893 time: 0.2570632
- nova.metadata.wsgi.server [-] 10.11.21.140,10.11.10.1 "GET /openstack/2013-10-17/vendor_data.json HTTP/1.1" Return code: 200 len: 124 time: 0.0006189

--- Cluster 1 ---
- Delivery failure impacted email services
- Service outage

In [None]:
df[df.cluster_label==5]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5
26,2025-03-03 17:11:11,ModernCRM,nova.compute.claims [req-d6986b54-3735-4a42-90...,Resource Usage,bert,5
40,2025-06-19 21:42:34,ThirdPartyAPI,nova.compute.claims [req-72b4858f-049e-49e1-b3...,Resource Usage,bert,5
58,2025-09-13 14:45:14,AnalyticsEngine,nova.compute.claims [req-5c8f52bd-8e3c-41f0-95...,Resource Usage,bert,5
61,2025-04-27 11:18:18,ThirdPartyAPI,nova.compute.claims [req-d38f479d-9bb9-4276-96...,Resource Usage,bert,5
...,...,...,...,...,...,...
2336,2025-12-10 11:53:33,AnalyticsEngine,nova.compute.claims [req-97fcea79-42f7-4241-9b...,Resource Usage,bert,5
2345,2025-12-22 01:38:48,BillingSystem,nova.compute.claims [req-caeb3818-dab6-4e8d-9e...,Resource Usage,bert,5
2352,2025-02-18 00:16:44,ModernCRM,nova.compute.claims [req-98474cd9-61e1-4afe-bd...,Resource Usage,bert,5
2355,2025-11-28 18:03:55,BillingSystem,nova.compute.claims [req-6f9ecdfe-481c-4535-9b...,Resource Usage,bert,5


**Reasoning**:
To identify large clusters, I will count the occurrences of each 'cluster_label' in the DataFrame and then filter these counts to include only clusters with more than 10 records, storing the result in `large_clusters`.



In [None]:
# A cluster counts as "large" when it has strictly more than this many records.
# Kept in one place so the filter and the printed message cannot drift apart.
LARGE_CLUSTER_THRESHOLD = 10

cluster_counts = df['cluster_label'].value_counts()
large_clusters = cluster_counts[cluster_counts > LARGE_CLUSTER_THRESHOLD]

print(f"Clusters with more than {LARGE_CLUSTER_THRESHOLD} records:")
print(large_clusters)

Clusters with more than 10 records:
cluster_label
0     1017
5      147
11     100
13      86
7       60
8       60
21      58
3       57
4       53
17      52
6       51
32      51
16      48
20      48
9       44
1       39
10      30
34      25
53      20
14      20
52      20
18      17
42      13
25      13
59      12
26      11
Name: count, dtype: int64


**Reasoning**:
Now that the large clusters have been identified, the next step is to iterate through each of these large clusters and display up to 5 sample log messages from each to provide insights into their content.


In [None]:
print("\nSummarizing large clusters and providing sample log messages:")

for cluster_id, cluster_size in large_clusters.items():
    print(f"\n--- Cluster {cluster_id} (Count: {cluster_size}) ---")

    # Rows of this cluster, restricted to the two columns we display
    cluster_rows = df.loc[df['cluster_label'] == cluster_id, ['log_message', 'target_label']]

    # Sample at most 5 rows; bound by the actual row count and use a fixed seed
    # so the displayed examples are reproducible across runs.
    sample_df = cluster_rows.sample(min(5, len(cluster_rows)), random_state=42)
    for message, label in zip(sample_df['log_message'], sample_df['target_label']):
        print(f"- Log Message: {message}\n  Target Label: {label}")


Summarizing large clusters and providing sample log messages:

--- Cluster 0 (Count: 1017) ---
- Log Message: nova.metadata.wsgi.server [-] 10.11.21.124,10.11.10.1 "GET /openstack/2013-10-17/vendor_data.json HTTP/1.1" RCODE  200 len: 124 time: 0.0006270
  Target Label: HTTP Status
- Log Message: nova.osapi_compute.wsgi.server [req-c4d0c20c-cfe8-4e66-b280-b083419d4967 f7b8d1f1d4d44643b07fa10ca7d021fb e9746973ac574c6b8a9e8857f56a7608 - - -] 10.11.10.1 "POST /v2/e9746973ac574c6b8a9e8857f56a7608/os-server-external-events HTTP/1.1" HTTP status code -  404 len: 296 time: 0.0833371
  Target Label: HTTP Status
- Log Message: nova.osapi_compute.wsgi.server [req-95aa6171-064c-4a8f-96f6-d43250af4408 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" Status code -  200 len: 1893 time: 0.2598760
  Target Label: HTTP Status
- Log Message: nova.metadata.wsgi.server [-] 10.11.21.139,10.11.10.1 "GET /la

### Regex Classification

In [None]:
import re

# Ordered (pattern, label) rules derived from inspecting the large clusters.
# Order matters: the first rule that matches wins.
_REGEX_RULES = [
  # HTTP Status (Cluster 0)
  (r"nova\.(metadata|osapi_compute)\.wsgi\.server .* (RCODE|HTTP status code -|Status code -|Return code:)  ?\d{3} len: \d+ time: .*", "HTTP Status"),

  # Resource Usage (Clusters 5, 10)
  (r"nova\.compute\.(claims|resource_tracker) .* (Total memory|disk limit|Total usable vcpus|Final resource view): .*", "Resource Usage"),

  # User Action (Clusters 11, 9)
  (r"User User\d+ logged (in|out)\.", "User Action"),
  # NOTE(review): the final "." is unescaped, so it matches any character rather
  # than a literal full stop; left unchanged to preserve current results.
  (r"Account with ID .* created by .*.", "User Action"),

  # System Notification (Clusters 13, 8, 21, 4, 32, 16)
  (r"Backup (started|ended) at .*", "System Notification"),
  (r"Backup completed successfully.*", "System Notification"),
  (r"System updated to version \d+\.\d+\.\d+\.", "System Notification"),
  (r"File .* uploaded successfully by user User\d+\.", "System Notification"),
  (r"Disk cleanup completed successfully\.", "System Notification"),
  (r"System reboot initiated by user User\d+\.", "System Notification"),

  # Security Alert (Clusters 7, 17, 20, 34, 42, 59, 26)
  (r".*(incorrect|failed|rejected) login attempts?.*user \d+.*", "Security Alert"),
  (r".*(Account\d+|secured account).* (unauthorized|unsuccessful|failed|not possible) (login|access).*", "Security Alert"),
  (r".*(unauthorized|unauthenticated) API access attempt.*user \d+.*", "Security Alert"),
  (r"Server \d+ experienced potential security incident.*", "Security Alert"),
  (r"Anomalous activity identified on server \d+.*", "Security Alert"),
  (r"User \d+ (has )?(escalated|elevated).*admin.*", "Security Alert"),
  (r".*Admin privilege escalation (alert|threat) for user \d+.*", "Security Alert"),

  # Critical Error (Clusters 6, 14, 18, 25, 1)
  (r".*(Failure|malfunction|not operating).*component ID Component\d+.*", "Critical Error"),
  (r".*(RAID|disk).* (faults|malfunctions|failures|errors).*", "Critical Error"),
  (r"(Boot|Kernel) .* (aborted|failure|terminated|crash|interrupted).*", "Critical Error"),
  (r"System configuration (errors|failure).*", "Critical Error"),
  (r"(Delivery failure|Mail delivery issue).*email services", "Critical Error"),

  # Error (Clusters 3, 53, 52, 1)
  (r"(Data replication|Shard \d+) .* (failed|unsuccessful|terminated)", "Error"),
  (r".*module X.*(parse|format|syntax|Invalid).*", "Error"),
  (r"Service health check .* (SSL certificate|invalid|failed).*", "Error"),
  (r"Email service experiencing issues.*", "Error"),
]

# Compile once at definition time instead of rebuilding the rule table and
# re-resolving every pattern for each log message.
_COMPILED_REGEX_RULES = [
  (re.compile(pattern, re.IGNORECASE), label) for pattern, label in _REGEX_RULES
]


def classify_with_regex(log_message):
  """Classify a log message with the hand-written regex rules.

  Args:
    log_message: The raw log text. Non-string values (e.g. None or the NaN
      that pandas uses for missing cells) are treated as unclassifiable.

  Returns:
    The label of the first rule whose pattern is found anywhere in the message
    (case-insensitive search), or None when no rule matches.
  """
  # re.search raises TypeError on non-string input; a missing message simply
  # has no regex label.
  if not isinstance(log_message, str):
    return None
  for pattern, label in _COMPILED_REGEX_RULES:
    if pattern.search(log_message):
      return label
  return None










In [None]:
classify_with_regex("backup completed successfully.")

'System Notification'

### Applying Regex Classification

1. We apply regex classification to all the data.
2. We do that by applying the `classify_with_regex` function.
3. This function takes in the value of each cell in the `log_message` column.
4. The classification function compares the log message against each pattern using the `re` module,
5. specifically the search method from the `re` module: `re.search(pattern, log_message)`.

In [None]:
df['regex_label'] = df['log_message'].apply(classify_with_regex)

In [None]:
df

Unnamed: 0,timestamp,source,log_message,target_label,cluster_label,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,HTTP Status
...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,HTTP Status
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0,HTTP Status
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1,


### See which rows were assigned a label

In [None]:
classified_df = df[df.regex_label.notnull()]

In [None]:
# Report row counts rather than shape tuples, and count the rows the regex
# left unlabelled for the second line (previously the whole frame was reported).
n_classified = len(classified_df)
n_unclassified = int(df['regex_label'].isnull().sum())

print("Number of Classification done by RegEx: ", n_classified)
print("Number of Classifications that could not be done by RegEx", n_unclassified)

Number of Classification done by ReGex:  (1492, 6)
Number of Classifications that could not be done by ReGex (2410, 6)


In [None]:
classified_df

Unnamed: 0,timestamp,source,log_message,target_label,cluster_label,regex_label
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,Error
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,HTTP Status
5,2025-10-09 10:30:31,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,0,HTTP Status
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
...,...,...,...,...,...,...
2400,2025-01-07 09:13:28,ThirdPartyAPI,nova.compute.resource_tracker [req-addc1839-2e...,Resource Usage,10,Resource Usage
2403,10/1/2025 1:31,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
2404,2025-09-18 02:18:30,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-2c9c783f-3...,HTTP Status,0,HTTP Status
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,HTTP Status


### Evaluation of the RegEx classification

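Precision: of all the rows predicted as a class, the fraction that truly belong to that class, i.e. TP / (TP + FP). A false positive (FP) is a row that was labeled as the class but does not belong to it.

Recall: of all the rows that truly belong to a class, the fraction that were predicted as that class, i.e. TP / (TP + FN). A false negative (FN) is a row that belongs to the class but was labeled as something else.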

In [None]:
from sklearn.metrics import precision_score, recall_score, classification_report

# Ground truth ('target_label') versus regex prediction ('regex_label').
# Only rows that received a regex label are evaluated here, so these scores
# describe the quality of the labels the regex did assign.
y_true, y_pred = classified_df['target_label'], classified_df['regex_label']

# Support-weighted precision and recall across all classes
precision, recall = (
    metric(y_true, y_pred, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score)
)

print(f"Precision: {precision:.4f}\nRecall: {recall:.4f}")

# Per-class breakdown
print("\nClassification Report:\n" + classification_report(y_true, y_pred, zero_division=0))

Precision: 0.9993
Recall: 0.9993

Classification Report:
                     precision    recall  f1-score   support

     Critical Error       1.00      0.97      0.99        34
              Error       0.98      1.00      0.99        44
        HTTP Status       1.00      1.00      1.00       818
     Resource Usage       1.00      1.00      1.00        51
     Security Alert       1.00      1.00      1.00        45
System Notification       1.00      1.00      1.00       356
        User Action       1.00      1.00      1.00       144

           accuracy                           1.00      1492
          macro avg       1.00      1.00      1.00      1492
       weighted avg       1.00      1.00      1.00      1492



### Percentage of Classification done by RegEx

In [None]:

print("Percentage of Classfication handled by RegEx", (classified_df.shape[0]/df.shape[0])*100, "%")

Percentage of Classfication handled by RegEx 61.90871369294606 %


### Bert + Logistic Regression
SUMMARY
1. We create a new dataset from the rows where the regex label is null.
2. This gives us a completely new data frame.
3. We check which target labels have the fewest examples, so we conduct EDA again.
4. They turn out to be Deprecation Warning and Workflow Error.
5. We create a new dataset without those target labels.
6. Train a Logistic Regression model.
7. Save the model using joblib.



In [None]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex

Unnamed: 0,timestamp,source,log_message,target_label,cluster_label,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,3,
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,1,
11,6/15/2025 11:44,ModernHR,Critical system unit error: unit ID Component55,Critical Error,6,
...,...,...,...,...,...,...
2401,2025-12-05 15:51:51,ModernCRM,nova.osapi_compute.wsgi.server [req-4bdf00b0-3...,HTTP Status,0,
2402,3/13/2025 9:44,BillingSystem,Replication error occurred for shard 10,Error,3,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1,


### How do we know if there are enough training samples or not?
1. Do it programmatically, by looking at the target label counts
2. Talk to business managers or seniors

In [None]:
# A label is "low frequency" when it has at most this many rows.
# Kept in one place so the filter and the printed message stay in sync.
LOW_FREQUENCY_THRESHOLD = 5

label_counts_non_regex = df_non_regex['target_label'].value_counts()
low_frequency_labels = label_counts_non_regex[label_counts_non_regex <= LOW_FREQUENCY_THRESHOLD]

print(f"Target Labels with {LOW_FREQUENCY_THRESHOLD} or fewer rows in df_non_regex:")
print(low_frequency_labels)

Target Labels with 5 or fewer rows in df_non_regex:
target_label
Workflow Error         4
Name: count, dtype: int64


### Let's start the BERT-based approach

In [None]:
# Exclude LegacyCRM rows: that source maps to the Deprecation Warning and
# Workflow Error labels, so dropping it takes those target labels out.
is_not_legacy = df_non_regex['source'] != 'LegacyCRM'
df_non_legacy = df_non_regex.loc[is_not_legacy]

df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'ThirdPartyAPI',
       'BillingSystem'], dtype=object)

In [None]:
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist())
filtered_embeddings[:2]

array([[-1.02939598e-01,  3.35459486e-02, -2.20260844e-02,
         1.55103172e-03, -9.86921880e-03, -1.78956211e-01,
        -6.34409934e-02, -6.01761453e-02,  2.81108953e-02,
         5.99620081e-02, -1.72618236e-02,  1.43364200e-03,
        -1.49560079e-01,  3.15288268e-03, -5.66030741e-02,
         2.71685328e-02, -1.49890278e-02, -3.54037210e-02,
        -3.62936370e-02, -1.45410486e-02, -5.61492983e-03,
         8.75538811e-02,  4.55120727e-02,  2.50963680e-02,
         1.00187613e-02,  1.24267004e-02, -1.39923558e-01,
         7.68696666e-02,  3.14095393e-02, -4.15247958e-03,
         4.36902344e-02,  1.71249956e-02, -8.00950900e-02,
         5.74006140e-02,  1.89092122e-02,  8.55262056e-02,
         3.96399088e-02, -1.34371832e-01, -1.44367013e-03,
         3.06707830e-03,  1.76854059e-01,  4.44890792e-03,
        -1.69275142e-02,  2.24266183e-02, -4.35049757e-02,
         6.09031972e-03, -9.98171885e-03, -6.23973012e-02,
         1.07372692e-02, -6.04895223e-03, -7.14661255e-0

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Features: sentence embeddings of the messages the regex could not label.
# Target: their ground-truth labels.
X, y = filtered_embeddings, df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# `fit` returns the estimator itself, so construction and training can be chained
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.97      1.00      0.98        32
         Error       1.00      0.97      0.99        37
   HTTP Status       1.00      1.00      1.00        64
Resource Usage       1.00      1.00      1.00        39
Security Alert       1.00      1.00      1.00       102

      accuracy                           1.00       274
     macro avg       0.99      0.99      0.99       274
  weighted avg       1.00      1.00      1.00       274



In [None]:
import joblib

In [None]:
joblib.dump(clf, 'sample_data/log_classifier.joblib')

['sample_data/log_classifier.joblib']

In [None]:
pwd

'/content'

In [None]:
ls

[0m[01;34msample_data[0m/  synthetic_logs.csv
