In [1]:
from google.colab import files
uploaded = files.upload()  # Upload labeled_logs.csv manually

Saving labeled_logs.csv to labeled_logs.csv


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load dataset
df = pd.read_csv("labeled_logs.csv")
print(f"✅ Loaded {len(df)} logs")

# Quick check
df.head()

✅ Loaded 102768 logs


Unnamed: 0,log_message,cluster_label_final
0,AppleThunderboltNHIType2::prePCIWake - power u...,Thunderbolt Hardware Events
1,AppleThunderboltGenericHAL::earlyWake - comple...,Thunderbolt Hardware Events
2,AirPort: Link Down on awdl0. Reason 1 (Unspeci...,Network Interface Changes
3,ARPT: 620651.021206: wl0: wl_update_tcpkeep_se...,Wireless ARPT Log Events
4,Bluetooth -- LE is supported - Disable LE meta...,Missing Location or Interface Data


In [4]:
from sklearn.model_selection import train_test_split

# Features and labels
X = df["log_message"].astype(str)
y = df["cluster_label_final"]

# 80/20 split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"✅ Train size: {len(X_train)} | Test size: {len(X_test)}")

✅ Train size: 82214 | Test size: 20554


In [5]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode text logs
X_train_embeds = embedder.encode(X_train.tolist(), show_progress_bar=True)
X_test_embeds = embedder.encode(X_test.tolist(), show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2570 [00:00<?, ?it/s]

Batches:   0%|          | 0/643 [00:00<?, ?it/s]

In [6]:
from sklearn.linear_model import LogisticRegression
import joblib
from google.colab import files

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_embeds, y_train)

# Save model
joblib.dump(clf, "log_classifier_model.joblib")
print("✅ Model saved as log_classifier_model.joblib")

# Download the model to your local machine
files.download("log_classifier_model.joblib")

✅ Model saved as log_classifier_model.joblib


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test_embeds)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))

✅ Accuracy: 0.9911939281891603

📊 Classification Report:
                                      precision    recall  f1-score   support

                AWDL Peer Management       1.00      0.98      0.99       749
        App Bundle/Package ID Events       1.00      1.00      1.00       727
     AppleScript or Scripting Errors       1.00      0.98      0.99       917
   Authentication or Action Failures       1.00      1.00      1.00       412
             Camera Device Wake Logs       0.99      1.00      0.99      1215
DNS/Bonjour Server Connection Errors       1.00      0.99      1.00       935
       Google Software Update Events       0.96      0.99      0.97       561
         IPv6/IPv4 Addressing Issues       1.00      0.96      0.98       297
 Log Capture (CoreCapture) File Logs       1.00      1.00      1.00       897
               Media Playback Events       0.99      0.99      0.99      1370
               Memory Pressure State       1.00      0.99      0.99       872
  Miss

In [8]:
# Step 1: Install and import required packages (if not already)
!pip install -q sentence-transformers scikit-learn joblib

import joblib
import pandas as pd
from sentence_transformers import SentenceTransformer
import re

# Step 2: Load the saved classifier model
clf = joblib.load("log_classifier_model.joblib")

# Step 3: Load the same embedder model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Step 4: Normalization function (same used during training)
def normalize_log(text):
    text = text.lower()
    text = re.sub(r'\d+', '<NUM>', text)
    text = re.sub(r'http\S+', 'httpurl', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Step 5: New unseen macOS logs to test
new_logs = [
    "ARPT: 610123.456789: wl0: wl_update_tcpkeep_seq: Original Seq: 2477392075",
    "AppleCamIn::systemWakeCall - messageType = 0xE0000340",
    "mdnsresponder[91]: Could not write data to client PID[11203]",
    "com.apple.WebKit.WebContent[25654]: <<< CRABS >>> crabsFlumeHostUnavailable",
    "com_apple_safari_keychain_get_icdp_status: keychain: -25300",
    "IOThunderboltSwitch<0x>(0x): Thunderbolt HPD packet for route = 0x0 port = 1 unplug = 0",
    "Bluetooth -- LE is supported - Disable LE meta event"
]

# Step 6: Normalize and embed
normalized_logs = [normalize_log(log) for log in new_logs]
log_embeddings = embedder.encode(normalized_logs)

# Step 7: Predict
predicted_labels = clf.predict(log_embeddings)

# Step 8: Display results
print("📊 Predictions on New Logs:\n")
for i, (log, label) in enumerate(zip(new_logs, predicted_labels), 1):
    print(f"{i}. Log: {log}\n   ➤ Predicted Label: {label}\n")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m809.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━