In [1]:
# ✅ STEP 1: Upload cleaned_logs.csv
import io

from google.colab import files
import pandas as pd

DEFAULT_CSV = "cleaned_logs.csv"

uploaded = files.upload()  # upload cleaned_logs.csv

# files.upload() returns {saved_name: file_bytes}. Colab stores a re-uploaded
# file under a new name (e.g. "cleaned_logs (1).csv"), so parse the bytes that
# were just uploaded instead of re-reading a possibly stale copy from disk.
if uploaded:
    uploaded_name = DEFAULT_CSV if DEFAULT_CSV in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Upload was cancelled: fall back to a copy already present on disk.
    df = pd.read_csv(DEFAULT_CSV)

# ✅ STEP 2: Keep only rows that have a cleaned message.
# No row sampling is performed here (despite the wording of the status message):
# every row with a non-null log_message_clean is kept, and the index is rebuilt
# so it is contiguous after the drop.
df = df.dropna(subset=["log_message_clean"]).reset_index(drop=True)

print("✅ Loaded and sampled:", df.shape)

Saving cleaned_logs.csv to cleaned_logs.csv
✅ Loaded and sampled: (102768, 2)


In [2]:
# ✅ STEP 3: Embed log_message_clean using SentenceTransformer
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [2]:
from sentence_transformers import SentenceTransformer

# Encode every cleaned log message with the all-MiniLM-L6-v2 sentence encoder.
# `embeddings` holds one row per message and is consumed by the later cells.
print("🔄 Embedding logs...")
model = SentenceTransformer("all-MiniLM-L6-v2")
log_texts = df["log_message_clean"].tolist()
embeddings = model.encode(log_texts, show_progress_bar=True)

print(f"✅ Embeddings shape: {embeddings.shape}")

🔄 Embedding logs...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3212 [00:00<?, ?it/s]

✅ Embeddings shape: (102768, 384)


In [3]:
# ✅ STEP 4: Dimensionality Reduction with UMAP (384 → 20)
!pip install umap-learn

import umap
reducer = umap.UMAP(n_components=20, random_state=42)
reduced_embeddings = reducer.fit_transform(embeddings)

print("✅ Reduced shape:", reduced_embeddings.shape)



  warn(


✅ Reduced shape: (102768, 20)


In [4]:
# ✅ STEP 5: KMeans Clustering (25 clusters)
from sklearn.cluster import KMeans

# Single source of truth for the cluster count, so the model parameter and the
# status message below cannot drift apart when the value is tuned.
N_CLUSTERS = 25

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
cluster_labels = kmeans.fit_predict(reduced_embeddings)

# Attach the assigned cluster index to each log row (row order matches the
# order in which the messages were embedded).
df["cluster_id"] = cluster_labels
print(f"✅ Clustered logs into {N_CLUSTERS} groups.")

✅ Clustered logs into 25 groups.


In [5]:
# ✅ STEP 6: Show 5 example logs per cluster
# groupby visits the clusters in ascending cluster_id order and keeps the rows
# of each cluster in their original order, so head(5) yields the first five.
for cluster_id, members in df.groupby("cluster_id"):
    print(f"\n🔷 Cluster {cluster_id} (count: {len(members)})")
    first_messages = members["log_message"].head(5).tolist()
    for rank, message in enumerate(first_messages, start=1):
        # Truncate long messages to 150 characters to keep the listing readable.
        print(f"   {rank}. {message[:150]}")


🔷 Cluster 0 (count: 7743)
   1. com.apple.CacheDelete.daily: scheduler_evaluate_activity told me to run this job; however, but the start time isn't for 14567 seconds.  Ignoring.
   2. com.apple.Safari.SafeBrowsing.Update: scheduler_evaluate_activity told me to run this job; however, but the start time isn't for 1130 seconds.  Ignori
   3. com.apple.icloud.fmfd.heartbeat: scheduler_evaluate_activity told me to run this job; however, but the start time isn't for 440131 seconds.  Ignoring.
   4. com.apple.EscrowSecurityAlert.daily: scheduler_evaluate_activity told me to run this job; however, but the start time isn't for 14655 seconds.  Ignori
   5. com.apple.Safari.SafeBrowsing.Update: scheduler_evaluate_activity told me to run this job; however, but the start time isn't for 1121 seconds.  Ignori

🔷 Cluster 1 (count: 1485)
   1. in6_unlink_ifa: IPv6 address 0x77c911455cd9bcdb has no prefix
   2. __73-[NetworkAnalyticsEngine observeValueForKeyPath:ofObject:change:context:]_block_invoke u

In [6]:
# ✅ STEP 7: Auto-label clusters using TF-IDF from cleaned logs
from sklearn.feature_extraction.text import TfidfVectorizer

label_map = {}
# max_features=3 restricts the vocabulary to the 3 terms with the highest term
# frequency across the fitted documents (English stop words excluded);
# get_feature_names_out() then returns those terms in alphabetical order.
tfidf = TfidfVectorizer(max_features=3, stop_words="english")

# The vectorizer is refit per cluster; only the learned vocabulary is needed,
# so fit() is used and no TF-IDF matrix is materialized.
for cluster_id, cluster_logs in df.groupby("cluster_id")["log_message_clean"]:
    try:
        tfidf.fit(cluster_logs.tolist())
    except ValueError:
        # Empty vocabulary (e.g. the cluster contains only stop words):
        # leave the cluster out so it receives the "Unlabeled" fallback below.
        continue
    label_map[cluster_id] = "_".join(tfidf.get_feature_names_out())

# NOTE: different clusters can end up with the same keyword label, so the
# number of distinct labels may be smaller than the number of clusters.
df["cluster_label"] = df["cluster_id"].map(label_map).fillna("Unlabeled")


In [7]:
# List the distinct auto-generated cluster labels in order of first appearance.
unique_labels = df["cluster_label"].drop_duplicates().tolist()
label_count = len(unique_labels)
print(f"🔢 Total unique cluster labels: {label_count}")
print("🧠 Cluster Labels:")
for label in unique_labels:
    print(f"- {label}")

🔢 Total unique cluster labels: 23
🧠 Cluster Labels:
- applethunderboltnhitype_complete_num
- changed_en_num
- arpt_num_wl
- location_null_num
- arpt_num_wake
- applecamin_messagetype_num
- num_pressure_state
- complete_figplaybackbossprerollcompleted_num
- failed_network_num
- com_error_num
- awdlpeermanager_io_num
- apple_job_num
- connecttoserver_dnssd_clientstub_num
- address_ipv_num
- googlesoftwareupdateagent_lvl_num
- act_failed_num
- bundle_id_num
- event_num_url
- ccfile_file_num
- corecapture_io_num
- com_deny_num
- error_num_scripting
- apple_com_safari


In [10]:
# Hand-written, human-readable names for the auto-generated keyword labels.
# Keys must match the keyword labels exactly; any label without an entry here
# has no human-readable name.
manual_label_map = dict(
    [
        ("applethunderboltnhitype_complete_num", "Thunderbolt Hardware Events"),
        ("changed_en_num", "Network Interface Changes"),
        ("arpt_num_wl", "Wireless ARPT Log Events"),
        ("location_null_num", "Missing Location or Interface Data"),
        ("arpt_num_wake", "Wake from Sleep via ARPT"),
        ("applecamin_messagetype_num", "Camera Device Wake Logs"),
        ("num_pressure_state", "Memory Pressure State"),
        ("complete_figplaybackbossprerollcompleted_num", "Media Playback Events"),
        ("failed_network_num", "Network Connection Failures"),
        ("com_error_num", "macOS Error and Exception Logs"),
        ("awdlpeermanager_io_num", "AWDL Peer Management"),
        ("apple_job_num", "macOS Background Jobs"),
        ("connecttoserver_dnssd_clientstub_num", "DNS/Bonjour Server Connection Errors"),
        ("address_ipv_num", "IPv6/IPv4 Addressing Issues"),
        ("googlesoftwareupdateagent_lvl_num", "Google Software Update Events"),
        ("act_failed_num", "Authentication or Action Failures"),
        ("bundle_id_num", "App Bundle/Package ID Events"),
        ("event_num_url", "Network URLs and Redirect Events"),
        ("ccfile_file_num", "Log Capture (CoreCapture) File Logs"),
        ("corecapture_io_num", "macOS CoreCapture Subsystem Logs"),
        ("com_deny_num", "Service or Access Denied Events"),
        ("error_num_scripting", "AppleScript or Scripting Errors"),
        ("apple_com_safari", "Safari Browser Logs"),
    ]
)


In [12]:
# Step 1: Map machine labels to human-readable ones
# Labels with no entry in manual_label_map become "Unlabeled".
df["cluster_label_human"] = df["cluster_label"].map(manual_label_map).fillna("Unlabeled")

# Step 2: Save final CSV with original log + human label
# The human-readable column computed above is what this export is for; the
# machine keyword label is kept alongside it so existing readers of the
# "cluster_label" column keep working.
final_df = df[["log_message", "cluster_label", "cluster_label_human"]]
final_df.to_csv("labeled_logs.csv", index=False)

# Step 3 (Optional in Colab): Download the file
from google.colab import files
files.download("labeled_logs.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>