In [3]:
# 📂 Step 1: Upload labeled_logs.csv

from google.colab import files
uploaded = files.upload()

import pandas as pd

df = pd.read_csv("labeled_logs.csv")
df.head()


Saving labeled_logs.csv to labeled_logs.csv


Unnamed: 0,log_message,cluster_label_final
0,AppleThunderboltNHIType2::prePCIWake - power u...,Thunderbolt Hardware Events
1,AppleThunderboltGenericHAL::earlyWake - comple...,Thunderbolt Hardware Events
2,AirPort: Link Down on awdl0. Reason 1 (Unspeci...,Network Interface Changes
3,ARPT: 620651.021206: wl0: wl_update_tcpkeep_se...,Wireless ARPT Log Events
4,Bluetooth -- LE is supported - Disable LE meta...,Missing Location or Interface Data


In [2]:
# 📦 Step 2: Install LightGBM if not already

!pip install lightgbm --quiet


In [5]:
# 🛠️ Step 3: Correct Preprocessing (Split first, then embed)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split features and labels
X = df['log_message']
y = df['cluster_label_final']  # Corrected: use cluster_label_final

# Encode text labels into numbers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split into Train and Test BEFORE embedding
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Load SentenceTransformer to embed
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed separately
X_train_embeds = embedder.encode(X_train.tolist(), show_progress_bar=True)
X_test_embeds = embedder.encode(X_test.tolist(), show_progress_bar=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2570 [00:00<?, ?it/s]

Batches:   0%|          | 0/643 [00:00<?, ?it/s]

In [5]:
# 🤖 Step 4: Train Multiple Models

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

# Initialize models
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=150, random_state=42)
lgbm_model = lgb.LGBMClassifier(random_state=42)

# Train models
logistic_model.fit(X_train_embeds, y_train)
rf_model.fit(X_train_embeds, y_train)
lgbm_model.fit(X_train_embeds, y_train)




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.955629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97911
[LightGBM] [Info] Number of data points in the train set: 82214, number of used features: 384
[LightGBM] [Info] Start training from score -3.300433
[LightGBM] [Info] Start training from score -3.337400
[LightGBM] [Info] Start training from score -3.108589
[LightGBM] [Info] Start training from score -3.897101
[LightGBM] [Info] Start training from score -2.818255
[LightGBM] [Info] Start training from score -3.089973
[LightGBM] [Info] Start training from score -3.587785
[LightGBM] [Info] Start training from score -4.235372
[LightGBM] [Info] Start training from score -3.121195
[LightGBM] [Info] Start training from score -2.719045
[LightGBM] [Info] Start training from score -3.172402
[LightGBM] [Info] Start training from score -3.010856
[LightGBM] [Info] Start training from score -3.249932
[LightG

In [6]:
# 📈 Step 5: Evaluate Models

from sklearn.metrics import accuracy_score, f1_score

# Make predictions
y_pred_logistic = logistic_model.predict(X_test_embeds)
y_pred_rf = rf_model.predict(X_test_embeds)
y_pred_lgbm = lgbm_model.predict(X_test_embeds)

# Collect results
results = {
    "Model": ["Logistic Regression", "Random Forest", "LightGBM"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_logistic),
        accuracy_score(y_test, y_pred_rf),
        accuracy_score(y_test, y_pred_lgbm)
    ],
    "Macro F1 Score": [
        f1_score(y_test, y_pred_logistic, average='macro'),
        f1_score(y_test, y_pred_rf, average='macro'),
        f1_score(y_test, y_pred_lgbm, average='macro')
    ],
    "Weighted F1 Score": [
        f1_score(y_test, y_pred_logistic, average='weighted'),
        f1_score(y_test, y_pred_rf, average='weighted'),
        f1_score(y_test, y_pred_lgbm, average='weighted')
    ]
}

comparison_df = pd.DataFrame(results)
comparison_df




Unnamed: 0,Model,Accuracy,Macro F1 Score,Weighted F1 Score
0,Logistic Regression,0.991437,0.990323,0.991404
1,Random Forest,0.995232,0.994743,0.995196
2,LightGBM,0.995086,0.994669,0.99504
