In [7]:
!pip -q install pandas==2.2.2 scikit-learn==1.5.1 joblib==1.4.2

In [8]:
import pandas as pd

df = pd.read_csv("incidents_v1.csv")
df.head(), df["incident_type"].value_counts()

(   log_id                        source  \
 0       0  dfs.DataNode$PacketResponder   
 1       1  dfs.DataNode$PacketResponder   
 2       2              dfs.FSNamesystem   
 3       3  dfs.DataNode$PacketResponder   
 4       4  dfs.DataNode$PacketResponder   
 
                                                  log incident_type  
 0  PacketResponder 1 for block blk_38865049064139...       unknown  
 1  PacketResponder 0 for block blk_-6952295868487...       unknown  
 2  BLOCK* NameSystem.addStoredBlock: blockMap upd...       unknown  
 3  PacketResponder 2 for block blk_82291938032499...       unknown  
 4  PacketResponder 2 for block blk_-6670958622368...       unknown  ,
 incident_type
 unknown     1696
 config       224
 db_error     200
 timeout      200
 auth         200
 Name: count, dtype: int64)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit

X = df["log"].astype(str)
y = df["incident_type"].astype(str)

# proxy "template" group: first 60 chars after lowercasing
groups = X.str.lower().str.slice(0, 60)

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.9)),
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced", n_jobs=None))
])

pipe.fit(X_train, y_train)

In [10]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

pred = pipe.predict(X_test)

acc = accuracy_score(y_test, pred)
f1m = f1_score(y_test, pred, average="macro")

print("accuracy:", acc)
print("macro_f1:", f1m)
print(classification_report(y_test, pred))


accuracy: 0.9272349272349273
macro_f1: 0.7066666666666667
              precision    recall  f1-score   support

        auth       0.00      0.00      0.00        35
      config       1.00      1.00      1.00        51
    db_error       0.36      1.00      0.53        20
     timeout       1.00      1.00      1.00        38
     unknown       1.00      1.00      1.00       337

    accuracy                           0.93       481
   macro avg       0.67      0.80      0.71       481
weighted avg       0.90      0.93      0.91       481



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
import pandas as pd
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred, labels=sorted(df["incident_type"].unique()))
print(pd.DataFrame(cm, index=sorted(df["incident_type"].unique()), columns=sorted(df["incident_type"].unique())))
print("Test class counts:")
print(y_test.value_counts())


          auth  config  db_error  timeout  unknown
auth         0       0        35        0        0
config       0      51         0        0        0
db_error     0       0        20        0        0
timeout      0       0         0       38        0
unknown      0       0         0        0      337
Test class counts:
incident_type
unknown     337
config       51
timeout      38
auth         35
db_error     20
Name: count, dtype: int64


In [12]:
import joblib, json
from datetime import datetime

joblib.dump(pipe, "incident_clf.joblib")

meta = {
    "model_type": "tfidf_logreg",
    "created_at": datetime.utcnow().isoformat() + "Z",
    "data_version": "incidents_v1.csv (from HDFS_2k structured)",
    "metrics": {"accuracy": float(acc), "macro_f1": float(f1m)},
    "classes": sorted(df["incident_type"].unique().tolist())
}

with open("model_meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved:", "incident_clf.joblib", "model_meta.json")


Saved: incident_clf.joblib model_meta.json


  "created_at": datetime.utcnow().isoformat() + "Z",


In [13]:
from google.colab import files
files.download("incident_clf.joblib")
files.download("model_meta.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>