In [1]:
!pip -q install pandas==2.2.2 scikit-learn==1.5.1 joblib==1.4.2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/13.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/13.1 MB[0m [31m61.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/13.1 MB[0m [31m68.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m7.3/13.1 MB[0m [31m84.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m11.5/13.1 MB[0m [31m113.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m13.1/13.1 MB[0m [31m87.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/301.8 kB[0m [31m?[

In [3]:
import pandas as pd

# Read the labelled incident logs from the CSV next to the notebook.
df = pd.read_csv("incidents_v1.csv")

# Preview the first rows together with the per-label row counts.
label_counts = df["incident_type"].value_counts()
(df.head(), label_counts)

(   log_id                        source  \
 0       0  dfs.DataNode$PacketResponder   
 1       1  dfs.DataNode$PacketResponder   
 2       2              dfs.FSNamesystem   
 3       3  dfs.DataNode$PacketResponder   
 4       4  dfs.DataNode$PacketResponder   
 
                                                  log incident_type  
 0  PacketResponder 1 for block blk_38865049064139...      db_error  
 1  PacketResponder 0 for block blk_-6952295868487...      db_error  
 2  BLOCK* NameSystem.addStoredBlock: blockMap upd...       unknown  
 3  PacketResponder 2 for block blk_82291938032499...      db_error  
 4  PacketResponder 2 for block blk_-6670958622368...      db_error  ,
 incident_type
 db_error    1078
 unknown      922
 Name: count, dtype: int64)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Drop rows with a missing message or label *before* casting to str:
# astype(str) would otherwise turn NaN into the literal text "nan", which
# silently adds a bogus training document and/or a bogus "nan" class.
labelled = df.dropna(subset=["log", "incident_type"])

X = labelled["log"].astype(str)
y = labelled["incident_type"].astype(str)

# Hold out 20% of the rows for evaluation; stratify so both splits keep the
# same label proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Text classifier: TF-IDF over unigrams and bigrams feeding a logistic
# regression. min_df=2 ignores terms seen in fewer than two documents and
# max_df=0.9 ignores terms present in more than 90% of documents.
# class_weight="balanced" reweights classes inversely to their frequency.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.9)),
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced")),
])

# Last expression of the cell: fits the pipeline and displays it.
pipe.fit(X_train, y_train)

In [5]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Score the fitted pipeline on the held-out split.
pred = pipe.predict(X_test)

# Overall accuracy plus macro-averaged F1 (unweighted mean of per-class F1).
# Both values are reused later when the model metadata is written.
acc, f1m = (
    accuracy_score(y_test, pred),
    f1_score(y_test, pred, average="macro"),
)

# NOTE(review): if these come out perfect, check whether the label is
# trivially derivable from the log text before trusting the model.
for metric_label, metric_value in (("accuracy:", acc), ("macro_f1:", f1m)):
    print(metric_label, metric_value)

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, pred))


accuracy: 1.0
macro_f1: 1.0
              precision    recall  f1-score   support

    db_error       1.00      1.00      1.00       216
     unknown       1.00      1.00      1.00       184

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [6]:
import json
from datetime import datetime, timezone

import joblib

# Persist the fitted pipeline (vectorizer + classifier) as a single artifact.
joblib.dump(pipe, "incident_clf.joblib")

# datetime.utcnow() is deprecated (Python 3.12+) and returns a naive value.
# Build a timezone-aware UTC timestamp instead and keep the same "...Z"
# suffix the metadata used before.
created_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

# Record the label set the same way the training labels were prepared
# (non-null values cast to str), so a missing label cannot break sorting
# or leak a NaN into the JSON.
classes = sorted(df["incident_type"].dropna().astype(str).unique().tolist())

meta = {
    "model_type": "tfidf_logreg",
    "created_at": created_at,
    "data_version": "incidents_v1.csv (from HDFS_2k structured)",
    "metrics": {"accuracy": float(acc), "macro_f1": float(f1m)},
    "classes": classes,
}

# Write the metadata next to the model; explicit encoding keeps the file
# independent of the platform's default locale.
with open("model_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("Saved:", "incident_clf.joblib", "model_meta.json")


Saved: incident_clf.joblib model_meta.json


  "created_at": datetime.utcnow().isoformat() + "Z",


In [8]:
from google.colab import files

# Pass each saved artifact to Colab's download helper: model first, then metadata.
for artifact in ("incident_clf.joblib", "model_meta.json"):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>