
# Hierarchical 2-stage pipeline (parent -> children)
Notebook tách từ script gốc, để chạy thử trên Colab.


In [1]:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# faiss optional
try:
    import faiss
    FAISS_AVAILABLE = True
except Exception:
    FAISS_AVAILABLE = False
    from sklearn.neighbors import NearestNeighbors


In [12]:

# Toy corpus: one short free-text description per student.
X_texts = [
    "student likes math and programming",
    "student loves physics and chemistry",
    "student enjoys literature and history",
    "student interested in biology and chemistry",
    "student good at math and statistics",
    "student passionate about computer science and ai",
    "student loves art and design",
    "student enjoys economics and finance"
]

# Fine-grained (child) labels; entry i belongs to X_texts[i].
y_children = [
    ["Math", "CS"],
    ["Physics", "Chemistry"],
    ["Literature", "History"],
    ["Biology", "Chemistry"],
    ["Math", "Statistics"],
    ["CS", "AI"],
    ["Art", "Design"],
    ["Economics", "Finance"]
]

# Each child label maps to exactly one parent category.
child2parent = {
    # Science
    "Math": "Science",
    "Physics": "Science",
    "Chemistry": "Science",
    "Biology": "Science",
    "Statistics": "Science",
    # Arts
    "Literature": "Arts",
    "History": "Arts",
    "Art": "Arts",
    "Design": "Arts",
    # Business
    "Economics": "Business",
    "Finance": "Business",
    # ComputerScience
    "CS": "ComputerScience",
    "AI": "ComputerScience",
}

# Lift every sample's child labels to a sorted, de-duplicated list of parents.
# A sample whose children span several parents (e.g. Math + CS) gets several.
y_parents = []
for child_labels in y_children:
    print(child_labels)
    parent_set = set()
    for label in child_labels:
        parent_set.add(child2parent[label])
    parents = sorted(parent_set)
    print(parents)
    y_parents.append(parents)


['Math', 'CS']
['ComputerScience', 'Science']
['Physics', 'Chemistry']
['Science']
['Literature', 'History']
['Arts']
['Biology', 'Chemistry']
['Science']
['Math', 'Statistics']
['Science']
['CS', 'AI']
['ComputerScience']
['Art', 'Design']
['Arts']
['Economics', 'Finance']
['Business']


In [9]:
# Sanity check: show the parent label lists derived from the child labels.
print(y_parents)

[['ComputerScience', 'Science'], ['Science'], ['Arts'], ['Science'], ['Science'], ['ComputerScience'], ['Arts'], ['Business']]


In [13]:

# Multi-hot encode the parent label lists; classes_ fixes the column order.
mlb_parents = MultiLabelBinarizer()
Y_parents = mlb_parents.fit_transform(y_parents)
parent_names = list(mlb_parents.classes_)

# Same encoding for the child label lists.
mlb_children = MultiLabelBinarizer()
Y_children = mlb_children.fit_transform(y_children)
child_names = list(mlb_children.classes_)

print("Parents:", parent_names)
print("Children:", child_names)
print("Y_parents shape:", Y_parents.shape, "Y_children shape:", Y_children.shape)

# Group child names under their parent. Walking child_names (the column
# order of Y_children) keeps each group's children in column order.
parent_to_children = {}
for c in child_names:
    p = child2parent[c]
    if p not in parent_to_children:
        parent_to_children[p] = []
    parent_to_children[p].append(c)

# Parent -> column indices of its children inside Y_children.
child_name_to_idx = dict(zip(child_names, range(len(child_names))))
parent_child_indices = {
    parent: [child_name_to_idx[kid] for kid in kids]
    for parent, kids in parent_to_children.items()
}

print("Parent -> child names:")
for p, childs in parent_to_children.items():
    print(f"  {p}: {childs}")


Parents: ['Arts', 'Business', 'ComputerScience', 'Science']
Children: ['AI', 'Art', 'Biology', 'CS', 'Chemistry', 'Design', 'Economics', 'Finance', 'History', 'Literature', 'Math', 'Physics', 'Statistics']
Y_parents shape: (8, 4) Y_children shape: (8, 13)
Parent -> child names:
  ComputerScience: ['AI', 'CS']
  Arts: ['Art', 'Design', 'History', 'Literature']
  Science: ['Biology', 'Chemistry', 'Math', 'Physics', 'Statistics']
  Business: ['Economics', 'Finance']


In [4]:

# Fit TF-IDF on the toy corpus and keep a dense float32 copy of the matrix;
# both neighbour back-ends below are fed this same array.
vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X_texts).toarray().astype('float32')

# Build exactly one nearest-neighbour index, depending on whether the
# optional faiss import succeeded.
if not FAISS_AVAILABLE:
    default_neighbors = min(10, len(X_vec))
    nn_index = NearestNeighbors(n_neighbors=default_neighbors, metric='euclidean')
    nn_index.fit(X_vec)
    print("FAISS not available — using sklearn NearestNeighbors fallback.")
else:
    dim = X_vec.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(X_vec)
    print("FAISS index created. d=", dim)


FAISS not available — using sklearn NearestNeighbors fallback.


In [5]:

# Stage 1: one-vs-rest logistic regression over the parent categories,
# trained on all samples against the multi-hot parent matrix.
parent_base_estimator = LogisticRegression(max_iter=1000)
parent_clf = OneVsRestClassifier(parent_base_estimator)
parent_clf.fit(X_vec, Y_parents)
print("Parent classifier trained. Classes:", parent_names)


Parent classifier trained. Classes: ['Arts', 'Business', 'ComputerScience', 'Science']


In [6]:

# Stage 2: for every parent, train a one-vs-rest classifier over that
# parent's children, using only the samples that carry the parent label.
# child_clfs[p] is (classifier, child column indices), or None when the
# parent has fewer than two positive samples.
child_clfs = {}
for i, p in enumerate(parent_names):
    idx_pos = np.flatnonzero(Y_parents[:, i] == 1)

    if len(idx_pos) >= 2:
        cols = parent_child_indices[p]
        X_sub = X_vec[idx_pos]
        # Rows: samples of this parent; columns: this parent's children only.
        y_sub = Y_children[idx_pos][:, cols]

        clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
        clf.fit(X_sub, y_sub)
        child_clfs[p] = (clf, cols)
        print(f"Trained child clf for parent {p} — children columns: {cols}")
    else:
        child_clfs[p] = None
        print(f"[WARN] Parent {p} has {len(idx_pos)} positive samples -> no child classifier trained.")


Trained child clf for parent Arts — children columns: [1, 5, 8, 9]
[WARN] Parent Business has 1 positive samples -> no child classifier trained.
Trained child clf for parent ComputerScience — children columns: [0, 3]
Trained child clf for parent Science — children columns: [2, 4, 10, 11, 12]


In [7]:

def predict_hierarchical(text: str, parent_threshold: float = 0.4, child_threshold: float = 0.5,
                         k: int = 3, alpha: float = 0.6) -> dict:
    """Predict parent categories for ``text``, then child labels under each active parent.

    Stage 1 blends the parent classifier's probabilities with the mean
    parent-label vector of the nearest training samples::

        parent_probs = alpha * clf_probs + (1 - alpha) * neighbour_mean

    Every parent whose blended score is >= ``parent_threshold`` is "active".
    Stage 2 scores the children of each active parent, using that parent's
    child classifier when one was trained, otherwise the mean child-label
    vector of the neighbours.

    Relies on notebook-level state: ``vectorizer``, ``parent_clf``,
    ``child_clfs``, ``parent_child_indices``, ``parent_names``,
    ``child_names``, ``X_vec``, ``Y_parents``, ``Y_children`` and either
    ``faiss_index`` or ``nn_index`` (selected by ``FAISS_AVAILABLE``).

    Args:
        text: Raw query text.
        parent_threshold: Minimum blended score for a parent to be active.
        child_threshold: Minimum score for a child label to be predicted.
        k: Number of neighbours requested. It is capped at the number of
            training samples; ``k <= 0`` disables the neighbour lookup.
        alpha: Weight of the classifier in the parent blend.

    Returns:
        dict with keys ``parent_probs`` (parent name -> blended score),
        ``active_parents`` (list of ``(name, score)``),
        ``predicted_children`` (sorted unique child names),
        ``children_probs`` (child name -> score for every child of every
        active parent that could be scored) and
        ``nearest_neighbor_indices`` (training-row indices actually used).
    """
    xq = vectorizer.transform([text]).toarray().astype('float32')

    parent_probs_clf = parent_clf.predict_proba(xq).reshape(-1)

    # Cap k for BOTH back-ends: sklearn raises when asked for more neighbours
    # than samples, while FAISS silently pads the result with index -1.
    k_eff = min(k, len(X_vec))
    if k_eff <= 0:
        neigh_idx = np.array([], dtype=int)
    elif FAISS_AVAILABLE:
        _, I = faiss_index.search(xq, k_eff)
        neigh_idx = np.asarray(I[0])
    else:
        _, I = nn_index.kneighbors(xq, n_neighbors=k_eff)
        neigh_idx = np.asarray(I[0])

    # Drop FAISS "no neighbour" placeholders; a -1 would otherwise wrap
    # around and select the last training row in the lookups below.
    neigh_idx = neigh_idx[neigh_idx >= 0]

    if len(neigh_idx) > 0:
        neigh_parent_mean = Y_parents[neigh_idx].mean(axis=0)
    else:
        # No neighbour evidence: the neighbour term contributes zeros.
        neigh_parent_mean = np.zeros_like(parent_probs_clf)

    parent_probs = alpha * parent_probs_clf + (1.0 - alpha) * neigh_parent_mean

    active_parents = [
        (parent_names[i], float(parent_probs[i]))
        for i in range(len(parent_names))
        if parent_probs[i] >= parent_threshold
    ]

    predicted_children = []
    children_probs = {}
    for pname, pprob in active_parents:
        clf_info = child_clfs.get(pname)
        if clf_info is None:
            # No child classifier for this parent: fall back to the share of
            # neighbours carrying each child label.
            cols = parent_child_indices[pname]
            if len(neigh_idx) > 0:
                neigh_child_mean = Y_children[neigh_idx][:, cols].mean(axis=0)
                for j, cidx in enumerate(cols):
                    score = float(neigh_child_mean[j])
                    # Record every score, as the classifier branch does,
                    # not only the ones that clear the threshold.
                    children_probs[child_names[cidx]] = score
                    if score >= child_threshold:
                        predicted_children.append(child_names[cidx])
            continue

        clf, cols = clf_info
        # Single query row -> flatten to one probability per child column.
        probs = np.array(clf.predict_proba(xq)).reshape(-1)
        for j, child_col_idx in enumerate(cols):
            prob = float(probs[j])
            if prob >= child_threshold:
                predicted_children.append(child_names[child_col_idx])
            children_probs[child_names[child_col_idx]] = prob

    result = {
        "parent_probs": { parent_names[i]: float(parent_probs[i]) for i in range(len(parent_names)) },
        "active_parents": active_parents,
        "predicted_children": sorted(set(predicted_children)),
        "children_probs": children_probs,
        "nearest_neighbor_indices": neigh_idx.tolist()
    }
    return result


In [8]:

# Run the two-stage pipeline on an unseen sentence and print each part of
# the returned dict.
q = "this student is very good at math and loves programming and ai"
out = predict_hierarchical(q, parent_threshold=0.3, child_threshold=0.4, k=3, alpha=0.6)

print("Parent probabilities:")
for p, pr in out["parent_probs"].items():
    print(f"  {p:16s}: {pr:.3f}")

# Remaining fields are printed as "<heading> <value>", in this fixed order.
report_fields = (
    ("Active parents (>= threshold):", "active_parents"),
    ("Predicted child labels:", "predicted_children"),
    ("Child probabilities:", "children_probs"),
    ("Nearest neighbor indices:", "nearest_neighbor_indices"),
)
for heading, field in report_fields:
    print(heading, out[field])


Parent probabilities:
  Arts            : 0.130
  Business        : 0.064
  ComputerScience : 0.302
  Science         : 0.742
Active parents (>= threshold): [('ComputerScience', 0.30207953067762994), ('Science', 0.7423300428252578)]
Predicted child labels: ['AI', 'CS', 'Chemistry', 'Math']
Child probabilities: {'AI': 0.4749309404638527, 'CS': 1.0, 'Biology': 0.20434192930355896, 'Chemistry': 0.42560356781608344, 'Math': 0.5743964321839166, 'Physics': 0.23156395653002587, 'Statistics': 0.2880086673034899}
Nearest neighbor indices: [4, 0, 1]



### Notes & tuning
- Nếu không có `faiss` thì fallback `NearestNeighbors` vẫn chạy được.
- Có thể chỉnh `parent_threshold`, `child_threshold`, `k`, `alpha` để thay đổi hành vi dự đoán.
- Với dữ liệu thực cần train/test split và cross-validation trước khi chốt ngưỡng.
- Có thể lưu `vectorizer` và classifier bằng `joblib` để tái sử dụng.
