Генерация embeddings + feature matrix

In [1]:
import pandas as pd

# Location of the cleaned sample workbook on the Colab filesystem.
DATA_PATH = "/content/Ready_clean_sample.xlsx"

# Load the workbook into a DataFrame; later cells read their columns from `df`.
df = pd.read_excel(DATA_PATH)

In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- Load model ---
# The encoder gets its own name because a later cell rebinds `model` to the
# trained XGBoost booster; `generate_embeddings` must keep working after that.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = embedding_model  # backward-compatible alias for code that reads `model`

def generate_embeddings(
    texts,
    batch_size=64,
    normalize=True,
    encoder=None
):
    """Encode a sequence of texts into a 2-D array of sentence embeddings.

    Args:
        texts: Iterable of strings to encode.
        batch_size: Number of texts sent to the encoder per call; also passed
            through as the encoder's own forward-pass batch size.
        normalize: Forwarded to ``encode`` as ``normalize_embeddings``.
        encoder: Object exposing ``encode`` and
            ``get_sentence_embedding_dimension``; defaults to the module-level
            ``embedding_model``.

    Returns:
        ``np.ndarray`` with one row per input text. Empty input yields an
        array of shape ``(0, embedding_dim)``.
    """
    if encoder is None:
        encoder = embedding_model

    texts = list(texts)
    if not texts:
        # np.vstack([]) raises ValueError, so return a well-formed empty matrix.
        # float32 is assumed to match the encoder's output dtype.
        dim = encoder.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)

    embeddings = []

    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        emb = encoder.encode(
            batch,
            # Without this, encode() falls back to its own default batch size
            # and the `batch_size` argument would not control the forward pass.
            batch_size=batch_size,
            show_progress_bar=False,
            normalize_embeddings=normalize
        )
        embeddings.append(emb)

    return np.vstack(embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
# Missing texts become empty strings so that every row still gets an embedding.
text_column = df["text"].fillna("")
texts = text_column.tolist()

# Encode the whole corpus in chunks of 64 texts.
X_emb = generate_embeddings(texts, batch_size=64)

# Expected: (number of rows in df, embedding dimension).
print(X_emb.shape)


100%|██████████| 469/469 [10:27<00:00,  1.34s/it]

(30001, 384)





In [4]:
# Hand-crafted numeric columns that are appended to the text embeddings.
tabular_features = [
    "text_length",
    "word_count",
    "num_exclamations",
    "has_urgent"
]

# Request float32 explicitly: `.values` on mixed columns can yield an int64 or
# object array, which upcasts the stacked matrix to float64 (or breaks it).
# A non-numeric column now fails here instead of later in training.
X_tab = df[tabular_features].to_numpy(dtype=np.float32)


In [6]:
from numpy import hstack

# Join column-wise: embedding columns first, tabular columns after them.
X = np.concatenate([X_emb, X_tab], axis=1)

In [10]:
# Verify the feature matrix layout instead of assigning to `X.shape`:
# assignment is an in-place reshape, not a check, and could silently accept a
# wrong layout with the same number of elements.
expected_shape = (len(df), X_emb.shape[1] + len(tabular_features))
assert X.shape == expected_shape, (
    f"unexpected feature matrix shape {X.shape}, expected {expected_shape}"
)

print(X.shape)

(30001, 388)


Target

In [13]:
# Ordinal encoding of the priority labels.
label_map = {
    "low": 0,
    "medium": 1,
    "high": 2,
    "critical": 3
}

# `.map` turns every value missing from label_map into NaN; fail fast instead
# of passing NaN targets on to the split and the model.
priority_codes = df["priority_score"].map(label_map)
unmapped_rows = priority_codes.isna()
if unmapped_rows.any():
    unknown_values = sorted(
        df.loc[unmapped_rows, "priority_score"].astype(str).unique()
    )
    raise ValueError(
        f"priority_score contains values missing from label_map: {unknown_values}"
    )

# Integer class ids, one per row of df.
y = priority_codes.astype(int).values

Train/val split

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
split_options = {
    "test_size": 0.2,
    "stratify": y,
    "random_state": 42,
}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

Sanity-check эмбеддингов

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of the first five embeddings; the diagonal should
# be ~1 because each vector is compared with itself.
first_embeddings = X_emb[:5]
cosine_similarity(first_embeddings)

array([[1.0000001 , 0.48126093, 0.40945172, 0.71243304, 0.39938685],
       [0.48126093, 1.0000001 , 0.28587198, 0.42005306, 0.5955684 ],
       [0.40945172, 0.28587198, 0.9999999 , 0.525432  , 0.30601195],
       [0.71243304, 0.42005306, 0.525432  , 0.99999994, 0.40559393],
       [0.39938685, 0.5955684 , 0.30601195, 0.40559393, 1.        ]],
      dtype=float32)

Обучаем XGBoost

In [18]:
# Show train/validation shapes: features on the first line, targets on the second.
for train_part, val_part in ((X_train, X_val), (y_train, y_val)):
    print(train_part.shape, val_part.shape)

(24000, 388) (6001, 388)
(24000,) (6001,)


Работа с дисбалансом классов

In [19]:
import numpy as np

# Count how many training samples fall into each class id.
classes, counts = np.unique(y_train, return_counts=True)
class_counts = {class_id: n_samples for class_id, n_samples in zip(classes, counts)}
print(class_counts)

{np.int64(1): np.int64(1114), np.int64(2): np.int64(22369), np.int64(3): np.int64(517)}


In [20]:
from sklearn.utils.class_weight import compute_class_weight

# Class ids present in the training targets, sorted ascending.
train_classes = np.unique(y_train)

# "balanced" weights computed from the training targets only.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=train_classes,
    y=y_train
)

# Lookup table: class id -> weight.
class_weight_map = {
    class_id: weight for class_id, weight in zip(train_classes, class_weights)
}

# Per-sample weights: find each target's position in the sorted class list and
# take the weight stored at that position.
class_positions = np.searchsorted(train_classes, y_train)
sample_weights = class_weights[class_positions]


In [44]:
import xgboost as xgb

# Training rows carry the balanced per-sample weights; validation rows are
# left unweighted, so the monitored metric is plain multi-class log loss.
dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    "objective": "multi:softprob",
    # One output per entry of label_map, so class ids stay aligned with it
    # even when a class is absent from the training data.
    "num_class": len(label_map),
    "max_depth": 6,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
}

# NOTE: this rebinds `model`, the name the embedding cell uses for the
# sentence encoder.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dval, "val")],
    # Stop once validation log loss has not improved for 30 rounds.
    early_stopping_rounds=30,
    # Log every 25th round instead of flooding the output with one line each.
    verbose_eval=25
)


[0]	val-mlogloss:1.27659
[1]	val-mlogloss:1.24295
[2]	val-mlogloss:1.21210
[3]	val-mlogloss:1.18061
[4]	val-mlogloss:1.15033
[5]	val-mlogloss:1.12336
[6]	val-mlogloss:1.09778
[7]	val-mlogloss:1.07187
[8]	val-mlogloss:1.04954
[9]	val-mlogloss:1.02678
[10]	val-mlogloss:1.00458
[11]	val-mlogloss:0.98418
[12]	val-mlogloss:0.96435
[13]	val-mlogloss:0.94621
[14]	val-mlogloss:0.92689
[15]	val-mlogloss:0.90827
[16]	val-mlogloss:0.89169
[17]	val-mlogloss:0.87423
[18]	val-mlogloss:0.85773
[19]	val-mlogloss:0.84171
[20]	val-mlogloss:0.82842
[21]	val-mlogloss:0.81286
[22]	val-mlogloss:0.79869
[23]	val-mlogloss:0.78460
[24]	val-mlogloss:0.77131
[25]	val-mlogloss:0.75975
[26]	val-mlogloss:0.74830
[27]	val-mlogloss:0.73599
[28]	val-mlogloss:0.72370
[29]	val-mlogloss:0.71201
[30]	val-mlogloss:0.70121
[31]	val-mlogloss:0.69119
[32]	val-mlogloss:0.68016
[33]	val-mlogloss:0.66991
[34]	val-mlogloss:0.66076
[35]	val-mlogloss:0.65117
[36]	val-mlogloss:0.64183
[37]	val-mlogloss:0.63242
[38]	val-mlogloss:0.62

Предсказание

In [51]:
# X_val already exists; wrap it in a DMatrix (labels are not needed to predict).
dval = xgb.DMatrix(X_val)

# Predict with the best early-stopped iteration explicitly, so the result does
# not depend on which trees the installed XGBoost version uses by default.
best_iteration = getattr(model, "best_iteration", None)
if best_iteration is None:
    # No early-stopping information recorded: use all trees.
    y_proba = model.predict(dval)  # shape: (n_samples, num_class)
else:
    y_proba = model.predict(
        dval,
        iteration_range=(0, best_iteration + 1)
    )  # shape: (n_samples, num_class)

# Predicted class id = index of the highest probability.
y_pred = y_proba.argmax(axis=1)

<xgboost.core.DMatrix object at 0x7d265e0bd3d0>


Метрики

In [52]:
from sklearn.metrics import classification_report, confusion_matrix

# label_map encodes 0:low, 1:medium, 2:high, 3:critical, but not every class
# has to occur in the data. Report exactly the ids seen in y_val or y_pred and
# take their names from label_map, so names and rows cannot drift apart.
id_to_name = {class_id: name for name, class_id in label_map.items()}
report_labels = [int(class_id) for class_id in np.union1d(y_val, y_pred)]
report_names = [id_to_name[class_id] for class_id in report_labels]

print(classification_report(
    y_val,
    y_pred,
    labels=report_labels,
    target_names=report_names
))

print("Confusion Matrix:")
# Rows are true classes, columns are predictions, both in report_labels order.
print(confusion_matrix(y_val, y_pred, labels=report_labels))

              precision    recall  f1-score   support

      medium       0.81      0.43      0.56       279
        high       0.96      0.99      0.98      5593
    critical       0.90      0.57      0.70       129

    accuracy                           0.96      6001
   macro avg       0.89      0.66      0.75      6001
weighted avg       0.95      0.96      0.95      6001

Confusion Matrix:
[[ 119  160    0]
 [  27 5558    8]
 [   1   54   74]]


Бизнес метрики

In [54]:
from sklearn.metrics import recall_score

# Numeric id of the "critical" class.
critical_label = 3

# Restricting `labels` to a single class makes the macro average equal to the
# recall of that class alone.
recall_critical = recall_score(
    y_true=y_val,
    y_pred=y_pred,
    labels=[critical_label],
    average="macro",
)
print("Recall (critical):", recall_critical)

Recall (critical): 0.5736434108527132
