In [None]:
import ast
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import AutoTokenizer
import torch.nn as nn
from transformers import AutoModel

In [None]:
# Recreate the MultiTaskModel class

class MultiTaskModel(nn.Module):
    """Shared transformer encoder feeding six independent linear classification heads.

    The hidden state at sequence position 0 of the backbone output is passed
    through dropout and then through one ``nn.Linear`` layer per task.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_labels_dict: Maps each task name ("intent", "issue_type", "product",
            "urgency", "sentiment", "routing_queue") to its number of classes.
        dropout: Dropout probability applied to the pooled representation.
    """

    # Task names in head-creation order; each head is stored as "<task>_head".
    _TASKS = ("intent", "issue_type", "product", "urgency", "sentiment", "routing_queue")

    def __init__(self, model_name, num_labels_dict, dropout=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden_size = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        # Register one linear head per task under the attribute "<task>_head".
        for task in self._TASKS:
            setattr(self, f"{task}_head", nn.Linear(hidden_size, num_labels_dict[task]))

    def forward(self, input_ids, attention_mask):
        """Return a dict mapping each task name to that task's logits."""
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Pool by taking the hidden state at sequence position 0, then apply dropout.
        pooled = self.dropout(encoded.last_hidden_state[:, 0, :])
        return {task: getattr(self, f"{task}_head")(pooled) for task in self._TASKS}

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === 1. Load trained checkpoint (SAME as inference) ===
CKPT_PATH = "/content/drive/MyDrive/Colab Notebooks/TrainedModel/multitask_distilbert_clean.pt"

# The checkpoint is read below as a dict with the keys "model_name",
# "num_labels_dict" and "model_state_dict".
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
checkpoint = torch.load(CKPT_PATH, map_location=device)
model_name = checkpoint["model_name"]
num_labels_dict = checkpoint["num_labels_dict"]

# Rebuild the architecture first, then overwrite its weights with the saved ones.
model = MultiTaskModel(model_name, num_labels_dict).to(device)
model.load_state_dict(checkpoint["model_state_dict"])
# Evaluation mode, so dropout is inactive while embedding.
model.eval()

# Keep a direct handle on the encoder; embed_texts calls it without the task heads.
backbone = model.backbone
backbone.eval()

# The tokenizer is loaded under the same model name that the backbone was built from.
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Model reloaded successfully.")

# === 2. Load KB CSV ===
KB_PATH = "/content/drive/MyDrive/Colab Notebooks/DataSet/kb_policies_rag.csv"
kb_df = pd.read_csv(KB_PATH)

# These columns are parsed from their string form back into Python literals
# (presumably lists of tags -- confirm against the CSV).
list_cols = ["product_tags", "issue_type_tags", "intent_tags"]
for col in list_cols:
    kb_df[col] = kb_df[col].apply(ast.literal_eval)

print("KB loaded. Shape:", kb_df.shape)

Model reloaded successfully.
KB loaded. Shape: (22, 7)


In [None]:
# === 3. Embedding function using model.backbone ===
@torch.no_grad()
def embed_texts(texts, max_length=256, batch_size=16):
    """Embed one or more texts with the fine-tuned backbone using position-0 (CLS) pooling.

    Relies on the notebook-level ``tokenizer``, ``backbone`` and ``device``.

    Args:
        texts: A single string or an iterable of strings (list, tuple,
            pandas Series, ...).
        max_length: Token limit passed to the tokenizer; longer inputs are truncated.
        batch_size: Number of texts encoded per forward pass; must be >= 1.

    Returns:
        A CPU tensor of shape ``[len(texts), hidden_size]``. The vectors are
        NOT normalised here; normalise them after loading. Empty input yields
        a ``[0, hidden_size]`` tensor instead of raising.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialise so slicing is positional and the tokenizer always gets a plain list.
        texts = list(texts)

    if not texts:
        # torch.cat rejects an empty list, so return an explicit empty matrix.
        return torch.empty((0, backbone.config.hidden_size))

    all_embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        enc = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to(device)
        out = backbone(**enc).last_hidden_state   # [B, T, H]
        emb = out[:, 0, :]                        # CLS pooling (be consistent!)
        # Move each batch to the CPU right away so device memory is not accumulated.
        all_embs.append(emb.cpu())
    embs = torch.cat(all_embs, dim=0)
    return embs      # returned unnormalised; we'll normalise on load

# === 4. Build and save KB embeddings ===
# One embedding row per KB entry, computed from the "body_text" column.
kb_texts = kb_df["body_text"].tolist()
kb_embeddings = embed_texts(kb_texts)

KB_EMB_PATH = "/content/drive/MyDrive/Colab Notebooks/TrainedModel/kb_embeddings.pt"
# Persist the tensor exactly as embed_texts returned it (no normalisation is applied there).
torch.save(kb_embeddings, KB_EMB_PATH)

print("KB embeddings built and saved to:", KB_EMB_PATH)
print("KB embeddings shape:", kb_embeddings.shape)

KB embeddings built and saved to: /content/drive/MyDrive/Colab Notebooks/TrainedModel/kb_embeddings.pt
KB embeddings shape: torch.Size([22, 768])
