In [None]:
# Install nlpaug, which provides the word-level augmenter used later in this notebook.
!pip install nlpaug

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification , AutoTokenizer,AutoModelForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
from torch.utils.data import WeightedRandomSampler
import os
os.environ["WANDB_DISABLED"] = "true"
import warnings
warnings.filterwarnings("default")
import nlpaug.augmenter.word as naw
import pandas as pd

In [None]:
# Teacher: multilingual cased BERT with a 42-way sequence-classification head.
teacher_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
)
teacher_model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=42,
)

In [None]:
# Load the raw e-mail queries from the Kaggle input dataset.
data = pd.read_excel("/kaggle/input/d/narvindalt/something/AG - AI 2.xlsx")

# Model input: subject and description joined by one space; a missing part counts as "".
data["text"] = (
    data["Email Subject"].fillna("")
    + " "
    + data["Email Query Discerption"].fillna("")
)

In [None]:
# Drop every row that still has a missing value in any column.
# NOTE(review): the raw subject/description columns were not filled in place when "text" was
# built, so rows missing either of them are dropped here as well — confirm that is intended.
data = data.dropna()

In [None]:
# Continue with a frame that no longer carries the "Query Category" column;
# "Query Item" is the column used as the classification target further down.
data2 = data.drop(columns=["Query Category"])
data2

In [None]:
# Keep only the second '-'-separated piece of each "Query Item" value (index 1 after the split).
# NOTE(review): values without a '-' become NaN here, and re-running this cell splits the
# already-shortened values again.
data2['Query Item'] = data2['Query Item'].str.split('-').str[1]

In [None]:
# Inspect the frame after shortening the "Query Item" values.
data2

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the category names as consecutive integer ids (0 .. n_classes - 1) for the classifier.
le = LabelEncoder()
data2["Query Item"] = le.fit_transform(data2["Query Item"])

# The classification head was created with a fixed number of outputs. Fail here with a readable
# message instead of hitting an opaque indexing error inside the loss during training.
if len(le.classes_) > teacher_model.config.num_labels:
    raise ValueError(
        f"Found {len(le.classes_)} distinct 'Query Item' classes but the model head only has "
        f"{teacher_model.config.num_labels} outputs."
    )

In [None]:
# Inspect the frame after "Query Item" has been replaced by integer class ids.
data2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of samples per encoded category.
sns.countplot(x=data2["Query Item"]).set_title("Category Distribution")
plt.xticks(rotation=90)
plt.show()

In [None]:
import nlpaug.augmenter.word as naw
import pandas as pd
from collections import Counter

# Word-level contextual augmenter in "substitute" mode, backed by bert-base-uncased.
# Runs on the GPU when one is available.
augmenter = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
# Number of samples per encoded category; the size of the largest class is the target that the
# oversampling step brings every other class up to.
label_counts = data2["Query Item"].value_counts()
max_count = label_counts.max()

print(label_counts)

In [None]:
from tqdm import tqdm

# Oversample every class that is smaller than the largest one by generating augmented
# variants of its existing texts until the sizes match.
augmented_texts = []
augmented_labels = []

for label, count in label_counts.items():
    if count >= max_count:
        continue  # already as large as the biggest class

    texts = data2.loc[data2["Query Item"] == label, "text"].tolist()
    needed = max_count - count

    # Repeat the class's texts often enough to cover `needed`, then cut to exactly that many.
    repeats = needed // len(texts) + 1
    texts_to_augment = (texts * repeats)[:needed]

    print(f"Augmenting class '{label}' with {needed} samples...")
    for text in tqdm(texts_to_augment, desc=f"Augmenting '{label}'", leave=False):
        try:
            aug_text = augmenter.augment(text)
            # The augmenter may hand back either a plain string or a list of strings.
            if not isinstance(aug_text, str):
                aug_text = aug_text[0]
            augmented_texts.append(aug_text)
            augmented_labels.append(label)
        except Exception as e:
            # Best effort: report the failing sample and move on to the next one.
            print(f"Error augmenting: {text} → {e}")


In [None]:
# Append the synthetic samples to the frame and renumber the rows from 0.
# The new rows only carry "text" and "Query Item".
aug_data = pd.DataFrame({"text": augmented_texts, "Query Item": augmented_labels})
data2 = pd.concat([data2, aug_data], ignore_index=True)

print(data2["Query Item"].value_counts())

In [None]:
# Persist the augmented dataset; with the default to_csv settings the row index is written
# as the first column.
data2.to_csv("/kaggle/working/aug_df.csv")

In [None]:
# Stratified 80/20 train/validation split over the augmented dataset.
# NOTE(review): augmentation ran before this split, so variants of one source text can end up
# on both sides of it — validation scores may be optimistic.
texts = data2["text"].tolist()
categorys = data2["Query Item"].tolist()
true_labels = list(data2["Query Item"].unique())
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, categorys, test_size=0.2, random_state=42, stratify=categorys
)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, max_length=..., padding=..., truncation=...,
            return_tensors=...)`` and returning a mapping of tensors with a leading batch
            dimension of 1.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for sample ``idx`` plus a ``labels`` tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors shaped (1, max_len); drop the batch dimension so the
        # DataLoader can stack individual samples itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Cross-entropy expects int64 class indices. Force the dtype so labels that arrive as
        # numpy int32 or float values do not yield a tensor the loss rejects.
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

In [None]:
# Datasets tokenized with the teacher's tokenizer.
train_dataset = TextDataset(train_texts, train_labels, teacher_tokenizer)
val_dataset = TextDataset(val_texts, val_labels, teacher_tokenizer)

# Reshuffle the training samples every epoch so the batches are not presented in the same
# fixed order each time; validation keeps a stable order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move the teacher there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
teacher_model.to(device)

In [None]:
class EarlyStopping:
    """Flag when a monitored loss has stopped improving.

    Call the instance once per epoch with the validation loss. A value counts as an
    improvement when it is lower than the best value so far by more than ``min_delta``.
    After ``patience`` consecutive calls without improvement, ``early_stop`` becomes True.
    """

    def __init__(self, patience=3, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = None   # lowest loss seen so far (None until the first call)
        self.counter = 0        # consecutive calls without improvement
        self.early_stop = False

    def __call__(self, val_loss):
        improved = self.best_loss is None or val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True


In [None]:
import torch.nn as nn

# Cross-entropy over the class logits; used as the loss when training and validating the teacher.
loss_fn = nn.CrossEntropyLoss()

In [None]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Fine-tune the teacher on the training split and validate after every epoch.
optimizer = AdamW(teacher_model.parameters(), lr=5e-5, weight_decay=0.0005)

# Lower the learning rate when the validation loss stops improving.
# NOTE(review): factor=0.001 cuts the rate a thousandfold on the first reduction — confirm this
# is intended (the library default is 0.1).
scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.001, patience=1, verbose=True, min_lr=1e-50
)

# NOTE(review): with only 2 epochs below, a patience of 10 can never trigger.
early_stopping = EarlyStopping(patience=10, min_delta=0.0001)

for epoch in range(2):
    # Switch back to training mode at the start of each epoch (validation sets eval mode).
    teacher_model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # The loss is computed explicitly with loss_fn, so the labels are not passed to the
        # model (that would make it compute a second, unused loss internally).
        outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        loop.set_postfix(loss=loss.item())

    train_loss = total_loss / len(train_loader)
    train_acc = accuracy_score(all_labels, all_preds)

    teacher_model.eval()
    val_loss = 0
    val_preds = []
    # Deliberately NOT named `val_labels`: that name holds the validation labels produced by
    # train_test_split and is reused later to build further datasets.
    val_targets = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

            val_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_targets.extend(labels.cpu().numpy())

    # Mean of the per-batch losses.
    val_loss /= len(val_loader)
    val_acc = accuracy_score(val_targets, val_preds)

    scheduler.step(val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    early_stopping(val_loss)
    if early_stopping.early_stop:
        print("Early stopping triggered.")
        break

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Student: cased DistilBERT with a 42-way classification head.
# AutoTokenizer resolves the tokenizer class that belongs to the checkpoint, instead of forcing
# the BERT tokenizer class onto a DistilBERT checkpoint.
# NOTE(review): this checkpoint's vocabulary is not the teacher's (bert-base-multilingual-cased),
# so token ids produced by one tokenizer must not be fed to the other model.
student_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
student_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased", num_labels=42
).to(device)

In [None]:
# Put both models on the same device (GPU when available).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
teacher_model.to(device)
student_model.to(device)

# The teacher only provides targets from here on: it is put in eval mode, and the optimizer
# is built over the student's parameters alone.
teacher_model.eval()
optimizer = AdamW(
    student_model.parameters(),
    lr=5e-5,
    weight_decay=0.0005,
)

In [None]:
# Datasets tokenized with the student's tokenizer.
train_dataset = TextDataset(train_texts, train_labels, student_tokenizer)
val_dataset = TextDataset(val_texts, val_labels, student_tokenizer)

# Reshuffle the training samples every epoch; validation keeps a stable order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, true_labels, temperature=4.0, alpha=0.5):
    """Blend a soft-target distillation term with the ordinary hard-label loss.

    Args:
        student_logits: Raw student scores; softmax is taken over dim 1.
        teacher_logits: Raw teacher scores for the same samples.
        true_labels: Class-index targets for the cross-entropy term.
        temperature: Divisor applied to both sets of logits before the softmax.
        alpha: Weight of the distillation term; the cross-entropy term gets ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * KD + (1 - alpha) * CE``.
    """
    # Temperature-softened distributions: log-probabilities for the student (as kl_div
    # expects for its input) and probabilities for the teacher.
    student_log_probs = F.log_softmax(student_logits / temperature, dim=1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=1)

    # KL term summed over classes and averaged over the batch, rescaled by T^2.
    soft_loss = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * (temperature ** 2)

    # Standard cross-entropy against the ground-truth labels.
    hard_loss = F.cross_entropy(student_logits, true_labels)

    return alpha * soft_loss + (1 - alpha) * hard_loss


In [None]:
num_epochs = 2


class _DistillationDataset(Dataset):
    """Pair each student-tokenized sample with the teacher-tokenized encoding of the same text."""

    def __init__(self, student_dataset, teacher_dataset):
        self.student_dataset = student_dataset
        self.teacher_dataset = teacher_dataset

    def __len__(self):
        return len(self.student_dataset)

    def __getitem__(self, idx):
        item = dict(self.student_dataset[idx])
        teacher_item = self.teacher_dataset[idx]
        item["teacher_input_ids"] = teacher_item["input_ids"]
        item["teacher_attention_mask"] = teacher_item["attention_mask"]
        return item


# The teacher and student checkpoints use different vocabularies, so each model must see the
# text encoded by its own tokenizer. Feeding the student's token ids to the teacher would make
# its soft targets meaningless.
distill_loader = DataLoader(
    _DistillationDataset(
        TextDataset(train_texts, train_labels, student_tokenizer),
        TextDataset(train_texts, train_labels, teacher_tokenizer),
    ),
    batch_size=train_loader.batch_size,
    shuffle=True,  # new sample order every epoch
)

teacher_model.eval()
for epoch in range(num_epochs):
    student_model.train()
    total_loss = 0
    correct = 0
    total = 0

    loop = tqdm(distill_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training", leave=False)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The teacher is frozen: no gradients are needed for its forward pass.
        with torch.no_grad():
            teacher_outputs = teacher_model(
                input_ids=batch["teacher_input_ids"].to(device),
                attention_mask=batch["teacher_attention_mask"].to(device),
            )
            teacher_logits = teacher_outputs.logits

        student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
        student_logits = student_outputs.logits

        loss = distillation_loss(student_logits, teacher_logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(student_logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        loop.set_postfix(loss=loss.item(), acc=100 * correct / total)

    train_accuracy = 100 * correct / total
    # Report the mean per-batch loss rather than the sum over all batches, so the number is
    # comparable across dataset sizes and with the teacher's training log.
    train_loss = total_loss / len(distill_loader)
    print(f"\nEpoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.2f}%")

    student_model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    val_loop = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation", leave=False)
    with torch.no_grad():
        for batch in val_loop:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Validation uses plain cross-entropy against the true labels (no teacher term).
            loss = F.cross_entropy(logits, labels)
            val_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

            val_loop.set_postfix(val_loss=loss.item())

    val_accuracy = 100 * val_correct / val_total
    val_loss /= len(val_loader)  # mean per-batch loss
    print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.2f}%\n")


In [None]:
def evaluate(model, data_loader, device, desc="Evaluation"):
    """Score a sequence-classification model on every batch of ``data_loader``.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)`` whose output
            exposes ``.logits``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask`` and ``labels``.
        device: Device the model and each batch are moved to.
        desc: Label shown on the progress bar.

    Returns:
        ``(accuracy, avg_loss)``: accuracy in percent over all samples, and the mean of the
        per-batch cross-entropy losses.
    """
    model.eval()
    model.to(device)

    n_correct = 0
    n_seen = 0
    loss_sum = 0

    progress = tqdm(data_loader, desc=desc)
    with torch.no_grad():
        for batch in progress:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits
            batch_loss = F.cross_entropy(logits, targets).item()
            loss_sum += batch_loss

            n_correct += (torch.argmax(logits, dim=1) == targets).sum().item()
            n_seen += targets.size(0)

            # Running accuracy so far, shown next to the current batch loss.
            progress.set_postfix(loss=batch_loss, acc=100 * n_correct / n_seen)

    return 100 * n_correct / n_seen, loss_sum / len(data_loader)


# --- Teacher: rebuild the loaders with the teacher's tokenizer and score the validation split.
# The train dataset/loader are rebuilt too, but only val_loader is passed to evaluate().
train_dataset = TextDataset(train_texts, train_labels, teacher_tokenizer)
val_dataset = TextDataset(val_texts, val_labels, teacher_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

teacher_acc, teacher_loss = evaluate(teacher_model, val_loader, device)

print(f"Final Teacher Accuracy: {teacher_acc:.2f}% | Loss: {teacher_loss:.4f}")


# --- Student: same validation texts, re-encoded with the student's tokenizer.
# After this point the loader variables refer to the student-tokenized data.
train_dataset = TextDataset(train_texts, train_labels, student_tokenizer)
val_dataset = TextDataset(val_texts, val_labels, student_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

student_acc, student_loss = evaluate(student_model, val_loader, device)
print(f"Final Student Accuracy: {student_acc:.2f}% | Loss: {student_loss:.4f}")

In [None]:
# Save the distilled student together with its tokenizer so the directory can be reloaded for
# inference on its own.
# NOTE(review): the LabelEncoder mapping (`le.classes_`) is not persisted here, so predicted
# class ids cannot be turned back into category names from this directory alone.
student_model.save_pretrained("/kaggle/working/model")
student_tokenizer.save_pretrained("/kaggle/working/model")