In [3]:
import pandas as pd
import torch
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from tqdm import tqdm


df = pd.read_csv("/content/mbti_1.csv")


def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["posts"] = df["posts"].astype(str).apply(clean_text)


mbti_types = df['type'].unique()
type_to_label = {mbti: idx for idx, mbti in enumerate(mbti_types)}
df["label"] = df["type"].map(type_to_label)


model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


encodings = tokenizer(list(df["posts"]), truncation=True, padding="max_length", max_length=256, return_tensors="pt")
input_ids = encodings["input_ids"]
attention_masks = encodings["attention_mask"]
labels = torch.tensor(df["label"].tolist())


sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(input_ids.numpy(), labels.numpy()):
    train_inputs, test_inputs = input_ids[train_index], input_ids[test_index]
    train_masks, test_masks = attention_masks[train_index], attention_masks[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]

train_inputs, test_inputs = torch.tensor(train_inputs), torch.tensor(test_inputs)
train_masks, test_masks = torch.tensor(train_masks), torch.tensor(test_masks)
train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)


batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))
model.to(device)


class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels.numpy()), y=labels.numpy())
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)


loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)


optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 15


for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0


    if epoch >= 10:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 2e-5

    for batch in tqdm(train_dataloader, desc="Training"):
        b_input_ids, b_masks, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)

        optimizer.zero_grad()


        outputs = model(input_ids=b_input_ids, attention_mask=b_masks, labels=b_labels)
        loss = outputs.loss


        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Avg Training Loss: {avg_loss:.4f}")

print("Training Complete!")


model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_masks, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)


        outputs = model(input_ids=b_input_ids, attention_mask=b_masks)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(label_ids)


accuracy = np.mean(np.array(predictions) == np.array(true_labels))
print(f"✅ Final Test Accuracy: {accuracy * 100:.2f}%")


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

  train_inputs, test_inputs = torch.tensor(train_inputs), torch.tensor(test_inputs)
  train_masks, test_masks = torch.tensor(train_masks), torch.tensor(test_masks)
  train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/15


Training:   0%|          | 0/434 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Training: 100%|██████████| 434/434 [03:42<00:00,  1.95it/s]


Avg Training Loss: 3.0941
Epoch 2/15


Training: 100%|██████████| 434/434 [03:45<00:00,  1.93it/s]


Avg Training Loss: 2.0667
Epoch 3/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 1.8629
Epoch 4/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 1.6764
Epoch 5/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 1.5016
Epoch 6/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 1.2824
Epoch 7/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 1.0927
Epoch 8/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.9610
Epoch 9/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.8144
Epoch 10/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.7180
Epoch 11/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.5814
Epoch 12/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.5105
Epoch 13/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.4560
Epoch 14/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.4165
Epoch 15/15


Training: 100%|██████████| 434/434 [03:44<00:00,  1.93it/s]


Avg Training Loss: 0.3813
Training Complete!
✅ Final Test Accuracy: 49.11%
