<a href="https://colab.research.google.com/github/541DeepLearning-Group8/models/blob/main/DeBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets -q

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import classification_report
from tqdm import tqdm

# Run configuration: everything a reader might want to tune lives here.
DEBERTA_MODEL = 'microsoft/deberta-v3-base'  # Hugging Face checkpoint name
BATCH_SIZE = 16
EPOCHS = 3
MAX_LEN = 128
LEARNING_RATE = 2e-5
NUM_CLASSES = 5
SEED = 42
# Single source of truth for the project folder on the mounted Drive.
DATA_DIR = '/content/drive/MyDrive/541project'

# Seed PyTorch's global RNG so shuffling and weight initialisation start from
# the same state on every fresh run. (GPU kernels may still be non-deterministic.)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# The CSV splits are read from Google Drive, so it has to be mounted first.
from google.colab import drive
drive.mount('/content/drive')

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
val_df = pd.read_csv(f'{DATA_DIR}/val.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_m

In [None]:
# Preprocess labels: subtract 1 from 'Rating' so the labels become 0~4.
for df in (train_df, val_df, test_df):
    df['label'] = df['Rating'].sub(1)

# Dataset definition
class TextDataset(Dataset):
    """Dataset that tokenizes every text once, at construction time.

    Args:
        texts: Texts passed to ``tokenizer`` in a single call.
        labels: Indexable collection of labels; its length is the dataset length.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding='max_length', max_length=max_len)``. Its result must offer
            ``.items()`` yielding ``(name, indexable_values)`` pairs.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )
        self.labels = labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        The dict has one tensor per tokenizer output field plus a ``'labels'``
        tensor of dtype ``torch.long``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Tokenizer that belongs to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL)

# One TextDataset per split, built in train -> val -> test order
train_dataset, val_dataset, test_dataset = (
    TextDataset(
        split_df['Clean Comments'].tolist(),
        split_df['label'].tolist(),
        tokenizer,
        MAX_LEN,
    )
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; val/test keep the DataLoader default order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE)
    for split_dataset in (val_dataset, test_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
# DeBERTa sequence classifier with NUM_CLASSES output labels, moved to `device`
model = AutoModelForSequenceClassification.from_pretrained(
    DEBERTA_MODEL,
    num_labels=NUM_CLASSES,
).to(device)

# optimizer & scheduler: AdamW with a linear schedule, no warmup,
# spanning one scheduler step per training batch across all epochs
num_training_steps = EPOCHS * len(train_loader)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [None]:
def train(model, dataloader):
    """Run one training pass over ``dataloader`` and report the mean batch loss.

    Uses the notebook-level ``device``, ``optimizer`` and ``lr_scheduler``.

    Args:
        model: Called as ``model(**batch)``; the result must expose ``.loss``.
        dataloader: Sized iterable of dict batches whose values support
            ``.to(device)``.

    Returns:
        float: Mean per-batch loss, or ``nan`` if ``dataloader`` has no batches.
    """
    model.train()
    num_batches = len(dataloader)
    total_loss = 0.0
    # If a previous run of this cell was interrupted between backward() and
    # zero_grad(), stale gradients would otherwise be added to the first step.
    optimizer.zero_grad()
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
    # Guard against an empty dataloader instead of raising ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Train Loss: {avg_loss:.4f}")
    return avg_loss

In [None]:
def evaluate(model, dataloader, desc="Validation"):
    """Predict over ``dataloader`` with gradients disabled and print a report.

    Uses the notebook-level ``device``. The predicted class for each sample is
    the argmax over the last dimension of the model's logits. The printed
    report comes from ``classification_report`` with four digits.

    Args:
        model: Called as ``model(**batch)``; the result must expose ``.logits``.
        dataloader: Iterable of dict batches containing a ``'labels'`` entry.
        desc: Label shown on the progress bar.
    """
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            batch_logits = model(**batch).logits
            y_pred.extend(batch_logits.argmax(dim=-1).cpu().numpy())
            y_true.extend(batch['labels'].cpu().numpy())
    print(classification_report(y_true, y_pred, digits=4))

In [None]:
# Fine-tune for EPOCHS passes; print validation metrics after each pass.
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train(model=model, dataloader=train_loader)
    print("Validation results:")
    evaluate(model=model, dataloader=val_loader)


Epoch 1/3



Training:   0%|          | 0/456 [00:00<?, ?it/s][A
Training:   0%|          | 1/456 [00:03<23:49,  3.14s/it][A
Training:   0%|          | 2/456 [00:04<13:45,  1.82s/it][A
Training:   1%|          | 3/456 [00:04<08:47,  1.17s/it][A
Training:   1%|          | 4/456 [00:04<06:45,  1.11it/s][A
Training:   1%|          | 5/456 [00:05<05:32,  1.36it/s][A
Training:   1%|▏         | 6/456 [00:05<04:35,  1.63it/s][A
Training:   2%|▏         | 7/456 [00:06<04:18,  1.74it/s][A
Training:   2%|▏         | 8/456 [00:06<03:43,  2.01it/s][A
Training:   2%|▏         | 9/456 [00:06<03:31,  2.11it/s][A
Training:   2%|▏         | 10/456 [00:07<03:24,  2.18it/s][A
Training:   2%|▏         | 11/456 [00:07<03:20,  2.22it/s][A
Training:   3%|▎         | 12/456 [00:08<03:21,  2.20it/s][A
Training:   3%|▎         | 13/456 [00:08<03:22,  2.18it/s][A
Training:   3%|▎         | 14/456 [00:09<03:16,  2.25it/s][A
Training:   3%|▎         | 15/456 [00:09<03:20,  2.20it/s][A
Training:   4%|▎         

Train Loss: 0.8535
Validation results:


Validation: 100%|██████████| 114/114 [00:16<00:00,  6.94it/s]


              precision    recall  f1-score   support

           0     0.6833    0.7885    0.7321       312
           1     0.3303    0.2769    0.3013       130
           2     0.4086    0.2517    0.3115       151
           3     0.5488    0.3169    0.4018       284
           4     0.8247    0.9566    0.8857       944

    accuracy                         0.7210      1821
   macro avg     0.5591    0.5181    0.5265      1821
weighted avg     0.6876    0.7210    0.6946      1821


Epoch 2/3


Training: 100%|██████████| 456/456 [03:24<00:00,  2.23it/s]


Train Loss: 0.6670
Validation results:


Validation: 100%|██████████| 114/114 [00:16<00:00,  7.01it/s]


              precision    recall  f1-score   support

           0     0.6919    0.8205    0.7507       312
           1     0.2576    0.1308    0.1735       130
           2     0.3429    0.2384    0.2812       151
           3     0.4877    0.4190    0.4508       284
           4     0.8465    0.9290    0.8859       944

    accuracy                         0.7166      1821
   macro avg     0.5253    0.5075    0.5084      1821
weighted avg     0.6803    0.7166    0.6939      1821


Epoch 3/3


Training: 100%|██████████| 456/456 [03:24<00:00,  2.23it/s]


Train Loss: 0.5655
Validation results:


Validation: 100%|██████████| 114/114 [00:16<00:00,  6.94it/s]

              precision    recall  f1-score   support

           0     0.7179    0.8077    0.7602       312
           1     0.3034    0.2077    0.2466       130
           2     0.4000    0.3179    0.3542       151
           3     0.4815    0.4120    0.4440       284
           4     0.8487    0.9153    0.8807       944

    accuracy                         0.7183      1821
   macro avg     0.5503    0.5321    0.5372      1821
weighted avg     0.6929    0.7183    0.7030      1821






In [None]:
# Report metrics on the test split, run here after the training loop.
print("\n📊 Final Evaluation on Test Set:")
evaluate(model=model, dataloader=test_loader, desc="Test")


📊 Final Evaluation on Test Set:


Test: 100%|██████████| 64/64 [00:09<00:00,  7.03it/s]

              precision    recall  f1-score   support

           0     0.7487    0.8439    0.7935       173
           1     0.4528    0.3288    0.3810        73
           2     0.4500    0.3214    0.3750        84
           3     0.4844    0.3924    0.4336       158
           4     0.8368    0.9198    0.8764       524

    accuracy                         0.7322      1012
   macro avg     0.5945    0.5613    0.5719      1012
weighted avg     0.7069    0.7322    0.7157      1012






In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Drive directory that receives both the fine-tuned model and its tokenizer.
save_path = "/content/drive/MyDrive/541project/deberta_model_1"

model.save_pretrained(save_path)

tokenizer.save_pretrained(save_path)

# The f-string previously closed before the placeholder, which was a SyntaxError.
print(f"Save successfully: {save_path}")


✅ 模型已保存到：/content/drive/MyDrive/541project/deberta_model_1


In [None]:
# Hugging Face format.
model.save_pretrained("/content/drive/MyDrive/541project/deberta_model_12")
# Also store the tokenizer next to the model so deberta_model_12 holds both.
tokenizer.save_pretrained("/content/drive/MyDrive/541project/deberta_model_12")
# Tokenizer-only directory written by the earlier version of this cell; kept so
# anything already pointing at it still finds the files.
tokenizer.save_pretrained("/content/drive/MyDrive/541project/deberta_model_13")

# PyTorch .pth: raw state_dict only (no config, no tokenizer).
torch.save(model.state_dict(), "/content/drive/MyDrive/541project/deberta_model_14.pth")


In [None]:
import os

# Destination of the raw PyTorch state_dict on the mounted Drive.
save_path = "/content/drive/MyDrive/541project/deberta_model_14.pth"

torch.save(model.state_dict(), save_path)

# Confirm that something now exists at the target path.
if not os.path.exists(save_path):
    print("Save failed!")
else:
    print("Save successfully", save_path)


✅ 模型成功保存到： /content/drive/MyDrive/541project/deberta_model_14.pth
