<a href="https://colab.research.google.com/github/541DeepLearning-Group8/models/blob/main/DeBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets -q

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import classification_report
from tqdm import tqdm

# Run configuration: checkpoint identifier and fine-tuning hyperparameters.
DEBERTA_MODEL = 'microsoft/deberta-v3-base'  # passed to from_pretrained()
NUM_CLASSES = 5        # size of the classification head (num_labels)
MAX_LEN = 128          # tokenizer max_length; inputs are truncated/padded to it
BATCH_SIZE = 16        # batch size shared by the train/val/test loaders
EPOCHS = 3             # number of full passes over the training loader
LEARNING_RATE = 2e-5   # AdamW learning rate

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


from google.colab import drive

# Mount Google Drive so the project CSVs are reachable under /content/drive.
drive.mount('/content/drive')

# Read the three pre-made splits in one pass; order is train, val, test.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/541project/train.csv',
        '/content/drive/MyDrive/541project/val.csv',
        '/content/drive/MyDrive/541project/test.csv',
    ),
)

In [None]:
# Derive the zero-based class index from the 'Rating' column on every split.
# NOTE(review): presumably ratings run 1..5 so labels land in 0..NUM_CLASSES-1
# -- confirm against the data.
for split_df in (train_df, val_df, test_df):
    split_df['label'] = split_df['Rating'].sub(1)

# Dataset
class TextDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with integer labels.

    All texts are tokenized once, at construction time, with truncation and
    ``padding='max_length'`` so every sample has the same length.

    Args:
        texts: Texts handed to ``tokenizer`` in a single call.
        labels: Per-sample labels, indexable by position; also defines the
            dataset length.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding='max_length',
            max_length=max_len)``. Its result must expose ``.items()`` with
            values that can be indexed per sample.
        max_len: Value forwarded as the tokenizer's ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # The label is stored as int64 (torch.long).
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL)


def _build_dataset(df):
    """Wrap one split's comments and labels in a ``TextDataset``.

    ``pd.read_csv`` parses empty cells as NaN (a float), so a comment that
    was blank after cleaning would reach the tokenizer as a non-string.
    Missing values are replaced with '' and everything is coerced to ``str``
    so row order and label alignment are preserved.

    Args:
        df: DataFrame with a 'Clean Comments' text column and a 'label'
            column.

    Returns:
        A ``TextDataset`` tokenized with the module-level ``tokenizer`` and
        ``MAX_LEN``.
    """
    texts = df['Clean Comments'].fillna('').astype(str).tolist()
    return TextDataset(texts, df['label'].tolist(), tokenizer, MAX_LEN)


# datasets
train_dataset = _build_dataset(train_df)
val_dataset = _build_dataset(val_df)
test_dataset = _build_dataset(test_df)

# Only the training loader shuffles; val/test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# DeBERTa sequence classifier with a NUM_CLASSES-way head, moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    DEBERTA_MODEL,
    num_labels=NUM_CLASSES,
)
model.to(device)

# optimizer & scheduler
# The scheduler is stepped once per batch, so its horizon is
# (batches per epoch) * EPOCHS; the linear schedule uses no warmup steps.
num_training_steps = EPOCHS * len(train_loader)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def train(model, dataloader):
    """Run one training epoch over ``dataloader`` and print the mean loss.

    Uses the module-level ``device``, ``optimizer`` and ``lr_scheduler``.
    The scheduler is advanced after every optimizer step (once per batch).

    Args:
        model: Model called as ``model(**batch)``; its output must expose
            ``.loss``.
        dataloader: Iterable of dict batches whose values support ``.to()``.

    Returns:
        None. The average per-batch loss is printed.
    """
    model.train()
    running_loss = 0
    for raw_batch in tqdm(dataloader, desc="Training"):
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        loss = model(**inputs).loss
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()

    mean_loss = running_loss / len(dataloader)
    print(f"Train Loss: {mean_loss:.4f}")

In [None]:
def evaluate(model, dataloader, desc="Validation"):
    """Predict over ``dataloader`` and print a classification report.

    Runs in eval mode with gradients disabled, takes the argmax of the
    logits as the predicted class, and prints sklearn's
    ``classification_report`` with four decimal digits.

    Args:
        model: Model called as ``model(**batch)``; its output must expose
            ``.logits``.
        dataloader: Iterable of dict batches containing a ``'labels'`` entry.
        desc: Label shown on the progress bar.

    Returns:
        None. The report is printed.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for raw_batch in tqdm(dataloader, desc=desc):
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**inputs).logits
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())
            expected.extend(inputs['labels'].cpu().numpy())
    print(classification_report(expected, predicted, digits=4))

In [None]:
# Fine-tune for EPOCHS passes, reporting validation metrics after each one.
for epoch_number in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch_number}/{EPOCHS}")
    train(model, train_loader)
    print("Validation results:")
    evaluate(model, val_loader)

In [None]:
# Score the fine-tuned model once on the test split.
print("\n Final Evaluation on Test Set:")
evaluate(model=model, dataloader=test_loader, desc="Test")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Save the model and its tokenizer into one directory so the pair can be
# restored together from a single path.
save_path = "/content/drive/MyDrive/541project/deberta_model_1"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# The original f-string closed its quote before the placeholder, which is a
# SyntaxError; the path now sits inside the string.
print(f"Save successfully: {save_path}")


In [None]:
# Hugging Face format: model and tokenizer are written via save_pretrained().
# NOTE(review): the model goes to ..._12 and the tokenizer to ..._13, so a
# single directory does not hold both -- confirm this split is intended.
for artifact, target_dir in (
    (model, "/content/drive/MyDrive/541project/deberta_model_12"),
    (tokenizer, "/content/drive/MyDrive/541project/deberta_model_13"),
):
    artifact.save_pretrained(target_dir)

# PyTorch .pth: raw state_dict (weights only).
weights = model.state_dict()
torch.save(weights, "/content/drive/MyDrive/541project/deberta_model_14.pth")


In [None]:
import os

# Write the model weights as a state_dict, then confirm the file is on disk.
save_path = "/content/drive/MyDrive/541project/deberta_model_14.pth"
torch.save(model.state_dict(), save_path)

if not os.path.exists(save_path):
    print("Save failed!")
else:
    print("Save successfully", save_path)
