In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
from torch.utils.data import Dataset
import pandas as pd
import torch
from transformers.models.bert import BertTokenizer
from sklearn.model_selection import train_test_split



class CustomText(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, mode):
        self.mode = mode
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        if self.mode == "train":
            label = self.labels[idx]
            return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}
        elif self.mode == "test":
            return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten()}
        else:
            raise("error")

def read_dataset(dataset_path, tokenizer_root='bert-base-uncased', max_length=1024, mode='train'):
    df = pd.read_json(dataset_path)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_root)
    text = []
    for i, row in df.iterrows():
        t = f"Title: {row['title']}. Content: {row['text']}."
        text.append(t.replace("<br />", " "))
    helpful = df['helpful_vote'].to_list()
    verified = df['verified_purchase'].to_list()
    if mode == "train":
        label = [l - 1 for l in df['rating'].to_list()]
        train_texts, val_texts, train_labels, val_labels = train_test_split(text, label, test_size=0.2, random_state=42)
        train_dataset = CustomText(train_texts, train_labels, tokenizer, max_length, mode="train")
        val_dataset = CustomText(val_texts, val_labels, tokenizer, max_length, mode="train")
        return train_dataset, val_dataset
    elif mode == "test":
        test_dataset =CustomText(text, None, tokenizer, max_length, mode="test")
        return test_dataset
    else:
        raise("error")

if __name__ == '__main__':
    dataset_path = '/content/drive/My Drive/train.json'
    data = read_dataset(dataset_path)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch
import os
from argparse import ArgumentParser
from torch import nn
from transformers.models.bert import BertModel
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm


class CustomBERTModel(nn.Module):
    def __init__(self, bert_model_name, num_classes, dropout_ratio):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout_ratio)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits


class ModelTrainer():
    def __init__(self):
        train_dataset_path = '/content/drive/My Drive/train.json'
        test_dataset_path = '/content/drive/My Drive/test.json'

        self.epochs = 20
        self.batch_size = 32
        self.lr = 3e-5
        self.weight_decay = 1e-2
        self.dropout_ratio = 0.2
        self.max_length = 256
        self.model_root = 'bert-base-uncased'

        self.model_save_root = os.path.join('model', self.model_root)
        self.model_name = "submission"

        if not os.path.exists(self.model_save_root):
            os.makedirs(self.model_save_root, exist_ok=True)  # Create any necessary directories

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        self.model = CustomBERTModel(self.model_root, 5, self.dropout_ratio).to(self.device)
        train_dataset, val_dataset = read_dataset(train_dataset_path, self.model_root, self.max_length, mode="train")
        test_dataset = read_dataset(test_dataset_path, self.model_root, self.max_length, mode="test")
        self.train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        self.val_dataloader = DataLoader(val_dataset, batch_size=self.batch_size)
        self.test_dataloader = DataLoader(test_dataset, batch_size=1)

        self.optimizer = AdamW(self.model.parameters(), lr=self.lr)
        total_steps = len(self.train_dataloader) * self.epochs
        self.scheduler = LinearLR(self.optimizer, total_iters=total_steps)


    def train(self):
        for epoch in range(self.epochs):
            print(f"Epoch {epoch + 1}/{self.epochs}")
            self._train()
            accuracy, report = self._evaluate()
            print(f"Validation Acc: {accuracy:.4f}")
            print(report)
        torch.save(self.model.state_dict(), os.path.join(self.model_save_root, f"{self.model_name}.pth"))

    def save_predictions(self):
      predict = self._save_predictions()
      predictions_path = '/content/drive/My Drive'
      os.makedirs(predictions_path, exist_ok=True)
      with open(os.path.join(predictions_path, f"{self.model_name}.csv"), 'w') as f:
          f.write('index,rating\n')
          for i, pred in enumerate(predict):
              f.write(f'index_{i},{pred+1}\n')


    def _save_predictions(self):
        self.model.load_state_dict(torch.load(os.path.join(self.model_save_root, f"{self.model_name}.pth")))
        self.model.eval()
        predict = []
        for batch in tqdm(self.test_dataloader):
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predict.extend(preds.cpu().tolist())
        return predict

    def _train(self):
        self.model.train()
        for batch in tqdm(self.train_dataloader):
            self.optimizer.zero_grad()
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['label'].to(self.device)
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            loss = nn.CrossEntropyLoss()(outputs, labels)
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()

    def _evaluate(self):
        self.model.eval()
        predict = []
        actual_labels = []
        with torch.no_grad():
            for batch in self.val_dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)
                predict.extend(preds.cpu().tolist())
                actual_labels.extend(labels.cpu().tolist())
        return accuracy_score(actual_labels, predict), classification_report(actual_labels, predict)

if __name__ == "__main__":
    trainer = ModelTrainer()
    trainer.train()
    trainer.save_predictions()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/20


100%|██████████| 875/875 [18:44<00:00,  1.28s/it]


Validation Acc: 0.6216
              precision    recall  f1-score   support

           0       0.67      0.70      0.68      1421
           1       0.51      0.48      0.49      1440
           2       0.55      0.57      0.56      1409
           3       0.63      0.52      0.57      1372
           4       0.73      0.85      0.78      1358

    accuracy                           0.62      7000
   macro avg       0.62      0.62      0.62      7000
weighted avg       0.62      0.62      0.62      7000

Epoch 2/20


100%|██████████| 875/875 [18:44<00:00,  1.29s/it]


Validation Acc: 0.6254
              precision    recall  f1-score   support

           0       0.69      0.68      0.68      1421
           1       0.51      0.53      0.52      1440
           2       0.58      0.55      0.56      1409
           3       0.66      0.47      0.55      1372
           4       0.70      0.90      0.79      1358

    accuracy                           0.63      7000
   macro avg       0.63      0.63      0.62      7000
weighted avg       0.62      0.63      0.62      7000

Epoch 3/20


100%|██████████| 875/875 [18:44<00:00,  1.29s/it]


Validation Acc: 0.6257
              precision    recall  f1-score   support

           0       0.66      0.72      0.69      1421
           1       0.48      0.55      0.52      1440
           2       0.64      0.44      0.52      1409
           3       0.64      0.56      0.60      1372
           4       0.73      0.86      0.79      1358

    accuracy                           0.63      7000
   macro avg       0.63      0.63      0.62      7000
weighted avg       0.63      0.63      0.62      7000

Epoch 4/20


100%|██████████| 875/875 [18:45<00:00,  1.29s/it]


Validation Acc: 0.6203
              precision    recall  f1-score   support

           0       0.70      0.63      0.66      1421
           1       0.50      0.53      0.51      1440
           2       0.55      0.60      0.57      1409
           3       0.63      0.51      0.56      1372
           4       0.73      0.85      0.78      1358

    accuracy                           0.62      7000
   macro avg       0.62      0.62      0.62      7000
weighted avg       0.62      0.62      0.62      7000

Epoch 5/20


100%|██████████| 875/875 [18:44<00:00,  1.28s/it]


Validation Acc: 0.6144
              precision    recall  f1-score   support

           0       0.65      0.71      0.68      1421
           1       0.54      0.39      0.46      1440
           2       0.53      0.58      0.55      1409
           3       0.59      0.58      0.59      1372
           4       0.74      0.81      0.77      1358

    accuracy                           0.61      7000
   macro avg       0.61      0.62      0.61      7000
weighted avg       0.61      0.61      0.61      7000

Epoch 6/20


100%|██████████| 875/875 [18:43<00:00,  1.28s/it]


Validation Acc: 0.6030
              precision    recall  f1-score   support

           0       0.71      0.57      0.63      1421
           1       0.50      0.50      0.50      1440
           2       0.50      0.62      0.55      1409
           3       0.60      0.57      0.58      1372
           4       0.76      0.77      0.76      1358

    accuracy                           0.60      7000
   macro avg       0.61      0.60      0.61      7000
weighted avg       0.61      0.60      0.60      7000

Epoch 7/20


100%|██████████| 875/875 [18:44<00:00,  1.29s/it]


Validation Acc: 0.6069
              precision    recall  f1-score   support

           0       0.70      0.63      0.66      1421
           1       0.50      0.50      0.50      1440
           2       0.52      0.60      0.56      1409
           3       0.56      0.62      0.59      1372
           4       0.79      0.69      0.74      1358

    accuracy                           0.61      7000
   macro avg       0.62      0.61      0.61      7000
weighted avg       0.62      0.61      0.61      7000

Epoch 8/20


 69%|██████▊   | 600/875 [12:51<05:52,  1.28s/it]