In [None]:
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 22.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 11.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 8.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Mount Google Drive at /content/drive/ (Colab only); the CSV files loaded
# later in this notebook are read from that mount.
from google.colab import drive
drive.mount('/content/drive/')


class Dataset_J(torch.utils.data.Dataset):
    """One side (training or held-out) of a single K-fold split over text data.

    The rows of ``raw_data`` are cut into ``K`` contiguous folds. Fold ``id``
    is the held-out part; every other row belongs to the training part. Texts
    are tokenized one example at a time in ``__getitem__``.

    Args:
        raw_data: Object whose ``'text'`` and ``'label'`` entries are equally
            long iterables (presumably a Hugging Face ``Dataset`` -- confirm
            against the caller). Kept on the instance as ``self.raw``.
        split: ``'train'`` selects every row outside fold ``id``; any other
            value selects the held-out fold itself.
        id: Zero-based index of the held-out fold.
        K: Number of folds.
    """

    def __init__(self, raw_data, split, id, K=10):
        self.raw = raw_data
        # Read each column once instead of once per slice, and copy it into a
        # plain list so slicing and `+` behave the same for any column type.
        texts = list(self.raw['text'])
        labels = list(self.raw['label'])
        start, stop = self._fold_bounds(len(texts), id, K)
        if split == 'train':
            self.data = {'text': texts[:start] + texts[stop:]}
            self.data['labels'] = labels[:start] + labels[stop:]
        else:
            self.data = {'text': texts[start:stop]}
            self.data['labels'] = labels[start:stop]
        self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    @staticmethod
    def _fold_bounds(n_rows, fold, K):
        """Return the ``(start, stop)`` row range of held-out fold ``fold``.

        Every fold is ``n_rows // K`` rows wide. The last fold additionally
        takes the ``n_rows % K`` leftover rows, so the ``K`` held-out folds
        together cover every row exactly once.
        """
        width = n_rows // K
        start = int(fold * width)
        stop = n_rows if fold == K - 1 else start + width
        return start, stop

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors together with its label.

        Returns:
            dict mapping each tokenizer output key to a ``torch.Tensor``, plus
            a ``"labels"`` tensor holding the example's label.
        """
        encoded = self.tokenizer(self.data['text'][idx], padding=True, truncation=True, max_length=512)
        item = {key: torch.tensor(val) for key, val in encoded.items()}
        item["labels"] = torch.tensor(self.data['labels'][idx])
        return item

    def __len__(self):
        """Number of examples in this split."""
        return len(self.data['labels'])


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [25]:
# Force a fresh mount of Google Drive even when it is already mounted.
drive.mount("/content/drive/", force_remount=True)

Mounted at /content/drive/


In [42]:
import numpy as np
import evaluate
from numba import cuda

# Preprocess: one shared tokenizer; the collator uses it to pad each batch.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# K-fold cross validation
num_iter = 10
SEED = 42  # fixes the shuffle so fold membership is identical on every run

# Metrics that accumulate predictions across all folds
# (held-out rows and training rows are tracked separately).
accuracy = evaluate.load("accuracy")
accuracy_train = evaluate.load("accuracy")
f1 = evaluate.load("f1")
f1_train = evaluate.load("f1")

path1 = "./drive/MyDrive/Covid Research/SD_cooked_2.1.csv"
path2 = "./drive/MyDrive/Covid Research/MD_cooked_2.0 (1).csv"
# Both CSV files are loaded together and read from the 'train' split, then
# shuffled with a fixed seed so the contiguous folds are reproducible.
raw_data = load_dataset('csv', data_files=[path1, path2])['train'].shuffle(seed=SEED)

def _score_split(trainer, dataset, fold_metrics, running_metrics):
  """Predict on `dataset`, print this fold's scores and extend the running totals.

  Args:
    trainer: fitted Trainer used to run prediction.
    dataset: the split to score.
    fold_metrics: metrics computed on this split alone and printed.
    running_metrics: metrics that accumulate predictions across folds.
  """
  output = trainer.predict(dataset)
  predicted = np.argmax(output.predictions, axis=-1)
  for metric in fold_metrics:
    print(metric.compute(references=output.label_ids, predictions=predicted))
  for metric in running_metrics:
    metric.add_batch(references=output.label_ids, predictions=predicted)


# compute() receives its inputs directly, so the same two instances are reused
# for every fold and split instead of being reloaded on each iteration.
accuracy_iter = evaluate.load("accuracy")
f1_iter = evaluate.load("f1")

for i in range(num_iter):
  # Pass the fold count explicitly so the dataset split and the loop agree.
  train_dataset = Dataset_J(raw_data, 'train', id=i, K=num_iter)
  test_dataset = Dataset_J(raw_data, 'test', id=i, K=num_iter)

  # Every fold starts from the same pretrained weights.
  model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)
  # Fall back to CPU instead of crashing when no GPU runtime is attached.
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model.to(device)

  training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0,
    resume_from_checkpoint=False,
    save_strategy = "no",
    )
  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    )

  trainer.train()

  # Prediction
  print("[Iter {}: ]".format(str(i)))

  ## Train Acc -- accumulated in the *_train metrics only, so the final
  ## cross-validation score is built from held-out predictions alone.
  print("Training Set: ")
  _score_split(trainer, train_dataset,
               fold_metrics=(accuracy_iter, f1_iter),
               running_metrics=(accuracy_train, f1_train))

  ## Test Acc
  print("Test Set: ")
  _score_split(trainer, test_dataset,
               fold_metrics=(accuracy_iter, f1_iter),
               running_metrics=(accuracy, f1))

  # Move this fold's weights off the GPU before the next fold loads its model.
  model.to("cpu")


# compute() returns a dict, so it is formatted rather than concatenated to a str.
print("######## Final Result: 10-fold cross validation ########")
print('Final Accuracy: {}'.format(accuracy.compute()))
print('Final F1: {}'.format(f1.compute()))
print('Final Train Accuracy: {}'.format(accuracy_train.compute()))
print('Final Train F1: {}'.format(f1_train.compute()))

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.20.1",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

  0%|          | 0/1 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.20.1",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

Step,Training Loss


KeyboardInterrupt: ignored