In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
#Task 2 is sentiment analysis and Task 3 is multi-class classification

In [None]:
from datasets import load_dataset

# "cbt" dataset, loaded with the "CN" configuration.
# Other config names that can be passed as the second argument:
# "NE", "P", "V", "raw".
datasetCBT = load_dataset("cbt", "CN")
# AG News: used later for the multi-class classification task.
datasetAG = load_dataset("ag_news")
# IMDB: used later for the sentiment-analysis task.
datasetIMDB = load_dataset("imdb")


In [None]:
# Print a short tag followed by the DatasetDict summary of each loaded dataset.
for tag, loaded in (("CBT", datasetCBT), ("AGN", datasetAG), ("IMDB", datasetIMDB)):
    print(tag, loaded)

CBT DatasetDict({
    train: Dataset({
        features: ['sentences', 'question', 'answer', 'options'],
        num_rows: 120769
    })
    test: Dataset({
        features: ['sentences', 'question', 'answer', 'options'],
        num_rows: 2500
    })
    validation: Dataset({
        features: ['sentences', 'question', 'answer', 'options'],
        num_rows: 2000
    })
})
AGN DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
IMDB DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
from datasets import Dataset, load_metric, ClassLabel
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import os
import json
import evaluate

# Accuracy metric shared by every evaluation and prediction run in this notebook.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
	"""Score a set of evaluation predictions with the accuracy metric.

	Args:
		eval_pred: A pair that unpacks into ``(logits, labels)``.

	Returns:
		The result of ``metric.compute`` for the arg-maxed logits (taken over
		the last axis) compared against the reference labels.
	"""
	raw_scores, gold_labels = eval_pred
	predicted_ids = np.argmax(raw_scores, axis=-1)
	return metric.compute(predictions=predicted_ids, references=gold_labels)

# Fixed seed so the random train/validation splits (and therefore the reported
# metrics) are the same on every Restart & Run All.
SPLIT_SEED = 42

# IMDB sentiment analysis: randomly set aside 15% of the training data as a
# validation set. train_test_split returns a DatasetDict whose "train" key holds
# the remaining training rows and whose "test" key holds the validation rows.
datasetIMDB_train = datasetIMDB["train"].train_test_split(test_size=0.15, seed=SPLIT_SEED)
datasetIMDB_test = datasetIMDB["test"]
print(datasetIMDB_train, datasetIMDB_test)

# CBT sentence-level QA: keep the provided train and test splits as they are.
datasetCBT_train = datasetCBT["train"]
datasetCBT_test = datasetCBT["test"]

# AG News article classification: there are 30,000 training and 1,900 testing
# examples for each class respectively; 15% of the training data is set aside
# as a validation set (again exposed under the "test" key of the DatasetDict).
datasetAG_train = datasetAG["train"].train_test_split(test_size=0.15, seed=SPLIT_SEED)
datasetAG_test = datasetAG["test"]
print(datasetAG_train, datasetAG_test)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 21250
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3750
    })
}) Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 102000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 18000
    })
}) Dataset({
    features: ['text', 'label'],
    num_rows: 7600
})


In [None]:
print("building tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
	"""Tokenize the ``text`` field of a batch of examples.

	Sequences longer than the tokenizer's maximum length are truncated. No
	padding is applied here: the DataCollatorWithPadding handed to the Trainer
	pads each batch when it is assembled, so storing every example already
	padded to the maximum length only inflates the mapped dataset.

	Args:
		examples: A batch from ``Dataset.map(..., batched=True)`` that has a
			``"text"`` entry.

	Returns:
		The tokenizer output for the batch (the mapped datasets in this
		notebook gain ``input_ids`` and ``attention_mask`` columns from it).
	"""
	return tokenizer(examples["text"], truncation=True)


# Tokenize both the training and the validation part of the IMDB split.
tokenized_datasetIMDB_train = datasetIMDB_train.map(tokenize_function, batched=True)
print(tokenized_datasetIMDB_train)

building tokenizer...


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 21250
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3750
    })
})


In [None]:
# Task 2: fine-tune DistilBERT for binary sentiment classification on IMDB.

# padding=True pads each batch up to its longest sequence instead of always
# padding to the model's maximum length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# No explicit .cuda() call: Trainer moves the model to the training device
# itself, which also keeps this cell runnable on a machine without a GPU.
modelIMDB = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
	# Dedicated sub-directory so the checkpoint-<step> folders of this run are
	# not overwritten by another training run that uses the same step numbers.
	output_dir="/content/drive/My Drive/ColabNotebooks/model out/imdb/",
	optim="adamw_torch",
	# Evaluate and save every 5000 steps; the checkpoint with the highest
	# validation accuracy is reloaded when training finishes.
	evaluation_strategy="steps",
	eval_steps=5000,
	save_steps=5000,
	metric_for_best_model="accuracy",
	greater_is_better=True,
	load_best_model_at_end=True,
	# Training length is set in steps rather than epochs.
	# For a quick smoke test use warmup/save/eval steps of 50 and max_steps of 250.
	warmup_steps=5000,
	max_steps=25000,
	per_device_train_batch_size=8,
	per_device_eval_batch_size=8,
	weight_decay=0.01,
	# Lower than the library default of 5e-5; 2e-5 was also tried.
	learning_rate=5e-6,
)

trainer = Trainer(
	model=modelIMDB,
	args=training_args,
	train_dataset=tokenized_datasetIMDB_train["train"],
	# The "test" key of the split holds the 15% validation hold-out.
	eval_dataset=tokenized_datasetIMDB_train["test"],
	tokenizer=tokenizer,
	data_collator=data_collator,
	compute_metrics=compute_metrics,
)
trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not used when in

Step,Training Loss,Validation Loss,Accuracy
5000,0.2709,0.311584,0.9104
10000,0.1854,0.339142,0.916533
15000,0.1068,0.429734,0.921867
20000,0.0747,0.428652,0.922667
25000,0.0357,0.465716,0.923733


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3750
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000
Configuration saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/config.json
Model weights saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forwar

TrainOutput(global_step=25000, training_loss=0.16800662643432618, metrics={'train_runtime': 3563.0665, 'train_samples_per_second': 56.131, 'train_steps_per_second': 7.016, 'total_flos': 2.6486326491672576e+16, 'train_loss': 0.16800662643432618, 'epoch': 9.41})

In [None]:
# Get predictions on the IMDB test set for task 2.

# Tokenize the test split with the same function used for the training data,
# then run the fine-tuned model on it through the IMDB trainer.
tokenized_datasetIMDB_test = datasetIMDB_test.map(tokenize_function, batched=True)
prediction = trainer.predict(tokenized_datasetIMDB_test)
print(prediction)

# Optional manual cross-check of the accuracy reported by trainer.predict
# (kept commented out):
# # For each prediction, create the label with argmax
# test_predictions_argmax = np.argmax(prediction[0], axis=1)
# # Retrieve reference labels from test set
# test_references = np.array(datasetIMDB_test["label"])
# # Compute accuracy
# metric.compute(predictions=test_predictions_argmax, references=test_references)
# NOTE(review): an earlier run recorded {'accuracy': 0.86028} here, the same as
# the trainer output when given labels; that figure is from a previous run.

  0%|          | 0/25 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 25000
  Batch size = 8


PredictionOutput(predictions=array([[ 4.234369 , -3.6373174],
       [ 4.1285663, -3.5888915],
       [ 3.9875145, -3.5075574],
       ...,
       [ 3.3507366, -3.0311785],
       [-4.020826 ,  3.4801638],
       [-3.86805  ,  3.3308415]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 1]), metrics={'test_loss': 0.43931668996810913, 'test_accuracy': 0.92612, 'test_runtime': 139.8763, 'test_samples_per_second': 178.729, 'test_steps_per_second': 22.341})


In [None]:
# Moving on to Task 3 - multi-class classification with the AG News dataset.

# Tokenize both the training and the validation part of the AG News split.
tokenized_datasetAG_train = datasetAG_train.map(tokenize_function, batched=True)
print(tokenized_datasetAG_train)

# Four output labels, one per AG News class. No explicit .cuda() call: Trainer
# moves the model to the training device itself, which also keeps this cell
# runnable on a machine without a GPU.
modelAG = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

training_args = TrainingArguments(
	# Dedicated sub-directory so the checkpoint-<step> folders of this run do
	# not overwrite those written by the IMDB training run.
	output_dir="/content/drive/My Drive/ColabNotebooks/model out/ag_news/",
	optim="adamw_torch",
	# Evaluate and save every 5000 steps; the checkpoint with the highest
	# validation accuracy is reloaded when training finishes.
	evaluation_strategy="steps",
	eval_steps=5000,
	save_steps=5000,
	metric_for_best_model="accuracy",
	greater_is_better=True,
	load_best_model_at_end=True,
	# Training length is set in steps rather than epochs.
	# For a quick smoke test use warmup/save/eval steps of 50 and max_steps of 100.
	warmup_steps=5000,
	max_steps=25000,
	per_device_train_batch_size=8,
	per_device_eval_batch_size=8,
	weight_decay=0.01,
	# Lower than the library default of 5e-5; 2e-5 was also tried.
	learning_rate=5e-6,
)

trainer1 = Trainer(
	model=modelAG,
	args=training_args,
	train_dataset=tokenized_datasetAG_train["train"],
	# The "test" key of the split holds the 15% validation hold-out.
	eval_dataset=tokenized_datasetAG_train["test"],
	tokenizer=tokenizer,
	data_collator=data_collator,
	compute_metrics=compute_metrics,
)
trainer1.train()

  0%|          | 0/102 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 102000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18000
    })
})


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingfac

Step,Training Loss,Validation Loss,Accuracy
5000,0.3061,0.342977,0.903056
10000,0.277,0.249448,0.929111
15000,0.2311,0.256257,0.931
20000,0.2229,0.235156,0.937944
25000,0.2095,0.22803,0.938722


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 18000
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000
Configuration saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/config.json
Model weights saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/ColabNotebooks/model out/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forwa

TrainOutput(global_step=25000, training_loss=0.2987227294921875, metrics={'train_runtime': 8131.3005, 'train_samples_per_second': 24.596, 'train_steps_per_second': 3.075, 'total_flos': 2.64944246784e+16, 'train_loss': 0.2987227294921875, 'epoch': 1.96})

In [None]:
# Get predictions on the AG News test set for task 3.

# Tokenize the test split with the same function used for the training data.
tokenized_datasetAG_test = datasetAG_test.map(tokenize_function, batched=True)
# trainer1 is the Trainer that was fitted on AG News; the printed
# PredictionOutput carries the logits, the label ids and the test metrics.
prediction1 = trainer1.predict(tokenized_datasetAG_test)
print(prediction1)

Loading cached processed dataset at /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-8e38d0ebcaefa4f0.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 7600
  Batch size = 8


PredictionOutput(predictions=array([[-1.2356045 , -3.9653928 ,  5.164323  , -2.2271745 ],
       [-1.3414255 , -3.3007202 , -1.2773756 ,  5.2970185 ],
       [-0.65212464, -3.1574771 , -1.5690675 ,  5.0186276 ],
       ...,
       [-0.9578073 ,  6.808625  , -2.012453  , -2.8426404 ],
       [ 3.0978603 , -3.8792908 ,  0.5166585 , -0.74815303],
       [-2.5660062 , -4.487078  ,  2.0876055 ,  3.4098408 ]],
      dtype=float32), label_ids=array([2, 3, 3, ..., 1, 2, 2]), metrics={'test_loss': 0.23484377562999725, 'test_accuracy': 0.9386842105263158, 'test_runtime': 79.4947, 'test_samples_per_second': 95.604, 'test_steps_per_second': 11.95})
