In [1]:
%pip install transformers[torch] huggingface_hub datasets evaluate torchvision kagglehub ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting huggingface_hub
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting kagglehub
  Downloading kagglehub-0.3.9-py3-none-any.whl.metadata (30 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting transformers[torch]
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10

In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer
from transformers import Trainer
import numpy as np
import kagglehub
import torch
import base

In [2]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"

In [4]:
train_data = load_from_disk("./data/sst2/train-logits")
eval_data = load_from_disk("./data/sst2/eval-logits")
test_data = load_from_disk("./data/sst2/test-logits")

all_train_data = load_from_disk("./data/sst2/train-logits-augmented")


all_data = concatenate_datasets([load_from_disk(file) for file in ["./data/sst2/eval-logits", "./data/sst2/test-logits", "./data/sst2/train-logits-augmented", "./data/sst2/test-blank-logits"]])
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
def tokenize(dataset):
    if isinstance(dataset["sentence"], str):
        return list(tokenizer.tokenize(dataset["sentence"]))
    else:
        raise ValueError("Input text is not string")

In [6]:
def get_vocab(dataset):
    all_tokens = []
    for data in dataset:
        for token in data:
            all_tokens.append(token)

    vocab = set(all_tokens)
    return vocab


In [7]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [8]:
train_data_tokens = list(map(lambda e: tokenize(e), train_data))
eval_data_tokens = list(map(lambda e: tokenize(e), eval_data))
test_data_tokens = list(map(lambda e: tokenize(e), test_data))

all_train_data_tokens = list(map(lambda e: tokenize(e), all_train_data))


all_data_tokens = list(map(lambda e: tokenize(e), all_data))

In [9]:
vocab = get_vocab(all_data_tokens)

In [10]:
word_index = dict(zip(vocab, range(len(vocab))))

In [11]:
embeddings_index = {}
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors.")


Found 400000 word vectors.


In [12]:
print(len(vocab))

16152


In [13]:
num_tokens = len(vocab) + 2
embedding_dim = 300
hits = 0
misses = 0
embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
print(f"Converted {hits} words ({misses}) misses")

Converted 15775 words (377) misses


In [14]:
def padd(data, max_length):
    padding_length = max_length - len(data)
    if padding_length > 0:
        padding = [0 for _ in range(padding_length)]
        data.extend(padding)
    return data[:max_length]

In [15]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [16]:
train_padded_data = list(map(lambda x: padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: padd(x,60), all_train_data_index))

In [17]:
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [18]:
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=2)

In [19]:
print(model)

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)


In [20]:
training_args = base.get_training_args(output_dir="./results/bilstm-base", logging_dir='./logs/bilstm-base', lr=.001,  epochs=10, batch_size=128)

In [21]:
base.reset_seed()

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3791,0.437752,0.78555,0.801882,0.787646,0.783394
2,0.2923,0.392721,0.83945,0.844322,0.838343,0.838524
3,0.2257,0.441251,0.829128,0.832606,0.828166,0.828343
4,0.1731,0.464642,0.831422,0.837994,0.830123,0.830166
5,0.131,0.500803,0.844037,0.844007,0.843942,0.84397
6,0.0966,0.523377,0.837156,0.83781,0.836722,0.836908
7,0.0687,0.606604,0.84289,0.846232,0.841974,0.842216
8,0.0479,0.642301,0.845183,0.845141,0.84511,0.845125
9,0.0323,0.743058,0.84289,0.842843,0.842942,0.842865
10,0.022,0.827019,0.83945,0.839852,0.839101,0.839259


TrainOutput(global_step=4210, training_loss=0.14688521360275014, metrics={'train_runtime': 84.8466, 'train_samples_per_second': 6350.166, 'train_steps_per_second': 49.619, 'total_flos': 0.0, 'train_loss': 0.14688521360275014, 'epoch': 10.0})

In [24]:
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [25]:
trainer.evaluate(test_data)

{'eval_loss': 0.27407392859458923,
 'eval_accuracy': 0.9328878990348923,
 'eval_precision': 0.9316026297774611,
 'eval_recall': 0.9325522959336439,
 'eval_f1': 0.9320542859520848,
 'eval_runtime': 3.8816,
 'eval_samples_per_second': 3470.262,
 'eval_steps_per_second': 27.309,
 'epoch': 10.0}

In [26]:
torch.save(model.state_dict(), "./models/sst2/bilstm-base.pth")

In [27]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=2)

In [28]:
training_args = base.get_training_args(output_dir="./results/bilstm-distill", remove_unused_columns=False, logging_dir='./logs/bilstm-distill', lr=.001,  epochs=10, batch_size=128, lambda_param=.75, temp=5)

In [29]:
base.reset_seed()

In [30]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2381,2.088562,0.802752,0.811274,0.804243,0.801876
2,1.491,1.787525,0.841743,0.846657,0.840637,0.840831
3,1.0424,1.709736,0.840596,0.846505,0.839385,0.839533
4,0.7441,1.615136,0.855505,0.856753,0.854951,0.8552
5,0.5605,1.642296,0.857798,0.858131,0.857498,0.857651
6,0.4161,1.704366,0.855505,0.85742,0.854824,0.855101
7,0.3181,1.751488,0.854358,0.854631,0.854077,0.854218
8,0.2496,1.789579,0.84289,0.844373,0.842269,0.842507
9,0.2021,1.707498,0.855505,0.855481,0.855414,0.855443
10,0.1688,1.766199,0.857798,0.858131,0.857498,0.857651


TrainOutput(global_step=4210, training_loss=0.7430880947520784, metrics={'train_runtime': 88.1325, 'train_samples_per_second': 6113.412, 'train_steps_per_second': 47.769, 'total_flos': 0.0, 'train_loss': 0.7430880947520784, 'epoch': 10.0})

In [32]:
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [33]:
trainer.evaluate(test_data)

{'eval_loss': 0.7615254521369934,
 'eval_accuracy': 0.9333333333333333,
 'eval_precision': 0.9316271266997773,
 'eval_recall': 0.9339263149352013,
 'eval_f1': 0.9326199505989041,
 'eval_runtime': 4.5464,
 'eval_samples_per_second': 2962.784,
 'eval_steps_per_second': 23.315,
 'epoch': 10.0}

In [34]:
torch.save(student_model.state_dict(), "./models/sst2/bilstm-distill.pth")

In [35]:
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=2)

In [36]:
training_args = base.get_training_args(output_dir="./results/bilstm-base-aug", logging_dir='./logs/bilstm-base-aug', lr=.001,  epochs=10, batch_size=128)

In [37]:
base.reset_seed()

In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [39]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2684,0.347294,0.881881,0.881862,0.881988,0.881868
2,0.1271,0.41517,0.861239,0.863325,0.860539,0.860834
3,0.0733,0.531261,0.861239,0.861261,0.861381,0.86123
4,0.0428,0.686697,0.856651,0.858454,0.855993,0.856269
5,0.026,0.872453,0.857798,0.858866,0.857287,0.857528
6,0.0163,1.020872,0.862385,0.862351,0.862465,0.862367
7,0.0106,1.217379,0.855505,0.856224,0.855077,0.855285
8,0.0064,1.398077,0.870413,0.870831,0.870096,0.870269
9,0.0042,1.520113,0.868119,0.868173,0.86797,0.868043
10,0.0026,1.663398,0.866972,0.866955,0.866886,0.866916


TrainOutput(global_step=39660, training_loss=0.057786727324316015, metrics={'train_runtime': 527.0044, 'train_samples_per_second': 9632.064, 'train_steps_per_second': 75.256, 'total_flos': 0.0, 'train_loss': 0.057786727324316015, 'epoch': 10.0})

In [40]:
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [41]:
trainer.evaluate(test_data)

{'eval_loss': 0.18746386468410492,
 'eval_accuracy': 0.9337045285820341,
 'eval_precision': 0.9317890710812043,
 'eval_recall': 0.9352511979876204,
 'eval_f1': 0.9331032836846588,
 'eval_runtime': 3.8951,
 'eval_samples_per_second': 3458.225,
 'eval_steps_per_second': 27.214,
 'epoch': 10.0}

In [42]:
torch.save(model.state_dict(), "./models/sst2/bilstm-base-aug.pth")

In [43]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=2)

In [44]:
training_args = base.get_training_args(output_dir="./results/bilstm-distill-aug", remove_unused_columns=False, logging_dir='./logs/bilstm-distill-aug', lr=.001,  epochs=10, batch_size=128, lambda_param=.75, temp=5)

In [45]:
base.reset_seed()

In [46]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0482,1.038888,0.879587,0.879742,0.87982,0.879586
2,0.42,1.019241,0.885321,0.886325,0.884861,0.885125
3,0.2631,1.028778,0.885321,0.885734,0.885661,0.88532
4,0.1874,1.033168,0.876147,0.876113,0.876231,0.87613
5,0.1446,0.928737,0.883028,0.882988,0.882988,0.882988
6,0.1181,0.896784,0.894495,0.894493,0.894418,0.89445
7,0.0999,0.964403,0.886468,0.886822,0.886787,0.886468
8,0.086,0.933784,0.886468,0.886443,0.886409,0.886425
9,0.0756,0.907945,0.888761,0.888738,0.888703,0.888719
10,0.0679,0.910377,0.887615,0.887608,0.887535,0.887567


TrainOutput(global_step=39660, training_loss=0.2510762948071301, metrics={'train_runtime': 565.3235, 'train_samples_per_second': 8979.177, 'train_steps_per_second': 70.155, 'total_flos': 0.0, 'train_loss': 0.2510762948071301, 'epoch': 10.0})

In [48]:
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [49]:
trainer.evaluate(test_data)

{'eval_loss': 0.31329110264778137,
 'eval_accuracy': 0.9623608017817372,
 'eval_precision': 0.9614036150443488,
 'eval_recall': 0.9624335446901624,
 'eval_f1': 0.9618939118994636,
 'eval_runtime': 3.9027,
 'eval_samples_per_second': 3451.463,
 'eval_steps_per_second': 27.161,
 'epoch': 10.0}

In [50]:
torch.save(model.state_dict(), "./models/sst2/bilstm-distill-aug.pth")

In [113]:
test_blank_data = load_from_disk("./data/sst2/test-blank-logits")
test_data_blank_tokens = list(map(lambda e: tokenize(e), test_blank_data))
test_data_blank_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_blank_tokens))
test_blank_padded_data = list(map(lambda x: padd(x,60), test_data_blank_index))
test_blank_data = test_blank_data.add_column("input_ids", test_blank_padded_data)
test_blank_data.remove_columns("labels")
test_blank_data.set_format(type='torch', columns=["input_ids"], device="cuda")

In [114]:
from torch.utils.data import DataLoader
test_blank_dataloader = DataLoader(test_blank_data, batch_size=128, shuffle=False)

In [63]:
from tqdm.notebook import tqdm
def generate_logits(dataloader, model, images=True):
    """Generates logits for given input."""
    logits_arr = []
    for batch in tqdm(dataloader, desc="Generating logits for given dataset: "):
        
        with torch.no_grad():
            if images:
                pixel_values, labels = batch
                outputs = model(pixel_values)
            else:
                outputs = model(batch["input_ids"])
            logits = outputs["logits"]
        logits_arr.append(logits.cpu().numpy())

    logits_arr_flat = []
    for tensor in logits_arr:
        logits_arr_flat.extend(tensor)
    return logits_arr_flat


Destilovaná BiLSTM

In [64]:
test_blank_logits = generate_logits(test_blank_dataloader, student_model, images=False)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

In [102]:
labels = []
labels.append("id\tlabel\n")
for index, logit in enumerate(test_blank_logits):
    labels.append(f"{index}\t{torch.topk(torch.as_tensor(logit), k=1).indices.numpy()[0]}\n")

In [None]:
with open("SST2-DISTILL-BiLSTM-OUTPUT.tsv", "w") as file:
    file.writelines(labels)

![SST2 test score for best BiLSTM](imgs/sst2_BiLSTM_test_score.png)

Baseline BiLSTM

In [115]:
test_blank_logits = generate_logits(test_blank_dataloader, model, images=False)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

In [116]:
labels = []
labels.append("id\tlabel\n")
for index, logit in enumerate(test_blank_logits):
    labels.append(f"{index}\t{torch.topk(torch.as_tensor(logit), k=1).indices.numpy()[0]}\n")

In [120]:
with open("SST2-BASE-BiLSTM-OUTPUT.tsv", "w") as file:
    file.writelines(labels)

Baseline BERT

In [105]:
test_blank_data = load_from_disk("./data/sst2/test-blank-logits")

In [110]:
labels = []
labels.append("id\tlabel\n")
for index, data in enumerate(test_blank_data):
    labels.append(f"{index}\t{torch.topk(torch.as_tensor(data['logits']), k=1).indices.numpy()[0]}\n")

In [111]:
print(labels[0:5])

['id\tlabel\n', '0\t0\n', '1\t0\n', '2\t1\n', '3\t1\n']


In [112]:
with open("SST2-BERT-OUTPUT.tsv", "w") as file:
    file.writelines(labels)

![SST2 test score for best BERT model](imgs/sst2_BERT_test_score.png)