In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Download the GloVe 6B 300d dataset through kagglehub and keep the local directory
# it was placed in; the path is printed so the cached location is visible.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [None]:
# Notebook-wide configuration.
# GLOVE_FILE: embedding text file inside the downloaded kagglehub directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# DATASET: sub-directory name used for every data/results/logs/models path below.
DATASET = "sst2"

In [4]:
# Load every pre-saved split for the configured dataset from ~/data/<DATASET>/.
data_root = f"~/data/{DATASET}"

train_data = load_from_disk(f"{data_root}/train-logits")
eval_data = load_from_disk(f"{data_root}/eval-logits")
test_data = load_from_disk(f"{data_root}/test-logits")

all_train_data = load_from_disk(f"{data_root}/train-logits-augmented")
test_blank_data = load_from_disk(f"{data_root}/test-blank-logits")

# all_data only feeds the vocabulary built further down, so reuse the splits that
# were just loaded instead of reading the same four directories a second time.
# NOTE(review): train-logits is not part of this union; presumably
# train-logits-augmented is a superset of it -- confirm, otherwise the later
# word_index lookups on train tokens can raise KeyError.
all_data = concatenate_datasets([eval_data, test_data, all_train_data, test_blank_data])

# Lower-casing tokenizer applied to the "sentence" field of every split.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick the compute device once: CUDA when present, CPU otherwise.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if not has_gpu:
    print("GPU is not available, using CPU.")
else:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
def _tokenize_sentences(dataset):
    """Return one token list per example, tokenizing each example's "sentence" field."""
    return [tokenizer.tokenize(example["sentence"]) for example in dataset]


# Token lists for the individual splits.
train_data_tokens = _tokenize_sentences(train_data)
eval_data_tokens = _tokenize_sentences(eval_data)
test_data_tokens = _tokenize_sentences(test_data)

all_train_data_tokens = _tokenize_sentences(all_train_data)
test_data_blank_tokens = _tokenize_sentences(test_blank_data)

# Token lists for the concatenated data; used to build the vocabulary.
all_data_tokens = _tokenize_sentences(all_data)

In [7]:
# Build the vocabulary from the tokens of the concatenated data.
# NOTE(review): presumably the collection of distinct tokens -- confirm in base.get_vocab.
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map every vocabulary entry to its position in `vocab` (0-based).
word_index = {word: position for position, word in enumerate(vocab)}

In [9]:
# Read the GloVe file into a lookup passed to the embedding-matrix builder below
# (the helper reports how many word vectors it found).
# NOTE(review): presumably a word -> vector mapping -- confirm in base.get_embeddings_indeces.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
# Report the vocabulary size and derive the embedding table dimensions.
vocab_size = len(vocab)
print(vocab_size)

# Two rows more than the vocabulary.
# NOTE(review): presumably reserved for padding / unknown tokens -- confirm against
# base.padd and base.get_embedding_matrix.
num_tokens = vocab_size + 2
# Matches the 300-dimensional GloVe vectors referenced by GLOVE_FILE.
embedding_dim = 300

16152


In [11]:
# Assemble the (num_tokens x embedding_dim) embedding table from the GloVe lookup;
# the helper prints how many vocabulary words were found and how many were missed.
embedding_matrix = base.get_embedding_matrix(
    num_tokens,
    embedding_dim,
    word_index,
    embeddings_index,
)

Converted 15775 words (377) misses


In [12]:
def _tokens_to_indices(token_lists):
    """Replace every token with its word_index id, keeping the per-sentence nesting."""
    return [[word_index[token] for token in tokens] for tokens in token_lists]


# Integer-encoded sentences for each split.
train_data_index = _tokens_to_indices(train_data_tokens)
eval_data_index = _tokens_to_indices(eval_data_tokens)
test_data_index = _tokens_to_indices(test_data_tokens)

all_train_data_index = _tokens_to_indices(all_train_data_tokens)
test_data_blank_index = _tokens_to_indices(test_data_blank_tokens)

In [13]:
def _pad_all(index_lists, length=60):
    """Apply base.padd with the given length to every encoded sentence."""
    return [base.padd(indices, length) for indices in index_lists]


# Pass every encoded sentence through base.padd with length 60.
train_padded_data = _pad_all(train_data_index)
eval_padded_data = _pad_all(eval_data_index)
test_padded_data = _pad_all(test_data_index)

all_train_padded_data = _pad_all(all_train_data_index)
test_blank_padded_data = _pad_all(test_data_blank_index)

In [14]:
def _with_input_ids(dataset, padded_ids):
    """Return `dataset` with an "input_ids" column holding `padded_ids`.

    An existing "input_ids" column is dropped first, so re-running this cell does
    not fail on a duplicate column name.
    """
    if "input_ids" in dataset.column_names:
        dataset = dataset.remove_columns("input_ids")
    return dataset.add_column("input_ids", padded_ids)


# Attach the padded token ids to every split under the "input_ids" column.
train_data = _with_input_ids(train_data, train_padded_data)
eval_data = _with_input_ids(eval_data, eval_padded_data)
test_data = _with_input_ids(test_data, test_padded_data)

all_train_data = _with_input_ids(all_train_data, all_train_padded_data)
test_blank_data = _with_input_ids(test_blank_data, test_blank_padded_data)

In [15]:
# Baseline BiLSTM classifier initialised from the GloVe embedding matrix.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [16]:
# Show the layer structure of the freshly built model.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)


In [17]:
# Training configuration for the baseline run (no distillation parameters).
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base",
    logging_dir=f"~/logs/{DATASET}/bilstm-base",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [18]:
# NOTE(review): presumably re-seeds the random generators so each run starts from
# the same state -- confirm in base.reset_seed.
base.reset_seed()

In [19]:
# Baseline trainer: original training split, validated on eval_data, with early
# stopping after 3 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [20]:
# Train the baseline model; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3814,0.432093,0.793578,0.805548,0.795361,0.792161
2,0.294,0.383163,0.84289,0.845079,0.842142,0.842392
3,0.2277,0.451857,0.825688,0.831078,0.824493,0.824558
4,0.175,0.45965,0.832569,0.836949,0.831502,0.831662
5,0.1333,0.433682,0.845183,0.845543,0.844858,0.845012
6,0.1005,0.490228,0.844037,0.845864,0.843353,0.843601
7,0.0738,0.469332,0.862385,0.862339,0.862339,0.862339
8,0.0516,0.536502,0.856651,0.857454,0.856203,0.85642
9,0.0349,0.6198,0.861239,0.861219,0.861339,0.861224
10,0.0244,0.653052,0.864679,0.86466,0.864591,0.864621


TrainOutput(global_step=4210, training_loss=0.14965844414862772, metrics={'train_runtime': 346.4266, 'train_samples_per_second': 1555.279, 'train_steps_per_second': 12.153, 'total_flos': 0.0, 'train_loss': 0.14965844414862772, 'epoch': 10.0})

In [21]:
# Put the model into evaluation mode (disables the Dropout layer) before inference;
# eval() returns the module, so its structure is displayed as the cell result.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [22]:
# Score the baseline on test_data; the metrics dict is shown as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.3025161623954773,
 'eval_accuracy': 0.9319227913882703,
 'eval_precision': 0.9304546648729137,
 'eval_recall': 0.931896122089896,
 'eval_f1': 0.9311190994506449,
 'eval_runtime': 6.5588,
 'eval_samples_per_second': 2053.717,
 'eval_steps_per_second': 16.161,
 'epoch': 10.0}

In [23]:
# Persist the baseline weights under ~/models/<DATASET>/.
base_weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base.pth"
torch.save(model.state_dict(), base_weights_path)

In [24]:
# Expose only the padded token ids as torch tensors, placed on the device selected
# earlier in the notebook, so this cell also works when no GPU is present instead
# of failing on a hard-coded "cuda".
test_blank_data.set_format(type="torch", columns=["input_ids"], device=device)
# Fixed order (shuffle=False) so the logits stay aligned with the rows of the split.
test_blank_dataloader = DataLoader(test_blank_data, batch_size=128, shuffle=False)
# Logits of the baseline model for the unlabeled test split.
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

In [25]:
# Write the baseline predictions to a TSV file for upload to the GLUE benchmark.
base_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-base-test.tsv"
base.generate_real_test_file_sst2(test_blank_logits, base_submission_path)

Created output file named: /home/jovyan/data/sst2/bilstm-base-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre základního modelu BiLSTM na skutečné testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/sst2_BiLSTM_base_score.png)

In [26]:
# Student for knowledge distillation; same architecture and hyper-parameters as the baseline.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [27]:
# Training configuration for the distillation run.
# NOTE(review): remove_unused_columns=False presumably keeps the stored teacher
# logits available to DistilTrainer; lambda_param / temp look like the loss mixing
# weight and softmax temperature -- confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill",
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [28]:
# NOTE(review): presumably re-seeds the random generators so the distillation run
# starts from the same state as the baseline -- confirm in base.reset_seed.
base.reset_seed()

In [29]:
# Distillation trainer: original training split, validated on eval_data, with
# early stopping after 3 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [30]:
# Train the distilled student; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.195,2.012274,0.805046,0.814629,0.806622,0.804055
2,1.4767,1.614153,0.844037,0.844037,0.844153,0.844024
3,1.0419,1.658638,0.845183,0.849965,0.8441,0.844318
4,0.754,1.777842,0.833716,0.838298,0.832628,0.832786
5,0.5625,1.719183,0.852064,0.853394,0.851488,0.851736
6,0.4231,1.762693,0.850917,0.852345,0.85032,0.850571
7,0.3323,1.652702,0.858945,0.859461,0.858582,0.858766
8,0.2593,1.728679,0.857798,0.858618,0.858255,0.857786
9,0.2085,1.613539,0.868119,0.8682,0.868306,0.868115
10,0.1737,1.652338,0.862385,0.862366,0.862297,0.862327


TrainOutput(global_step=4210, training_loss=0.7426923140210947, metrics={'train_runtime': 362.9577, 'train_samples_per_second': 1484.443, 'train_steps_per_second': 11.599, 'total_flos': 0.0, 'train_loss': 0.7426923140210947, 'epoch': 10.0})

In [31]:
# Put the student into evaluation mode (disables the Dropout layer) before inference;
# eval() returns the module, so its structure is displayed as the cell result.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [32]:
# Score the distilled student on test_data; the metrics dict is shown as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.6347505450248718,
 'eval_accuracy': 0.9423162583518931,
 'eval_precision': 0.9408168081117105,
 'eval_recall': 0.9427090250873543,
 'eval_f1': 0.9416652266703727,
 'eval_runtime': 6.9161,
 'eval_samples_per_second': 1947.623,
 'eval_steps_per_second': 15.327,
 'epoch': 10.0}

In [33]:
# Persist the distilled student's weights under ~/models/<DATASET>/.
distill_weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill.pth"
torch.save(student_model.state_dict(), distill_weights_path)

In [34]:
# Predict the unlabeled test split with the distilled student and write the
# TSV file for upload to the GLUE benchmark.
distill_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-distill-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model, images=False)
base.generate_real_test_file_sst2(test_blank_logits, distill_submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-distill-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre destilovaného modelu BiLSTM na skutečné testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/sst2_BiLSTM_distill_score.png)

In [15]:
# Fresh baseline classifier for the run on the augmented training data.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [16]:
# Training configuration for the baseline run on augmented data.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [17]:
# NOTE(review): presumably re-seeds the random generators before the augmented
# baseline run -- confirm in base.reset_seed.
base.reset_seed()

In [18]:
# Baseline trainer on the augmented training split, validated on eval_data, with
# early stopping after 4 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [19]:
# Train the baseline on the augmented data; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2097,0.417857,0.868119,0.868419,0.867843,0.867993
2,0.0731,0.613483,0.866972,0.870587,0.867896,0.866815
3,0.0403,0.661935,0.858945,0.863384,0.859971,0.858717
4,0.0256,0.721421,0.854358,0.854539,0.854119,0.854238
5,0.017,1.01366,0.864679,0.86497,0.86497,0.864679


TrainOutput(global_step=20845, training_loss=0.07314278174144354, metrics={'train_runtime': 1127.2411, 'train_samples_per_second': 4733.947, 'train_steps_per_second': 36.984, 'total_flos': 0.0, 'train_loss': 0.07314278174144354, 'epoch': 5.0})

In [20]:
# Put the model into evaluation mode (disables the Dropout layer) before inference;
# eval() returns the module, so its structure is displayed as the cell result.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [21]:
# Score the augmented baseline on test_data; the metrics dict is shown as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.21508057415485382,
 'eval_accuracy': 0.9354120267260579,
 'eval_precision': 0.9336257763657936,
 'eval_recall': 0.9363465118079115,
 'eval_f1': 0.9347566841704413,
 'eval_runtime': 7.0643,
 'eval_samples_per_second': 1906.784,
 'eval_steps_per_second': 15.005,
 'epoch': 5.0}

In [22]:
# Persist the augmented baseline's weights under ~/models/<DATASET>/.
base_aug_weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug.pth"
torch.save(model.state_dict(), base_aug_weights_path)

In [25]:
# Predict the unlabeled test split with the augmented baseline and write the
# TSV file for upload to the GLUE benchmark.
base_aug_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-base-aug-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)
base.generate_real_test_file_sst2(test_blank_logits, base_aug_submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-base-aug-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre základního modelu BiLSTM trénovaného na augmentovaných datech na skutečné testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/sst2_BiLSTM_base_aug_score.png)

In [26]:
# Fresh student for distillation on the augmented training data.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [27]:
# Training configuration for the distillation run on augmented data.
# NOTE(review): remove_unused_columns=False presumably keeps the stored teacher
# logits available to DistilTrainer; lambda_param / temp look like the loss mixing
# weight and softmax temperature -- confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug",
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [28]:
# NOTE(review): presumably re-seeds the random generators before the augmented
# distillation run -- confirm in base.reset_seed.
base.reset_seed()

In [29]:
# Distillation trainer on the augmented training split, validated on eval_data,
# with early stopping after 4 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [30]:
# Train the student on the augmented data; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.851,1.43449,0.87844,0.881399,0.877642,0.878007
2,0.2615,1.108655,0.894495,0.894462,0.894586,0.894482
3,0.1556,1.152781,0.880734,0.880724,0.880652,0.880683
4,0.1147,1.126235,0.885321,0.885504,0.885114,0.885234
5,0.0916,1.176221,0.884174,0.885732,0.884788,0.88414
6,0.0763,1.073746,0.87844,0.87844,0.878568,0.87843


TrainOutput(global_step=25014, training_loss=0.2584310964570778, metrics={'train_runtime': 558.8297, 'train_samples_per_second': 9549.062, 'train_steps_per_second': 74.602, 'total_flos': 0.0, 'train_loss': 0.2584310964570778, 'epoch': 6.0})

In [31]:
# Put the student into evaluation mode (disables the Dropout layer) before inference;
# eval() returns the module, so its structure is displayed as the cell result.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [32]:
# Score the augmented student on test_data; the metrics dict is shown as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.462904691696167,
 'eval_accuracy': 0.954046028210839,
 'eval_precision': 0.9529063308919957,
 'eval_recall': 0.9541452775950976,
 'eval_f1': 0.9534883150124864,
 'eval_runtime': 5.4017,
 'eval_samples_per_second': 2493.638,
 'eval_steps_per_second': 19.623,
 'epoch': 6.0}

In [33]:
# Persist the distilled student trained on the augmented data. The checkpoint must
# hold student_model's weights; `model` is the augmented baseline, which is already
# saved as bilstm-base-aug.pth.
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug.pth")

In [34]:
# Predict the unlabeled test split with the augmented student and write the
# TSV file for upload to the GLUE benchmark.
distill_aug_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-distill-aug-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model, images=False)
base.generate_real_test_file_sst2(test_blank_logits, distill_aug_submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-distill-aug-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre destilovaného modelu BiLSTM trénovaného na augmentovaných datech na skutečné testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/sst2_BiLSTM_distill_aug_score.png)

Skóre učitelského modelu (BERT) na skutečné testovací části datasetu

![SST2 test score for best BERT model](../imgs/sst2_BERT_test_score.png)