In [2]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Fetch the GloVe 6B 300-d dataset through kagglehub; the call returns the local
# directory that holds the dataset files (printed so the resolved path is visible).
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [4]:
# Path of the GloVe text file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset key interpolated into every data / results / logs / models path below.
DATASET = "sst2"

In [5]:
# Root directory of the pre-computed splits for the selected dataset.
data_root = f"~/data/{DATASET}"

# Original train / validation / test splits.
train_data = load_from_disk(f"{data_root}/train-logits")
eval_data = load_from_disk(f"{data_root}/eval-logits")
test_data = load_from_disk(f"{data_root}/test-logits")

# Augmented training set and the "blank" test split that is later used to
# produce the GLUE submission files.
all_train_data = load_from_disk(f"{data_root}/train-logits-augmented")
test_blank_data = load_from_disk(f"{data_root}/test-blank-logits")

# Corpus used for vocabulary building. The splits loaded above are reused
# instead of being read from disk a second time; the concatenation order
# (eval, test, augmented train, blank test) is unchanged.
# NOTE(review): train-logits is not part of this corpus, so the later
# word_index lookups rely on every train token also occurring in one of
# these splits — presumably the augmented set contains the originals; confirm.
all_data = concatenate_datasets([eval_data, test_data, all_train_data, test_blank_data])
tokenizer = BasicTokenizer(do_lower_case=True)

In [6]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice so the run log shows which hardware was used.
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [7]:
# Tokenize the "sentence" field of every example, one token list per example.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

# Same for the augmented training set and the blank test split.
all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]
test_data_blank_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_blank_data]

# Tokens of the combined corpus, used for vocabulary building.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [8]:
# Build the vocabulary from the tokens of the combined corpus
# (base.get_vocab is a project helper; its exact ordering is not visible here).
vocab = base.get_vocab(all_data_tokens)

In [9]:
# Map each vocabulary entry to its position in `vocab` (first entry -> 0).
word_index = {token: position for position, token in enumerate(vocab)}

In [10]:
# Load the pre-trained GloVe vectors from GLOVE_FILE via the project helper
# (this run reports 400000 word vectors found).
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [11]:
# Show the vocabulary size for reference.
print(len(vocab))
# Embedding rows = vocabulary size + 2 — presumably two reserved rows
# (e.g. padding / unknown); confirm against base.get_embedding_matrix.
num_tokens = len(vocab) + 2
# Must match the dimensionality of the GloVe file (300-d vectors).
embedding_dim = 300

16152


In [12]:
# Assemble the embedding matrix for the vocabulary from the GloVe vectors;
# the helper reports how many words were converted and how many were misses.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 15775 words (377) misses


In [13]:
# Replace every token by its vocabulary index; a token missing from
# word_index raises KeyError, exactly as a plain dict lookup does.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]
test_data_blank_index = [[word_index[token] for token in sentence] for sentence in test_data_blank_tokens]

In [14]:
# Second argument handed to base.padd for every split — presumably the fixed
# sequence length each index list is padded/truncated to; confirm in base.padd.
# Kept in one place so all splits are guaranteed to use the same value.
MAX_SEQ_LEN = 60

train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in train_data_index]
eval_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in eval_data_index]
test_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in all_train_data_index]
test_blank_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in test_data_blank_index]

In [15]:
def _with_input_ids(dataset, padded_ids):
    """Return `dataset` with an `input_ids` column holding `padded_ids`.

    A pre-existing `input_ids` column is dropped first, so re-running this
    cell replaces the column instead of failing on a duplicate column name.
    """
    if "input_ids" in dataset.column_names:
        dataset = dataset.remove_columns("input_ids")
    return dataset.add_column("input_ids", padded_ids)


# Attach the padded index sequences to each split as the model input column.
train_data = _with_input_ids(train_data, train_padded_data)
eval_data = _with_input_ids(eval_data, eval_padded_data)
test_data = _with_input_ids(test_data, test_padded_data)

all_train_data = _with_input_ids(all_train_data, all_train_padded_data)
test_blank_data = _with_input_ids(test_blank_data, test_blank_padded_data)

In [15]:
# Baseline BiLSTM classifier initialised from the GloVe embedding matrix.
# freeze_embed=False — presumably leaves the embedding weights trainable;
# confirm in base.BiLSTMClassifier.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
    freeze_embed=False,
)

In [16]:
# Print the module tree to check layer sizes (embedding rows should equal num_tokens).
print(model)

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)


In [17]:
# Training configuration for the baseline run on the original training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_embedd",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [18]:
# Re-seed through the project helper before the trainer is created.
# NOTE(review): `model` was constructed in an earlier cell, i.e. before this
# call, so its random weight initialisation is not covered by this reseed —
# confirm whether that is intended.
base.reset_seed()

In [19]:
# Stop training once the monitored metric has failed to improve for 3
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Plain Hugging Face Trainer for the baseline: original training split,
# validated on eval_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [20]:
# Run training; the table below lists per-epoch loss and validation metrics.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.316,0.484727,0.811927,0.830696,0.814094,0.809902
2,0.1764,0.431867,0.836009,0.836462,0.835638,0.835802
3,0.1182,0.413911,0.849771,0.849713,0.849741,0.849726
4,0.0823,0.608139,0.840596,0.841192,0.840185,0.840368
5,0.057,0.673341,0.829128,0.829978,0.828629,0.82882
6,0.0408,0.841219,0.836009,0.840632,0.834923,0.835093


TrainOutput(global_step=2526, training_loss=0.1317673847020286, metrics={'train_runtime': 256.3385, 'train_samples_per_second': 2101.869, 'train_steps_per_second': 16.424, 'total_flos': 0.0, 'train_loss': 0.1317673847020286, 'epoch': 6.0})

In [21]:
# Switch the model to evaluation mode (dropout disabled) before inference.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [22]:
# Score the trained baseline on the test-logits split; returns the metrics dict.
trainer.evaluate(test_data)

{'eval_loss': 0.1866329163312912,
 'eval_accuracy': 0.9335560504825539,
 'eval_precision': 0.9317930301132494,
 'eval_recall': 0.9343348203568249,
 'eval_f1': 0.932866655056092,
 'eval_runtime': 6.9727,
 'eval_samples_per_second': 1931.813,
 'eval_steps_per_second': 15.202,
 'epoch': 6.0}

In [23]:
# Persist only the weights (state_dict) of the baseline model under ~/models.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_embedd.pth"
torch.save(model.state_dict(), checkpoint_path)

In [25]:
# Expose only `input_ids`, as torch tensors placed on the device selected
# earlier in the notebook, so the cell also runs on the CPU fallback instead
# of hard-coding "cuda".
test_blank_data.set_format(type="torch", columns=["input_ids"], device=device)
# shuffle=False keeps the dataset order, so logits line up with the test rows.
test_blank_dataloader = DataLoader(test_blank_data, batch_size=128, shuffle=False)
# Run the baseline model over the blank test split via the project helper.
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

In [25]:
# Write the GLUE submission file for the baseline model.
submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-base-embedd-test.tsv"
base.generate_real_test_file_sst2(test_blank_logits, submission_path)

Created output file named: /home/jovyan/data/sst2/bilstm-base-embedd-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre na pravé testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/sst2_BiLSTM_base_embedd_score.png)

**updated**

In [26]:
# Student network for distillation; same architecture and hyper-parameters
# as the baseline BiLSTM.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
    freeze_embed=False,
)

In [27]:
# Training configuration for the distillation run on the original training split.
# lambda_param / temp are forwarded to the project helper — presumably the
# distillation loss weight and softmax temperature; confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_embedd",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_embedd",
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [28]:
# Re-seed through the project helper before the distillation trainer is created.
# NOTE(review): `student_model` was constructed in an earlier cell, before
# this call, so its weight initialisation is not covered by this reseed.
base.reset_seed()

In [29]:
# Stop training once the monitored metric has failed to improve for 3
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Project-specific distillation trainer. No teacher model is passed here —
# presumably it distils from teacher logits stored in the dataset; confirm
# in base.DistilTrainer.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [30]:
# Run distillation training; the table below lists per-epoch loss and validation metrics.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7193,2.328375,0.805046,0.816254,0.806748,0.803853
2,0.7023,1.488348,0.854358,0.854302,0.85433,0.854315
3,0.4009,1.43368,0.852064,0.852165,0.851867,0.851961
4,0.2527,1.453512,0.861239,0.862418,0.860708,0.86096
5,0.1799,1.513405,0.858945,0.860113,0.858413,0.858662
6,0.1305,1.5147,0.854358,0.854352,0.854246,0.854289
7,0.1015,1.527988,0.862385,0.862537,0.862171,0.862281
8,0.081,1.514017,0.858945,0.859337,0.858624,0.858789
9,0.0663,1.546627,0.854358,0.854311,0.854414,0.854335
10,0.0571,1.563679,0.858945,0.859054,0.85875,0.858847


TrainOutput(global_step=4210, training_loss=0.369151922302971, metrics={'train_runtime': 361.5854, 'train_samples_per_second': 1490.077, 'train_steps_per_second': 11.643, 'total_flos': 0.0, 'train_loss': 0.369151922302971, 'epoch': 10.0})

In [31]:
# Switch the student to evaluation mode (dropout disabled) before inference.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [32]:
# Score the distilled student on the test-logits split; returns the metrics dict.
trainer.evaluate(test_data)

{'eval_loss': 0.5100557804107666,
 'eval_accuracy': 0.9475129918337045,
 'eval_precision': 0.946323592092529,
 'eval_recall': 0.9474888424079841,
 'eval_f1': 0.9468724896404928,
 'eval_runtime': 10.3649,
 'eval_samples_per_second': 1299.583,
 'eval_steps_per_second': 10.227,
 'epoch': 10.0}

In [33]:
# Persist only the weights (state_dict) of the distilled student under ~/models.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_embedd.pth"
torch.save(student_model.state_dict(), checkpoint_path)

In [34]:
# Run the distilled student over the blank test split and write its GLUE
# submission file.
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model, images=False)
submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-distill-embedd-test.tsv"
base.generate_real_test_file_sst2(test_blank_logits, submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-distill-embedd-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre na pravé testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/sst2_BiLSTM_distill_embedd_score.png)


**UPDATED**

In [16]:
# Fresh baseline BiLSTM for the run on the augmented training set
# (same architecture and hyper-parameters as the first baseline).
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
    freeze_embed=False,
)

In [17]:
# Training configuration for the baseline run on the augmented training set.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_embedd",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [18]:
# Re-seed through the project helper before the trainer is created.
# NOTE(review): `model` was constructed in an earlier cell, before this call,
# so its weight initialisation is not covered by this reseed.
base.reset_seed()

In [19]:
# Stop training once the monitored metric has failed to improve for 4
# consecutive evaluations (one more than in the non-augmented runs).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

# Plain Hugging Face Trainer for the baseline on the augmented training set,
# validated on eval_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [20]:
# Run training on the augmented set; the table below lists per-epoch loss and validation metrics.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1297,0.627202,0.833716,0.83395,0.833428,0.833556
2,0.0456,0.935746,0.819954,0.82039,0.820293,0.819952
3,0.0256,1.162975,0.81078,0.810971,0.810485,0.810598
4,0.0157,1.415282,0.809633,0.810189,0.80919,0.809343
5,0.0099,1.797763,0.797018,0.797096,0.797182,0.797012


TrainOutput(global_step=20845, training_loss=0.045277036400143265, metrics={'train_runtime': 1157.4535, 'train_samples_per_second': 4610.38, 'train_steps_per_second': 36.019, 'total_flos': 0.0, 'train_loss': 0.045277036400143265, 'epoch': 5.0})

In [21]:
# Switch the model to evaluation mode (dropout disabled) before inference.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [22]:
# Score the augmented-data baseline on the test-logits split; returns the metrics dict.
trainer.evaluate(test_data)

{'eval_loss': 0.23940490186214447,
 'eval_accuracy': 0.94090571640683,
 'eval_precision': 0.9401852850003345,
 'eval_recall': 0.9399999785491082,
 'eval_f1': 0.9400918558912927,
 'eval_runtime': 47.8406,
 'eval_samples_per_second': 281.56,
 'eval_steps_per_second': 2.216,
 'epoch': 5.0}

In [23]:
# Persist only the weights (state_dict) of the augmented-data baseline under ~/models.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_embedd.pth"
torch.save(model.state_dict(), checkpoint_path)

In [26]:
# Run the augmented-data baseline over the blank test split and write its
# GLUE submission file.
test_blank_logits = base.generate_logits(test_blank_dataloader, model, images=False)
submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-base-aug-embedd-test.tsv"
base.generate_real_test_file_sst2(test_blank_logits, submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-base-aug-embedd-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre na pravé testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/sst2_BiLSTM_base_aug_embedd_score.png)

In [27]:
# Fresh student network for distillation on the augmented training set
# (same architecture and hyper-parameters as the baseline BiLSTM).
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
    freeze_embed=False,
)

In [28]:
# Training configuration for the distillation run on the augmented training set.
# lambda_param / temp are forwarded to the project helper — presumably the
# distillation loss weight and softmax temperature; confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_embedd",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_embedd",
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [29]:
# Re-seed through the project helper before the distillation trainer is created.
# NOTE(review): `student_model` was constructed in an earlier cell, before
# this call, so its weight initialisation is not covered by this reseed.
base.reset_seed()

In [30]:
# Stop training once the monitored metric has failed to improve for 4
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

# Project-specific distillation trainer on the augmented training set.
# No teacher model is passed here — presumably it distils from teacher logits
# stored in the dataset; confirm in base.DistilTrainer.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [31]:
# Run distillation training on the augmented set; the table below lists per-epoch metrics.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4712,1.24585,0.879587,0.879938,0.879904,0.879587
2,0.1505,1.371614,0.856651,0.859405,0.857466,0.856533
3,0.1007,1.373434,0.854358,0.854462,0.854161,0.854256
4,0.0754,1.24974,0.862385,0.8625,0.862592,0.862382
5,0.0605,1.259394,0.869266,0.869559,0.869559,0.869266


TrainOutput(global_step=20845, training_loss=0.17166545425967708, metrics={'train_runtime': 445.1474, 'train_samples_per_second': 11987.714, 'train_steps_per_second': 93.654, 'total_flos': 0.0, 'train_loss': 0.17166545425967708, 'epoch': 5.0})

In [32]:
# Switch the student to evaluation mode (dropout disabled) before inference.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [33]:
# Score the augmented-data distilled student on the test-logits split; returns the metrics dict.
trainer.evaluate(test_data)

{'eval_loss': 0.4344218969345093,
 'eval_accuracy': 0.9533036377134373,
 'eval_precision': 0.9520047557400126,
 'eval_recall': 0.9536539181037126,
 'eval_f1': 0.9527599989685138,
 'eval_runtime': 5.6925,
 'eval_samples_per_second': 2366.29,
 'eval_steps_per_second': 18.621,
 'epoch': 5.0}

In [34]:
# Persist the weights of the distilled student trained on the augmented set.
# The object saved must be `student_model`: `model` still refers to the
# augmented *baseline*, so saving it here would store the wrong network
# under the distillation checkpoint name.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_embedd.pth"
torch.save(student_model.state_dict(), checkpoint_path)

In [35]:
# Run the augmented-data distilled student over the blank test split and
# write its GLUE submission file.
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model, images=False)
submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-distill-aug-embedd-test.tsv"
base.generate_real_test_file_sst2(test_blank_logits, submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-distill-aug-embedd-test.tsv upload it to GLUE benchmark to obtain results!


Reálné skóre na pravé testovací části datasetu

![Real test score (GLUE Benchmark)](../imgs/ss2_BiLSTM_distil_aug_embedd_score.png)

Skóre učitelského modelu na reálné testovací části datasetu

![SST2 test score for best BERT model](../imgs/sst2_BERT_test_score.png)