In [141]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import os
import copy

In [142]:
# Fetch the GloVe dataset through kagglehub; the returned value is used below as the
# directory that contains the embeddings text file.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
# Show where the downloaded dataset lives on this machine.
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [143]:
# Path of the GloVe vectors file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset identifier; it is interpolated into every data, results, logs and model path below.
DATASET = "sst2"

In [144]:
# Every split of the current dataset is stored under this directory.
data_root = f"~/data/{DATASET}"

# Splits used for training, validation and the held-out evaluation below.
train_data = load_from_disk(f"{data_root}/train-logits")
eval_data = load_from_disk(f"{data_root}/eval-logits")
test_data = load_from_disk(f"{data_root}/test-logits")

# Augmented training split and the blank test split used for the GLUE submission files.
all_train_data = load_from_disk(f"{data_root}/train-logits-augmented")
test_blank_data = load_from_disk(f"{data_root}/test-blank-logits")

# Union of the eval, test, augmented-train and blank-test splits; it is only used to build
# the vocabulary. The already-loaded datasets are reused instead of reading the same four
# directories from disk a second time.
# NOTE(review): the plain train split is not part of this union (unchanged from before);
# presumably its sentences are contained in the augmented split, otherwise the later
# word_index lookups on the train tokens would raise KeyError -- confirm.
all_data = concatenate_datasets([eval_data, test_data, all_train_data, test_blank_data])

# Lower-casing tokenizer applied to the "sentence" field of every split.
tokenizer = BasicTokenizer(do_lower_case=True)

In [145]:
# Pick the CUDA device when one is visible to torch, otherwise fall back to the CPU,
# and report the choice.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [146]:
# Tokenize the "sentence" field of every example; each result is a list of token lists,
# one per example and in dataset order.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]
test_data_blank_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_blank_data]

# Tokens of the combined dataset; these feed the vocabulary construction.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [147]:
# Build the vocabulary from the tokens of the combined dataset (all_data_tokens).
# NOTE(review): the container type and ordering returned by base.get_vocab are not
# visible from this notebook.
vocab = base.get_vocab(all_data_tokens)

In [148]:
# Map every vocabulary entry to its position in `vocab` (ids start at 0).
word_index = {word: position for position, word in enumerate(vocab)}

In [149]:
# Load the pre-trained GloVe vectors from GLOVE_FILE; the helper prints how many word
# vectors it found.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [150]:
# Report the vocabulary size, then derive the number of embedding rows from it.
vocab_size = len(vocab)
print(vocab_size)
# NOTE(review): the two extra rows are presumably reserved for special ids such as
# padding / unknown -- confirm against base.get_embedding_matrix and base.padd.
num_tokens = vocab_size + 2
# Width of each embedding vector (the GloVe file in use is the 300d one).
embedding_dim = 300

16152


In [151]:
# Assemble the embedding matrix for the vocabulary from the loaded GloVe vectors; the helper
# prints how many words were converted and how many had no pre-trained vector.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 15775 words (377) misses


In [152]:
# Replace every token by its integer id from word_index, keeping one id list per example.
# A token missing from word_index raises KeyError.
train_data_index = [[word_index[token] for token in tokens] for tokens in train_data_tokens]
eval_data_index = [[word_index[token] for token in tokens] for tokens in eval_data_tokens]
test_data_index = [[word_index[token] for token in tokens] for tokens in test_data_tokens]

all_train_data_index = [[word_index[token] for token in tokens] for tokens in all_train_data_tokens]
test_data_blank_index = [[word_index[token] for token in tokens] for tokens in test_data_blank_tokens]

In [153]:
# Run every id sequence through base.padd with the same second argument.
# NOTE(review): 60 is presumably the fixed sequence length every example is padded or
# truncated to -- confirm in base.padd.
pad_to = 60

train_padded_data = [base.padd(ids, pad_to) for ids in train_data_index]
eval_padded_data = [base.padd(ids, pad_to) for ids in eval_data_index]
test_padded_data = [base.padd(ids, pad_to) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, pad_to) for ids in all_train_data_index]
test_blank_padded_data = [base.padd(ids, pad_to) for ids in test_data_blank_index]

In [154]:
# Attach the padded id sequences to each split as an "input_ids" column. The dataset names
# are rebound to the results, so this cell is not idempotent.
# NOTE(review): re-running it without reloading the splits presumably fails because the
# column already exists -- confirm.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

# Same for the augmented training split and the blank test split.
all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)
test_blank_data = test_blank_data.add_column("input_ids", test_blank_padded_data)

In [155]:
# Single-example, unshuffled loaders over the training split, used by the inference
# benchmarks at the end of the notebook. with_format returns a formatted copy, so
# train_data itself keeps its current format.
train_data_gpu = train_data.with_format(type="torch", columns=["input_ids"], device="cuda")
gpu_data_loader = DataLoader(train_data_gpu, batch_size=1, shuffle=False)

train_data_cpu = train_data.with_format(type="torch", columns=["input_ids"], device="cpu")
cpu_data_loader = DataLoader(train_data_cpu, batch_size=1, shuffle=False)

In [156]:
# Baseline BiLSTM classifier initialised from the GloVe embedding matrix, with two output
# classes.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [157]:
# Print the module layout (embedding, bidirectional LSTM, two linear layers with dropout).
print(model)

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)


In [158]:
# Training configuration for the baseline run on the original training split; results and
# logs go to the dataset-specific "bilstm-base" directories.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base",
    logging_dir=f"~/logs/{DATASET}/bilstm-base",
    lr=0.0008,
    epochs=20,
    warmup_steps=30,
    weight_decay=0.008,
)

In [159]:
# Reset the random seed(s) through the project helper before the trainer is built.
# NOTE(review): the model for this run was constructed in an earlier cell, so its weight
# initialisation is not covered by this reset -- confirm whether the reset should happen
# before the model is created.
base.reset_seed()

In [160]:
# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline trainer: fits `model` on the original training split and validates on eval_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [161]:
# Train the baseline model; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4003,0.423575,0.800459,0.803017,0.801276,0.800281
2,0.3063,0.40766,0.840596,0.843909,0.83968,0.839912
3,0.2472,0.42252,0.829128,0.833631,0.828039,0.828174
4,0.1974,0.498381,0.825688,0.828962,0.824745,0.824913
5,0.1536,0.48665,0.844037,0.845379,0.844616,0.843996
6,0.1181,0.497017,0.849771,0.849713,0.849741,0.849726
7,0.0906,0.578221,0.844037,0.846641,0.843226,0.84348
8,0.0664,0.645509,0.844037,0.845589,0.844658,0.843984
9,0.0495,0.766352,0.845183,0.847871,0.845994,0.845056


TrainOutput(global_step=3789, training_loss=0.18104239559702145, metrics={'train_runtime': 83.2241, 'train_samples_per_second': 12947.934, 'train_steps_per_second': 101.173, 'total_flos': 0.0, 'train_loss': 0.18104239559702145, 'epoch': 9.0})

In [162]:
# Call eval() on the trained baseline before inference; the call returns the module, whose
# layout is shown as the cell result.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [163]:
# Evaluate the baseline on test_data (instead of the trainer's eval_dataset); the metrics
# dict is displayed as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.211845263838768,
 'eval_accuracy': 0.9325167037861916,
 'eval_precision': 0.9311167924071151,
 'eval_recall': 0.9323762422077401,
 'eval_f1': 0.9317044328462286,
 'eval_runtime': 4.278,
 'eval_samples_per_second': 3148.681,
 'eval_steps_per_second': 24.778,
 'epoch': 9.0}

In [164]:
# Persist the baseline weights under the user's home directory.
base_checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base.pth"
torch.save(model.state_dict(), base_checkpoint_path)

In [165]:
# Expose the blank test split as torch tensors on the device selected at the top of the
# notebook. Previously "cuda" was hard-coded here, which ignored the CPU fallback; on a GPU
# machine device.type is still "cuda", so behaviour there is unchanged.
# Note that set_format modifies test_blank_data in place.
test_blank_data.set_format(type="torch", columns=["input_ids"], device=device.type)
# Fixed-order batches of 128; this loader is reused by every later submission cell.
test_blank_dataloader = DataLoader(test_blank_data, batch_size=128, shuffle=False)
# Logits of the baseline model on the blank test split.
test_blank_logits = base.generate_logits(test_blank_dataloader, model)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

In [166]:
# Write the baseline predictions on the blank test split to a TSV file for upload to the
# GLUE benchmark.
base_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-base-test.tsv"
base.generate_real_test_file_sst2(test_blank_logits, base_submission_path)

Created output file named: /home/jovyan/data/sst2/bilstm-base-test.tsv upload it to GLUE benchmark to obtain results!


In [195]:
# Student for the distillation run on the original training split; same architecture as the
# baseline model.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [196]:
# Training configuration for the distillation run on the original training split.
# remove_unused_columns=False keeps the dataset's extra columns available to the trainer.
# NOTE(review): lambda_param and temp are presumably the distillation loss weight and the
# softmax temperature -- confirm in base.DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill",
    lr=0.0009,
    epochs=20,
    lambda_param=0.8,
    temp=6.5,
    warmup_steps=40,
    weight_decay=0.05,
)

In [197]:
# Reset the random seed(s) through the project helper before the distillation trainer is
# built.
# NOTE(review): student_model was constructed in an earlier cell, so its weight
# initialisation is not covered by this reset -- confirm whether the reset should happen
# before the model is created.
base.reset_seed()

In [198]:
# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation trainer: fits student_model on the original training split and validates on
# eval_data.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [199]:
# Train the distilled student; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7294,2.047331,0.81078,0.811102,0.811074,0.81078
2,1.7477,2.005532,0.840596,0.842283,0.839932,0.840171
3,1.2549,2.025156,0.838303,0.84128,0.837427,0.837657
4,0.9321,1.948179,0.837156,0.842783,0.835964,0.8361
5,0.7048,2.052461,0.847477,0.848205,0.84791,0.847467
6,0.5393,1.975898,0.845183,0.84552,0.845489,0.845183
7,0.4287,1.846855,0.858945,0.858899,0.859003,0.858923
8,0.3453,1.949711,0.850917,0.85111,0.851162,0.850917
9,0.2838,1.958161,0.857798,0.857912,0.858003,0.857795
10,0.2264,2.103384,0.853211,0.855525,0.853961,0.853118


TrainOutput(global_step=4210, training_loss=0.9192459360154395, metrics={'train_runtime': 97.1213, 'train_samples_per_second': 11095.199, 'train_steps_per_second': 86.696, 'total_flos': 0.0, 'train_loss': 0.9192459360154395, 'epoch': 10.0})

In [200]:
# Call eval() on the trained student before inference; the call returns the module, whose
# layout is shown as the cell result.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [201]:
# Evaluate the distilled student on test_data (instead of the trainer's eval_dataset); the
# metrics dict is displayed as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.7786824703216553,
 'eval_accuracy': 0.9403860430586488,
 'eval_precision': 0.9389683297197238,
 'eval_recall': 0.9405437586574459,
 'eval_f1': 0.9396900728097117,
 'eval_runtime': 5.3219,
 'eval_samples_per_second': 2531.068,
 'eval_steps_per_second': 19.918,
 'epoch': 10.0}

In [202]:
# Persist the distilled student's weights under the user's home directory.
distill_checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill.pth"
torch.save(student_model.state_dict(), distill_checkpoint_path)

In [203]:
# Score the blank test split with the distilled student (reusing the loader built for the
# baseline) and write the TSV file for upload to the GLUE benchmark.
distill_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-distill-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model)
base.generate_real_test_file_sst2(test_blank_logits, distill_submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-distill-test.tsv upload it to GLUE benchmark to obtain results!


In [176]:
# Fresh baseline classifier for the run on the augmented training split; this rebinds
# `model`, replacing the previously trained baseline.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [177]:
# Training configuration for the baseline run on the augmented training split; results and
# logs go to the dataset-specific "bilstm-base-aug" directories.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug",
    lr=0.0015,
    epochs=20,
    weight_decay=0.01,
    warmup_steps=30,
)

In [178]:
# Reset the random seed(s) through the project helper before the trainer is built.
# NOTE(review): the model for this run was constructed in an earlier cell, so its weight
# initialisation is not covered by this reset -- confirm whether the reset should happen
# before the model is created.
base.reset_seed()

In [179]:
# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline trainer on the augmented training split; validation still uses eval_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [180]:
# Train the baseline on the augmented split; the returned TrainOutput is displayed as the
# cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2496,0.408391,0.861239,0.862059,0.860792,0.861015
2,0.1021,0.522532,0.849771,0.850225,0.85012,0.849769
3,0.057,0.598302,0.868119,0.868871,0.868559,0.868111
4,0.0379,0.744056,0.853211,0.853211,0.85333,0.853199
5,0.0273,0.877345,0.855505,0.858107,0.856298,0.855395
6,0.0215,0.942461,0.861239,0.861636,0.860918,0.861085


TrainOutput(global_step=13770, training_loss=0.08254670064866586, metrics={'train_runtime': 171.9199, 'train_samples_per_second': 34159.634, 'train_steps_per_second': 266.985, 'total_flos': 0.0, 'train_loss': 0.08254670064866586, 'epoch': 6.0})

In [181]:
# Call eval() on the augmented-data baseline before inference; the call returns the module,
# whose layout is shown as the cell result.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [182]:
# Evaluate the augmented-data baseline on test_data (instead of the trainer's eval_dataset);
# the metrics dict is displayed as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.2846313714981079,
 'eval_accuracy': 0.9418708240534521,
 'eval_precision': 0.940450301472669,
 'eval_recall': 0.9420834857985264,
 'eval_f1': 0.9411959783305908,
 'eval_runtime': 4.143,
 'eval_samples_per_second': 3251.286,
 'eval_steps_per_second': 25.585,
 'epoch': 6.0}

In [183]:
# Persist the augmented-data baseline weights under the user's home directory.
base_aug_checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug.pth"
torch.save(model.state_dict(), base_aug_checkpoint_path)

In [184]:
# Score the blank test split with the augmented-data baseline and write the TSV file for
# upload to the GLUE benchmark.
base_aug_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-base-aug-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, model)
base.generate_real_test_file_sst2(test_blank_logits, base_aug_submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-base-aug-test.tsv upload it to GLUE benchmark to obtain results!


In [185]:
# Fresh student for the distillation run on the augmented training split; this rebinds
# `student_model`.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=2,
)

In [186]:
# Training configuration for the distillation run on the augmented training split.
# remove_unused_columns=False keeps the dataset's extra columns available to the trainer.
# NOTE(review): lambda_param and temp are presumably the distillation loss weight and the
# softmax temperature -- confirm in base.DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug",
    lr=0.002,
    epochs=20,
    weight_decay=0.005,
    warmup_steps=60,
    lambda_param=0.6,
    temp=4,
)

In [187]:
# Reset the random seed(s) through the project helper before the distillation trainer is
# built.
# NOTE(review): student_model was constructed in an earlier cell, so its weight
# initialisation is not covered by this reset -- confirm whether the reset should happen
# before the model is created.
base.reset_seed()

In [188]:
# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation trainer on the augmented training split; validation still uses eval_data.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [189]:
# Train the student on the augmented split; the returned TrainOutput is displayed as the
# cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7797,1.04171,0.865826,0.867457,0.865212,0.865499
2,0.2482,1.082026,0.865826,0.866664,0.865381,0.865609
3,0.1521,1.007746,0.888761,0.888742,0.888871,0.88875
4,0.1157,1.133531,0.881881,0.881962,0.882072,0.881877
5,0.0964,1.017902,0.870413,0.871107,0.870011,0.870227
6,0.0823,0.99948,0.884174,0.88412,0.884198,0.884149


TrainOutput(global_step=13770, training_loss=0.2457339849866813, metrics={'train_runtime': 181.7204, 'train_samples_per_second': 32317.341, 'train_steps_per_second': 252.586, 'total_flos': 0.0, 'train_loss': 0.2457339849866813, 'epoch': 6.0})

In [190]:
# Call eval() on the augmented-data student before inference; the call returns the module,
# whose layout is shown as the cell result.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(16154, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=2, bias=True)
)

In [191]:
# Evaluate the augmented-data student on test_data (instead of the trainer's eval_dataset);
# the metrics dict is displayed as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.36059969663619995,
 'eval_accuracy': 0.9543429844097996,
 'eval_precision': 0.9533770982716383,
 'eval_recall': 0.9541851628471268,
 'eval_f1': 0.9537658180823729,
 'eval_runtime': 5.1442,
 'eval_samples_per_second': 2618.491,
 'eval_steps_per_second': 20.606,
 'epoch': 6.0}

In [192]:
# Persist the distilled student trained on the augmented split. The weights must come from
# `student_model`, the network the DistilTrainer above just trained. `model` is the
# non-distilled augmented baseline (already stored as bilstm-base-aug.pth); saving it here
# would put the wrong network under the distill checkpoint name.
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug.pth")

In [193]:
# Score the blank test split with the augmented-data student and write the TSV file for
# upload to the GLUE benchmark.
distill_aug_submission_path = f"{os.path.expanduser('~')}/data/{DATASET}/bilstm-distill-aug-test.tsv"
test_blank_logits = base.generate_logits(test_blank_dataloader, student_model)
base.generate_real_test_file_sst2(test_blank_logits, distill_aug_submission_path)

Generating logits for given dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Created output file named: /home/jovyan/data/sst2/bilstm-distill-aug-test.tsv upload it to GLUE benchmark to obtain results!


In [None]:
# Report the model size and the per-module trainable parameter counts of whichever network
# is currently bound to `student_model`.
base.count_parameters(student_model)

model size: 24.918MB.
Total Trainable Params: 1686002.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [None]:
# Time student inference on the CPU using the single-example CPU loader; 1000 is passed as
# the run count, which matches the "1000 runs" in the printed measurement.
cpu_benchmark = base.BenchMarkRunner(
    student_model,
    cpu_data_loader,
    "cpu",
    1000,
)
cpu_measurement = cpu_benchmark.run_benchmark()
print(cpu_measurement)

<torch.utils.benchmark.utils.common.Measurement object at 0x781407387fa0>
self.infer_speed_comp()
  3.45 ms
  1 measurement, 1000 runs , 4 threads


In [None]:
# Time student inference on the GPU using the single-example CUDA loader; 1000 is passed as
# the run count, which matches the "1000 runs" in the printed measurement.
gpu_benchmark = base.BenchMarkRunner(
    student_model,
    gpu_data_loader,
    "cuda",
    1000,
)
gpu_measurement = gpu_benchmark.run_benchmark()
print(gpu_measurement)

<torch.utils.benchmark.utils.common.Measurement object at 0x78145c7b40a0>
self.infer_speed_comp()
  1.74 ms
  1 measurement, 1000 runs , 4 threads
