In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import copy
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "trec"

In [4]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_fine")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_fine")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_fine")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_fine")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits_fine", f"~/data/{DATASET}/test-logits_fine", f"~/data/{DATASET}/train-logits-augmented_fine"]])
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [7]:
vocab = base.get_vocab(all_data_tokens)

In [8]:
word_index = dict(zip(vocab, range(len(vocab))))

In [9]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

8766


In [11]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [13]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [14]:
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [15]:
train_data_gpu = copy.deepcopy(train_data)
train_data_gpu.set_format(type="torch", columns=["input_ids"], device="cuda")
gpu_data_loader = DataLoader(train_data_gpu, batch_size=1, shuffle=False)

train_data_cpu = copy.deepcopy(train_data)
train_data_cpu.set_format(type="torch", columns=["input_ids"], device="cpu")
cpu_data_loader = DataLoader(train_data_cpu, batch_size=1, shuffle=False)


In [None]:
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50)

In [103]:
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)


In [104]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base_fine", logging_dir=f"~/logs/{DATASET}/bilstm-base_fine", lr=.001,  epochs=10, batch_size=128)

In [105]:
base.reset_seed()

In [106]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [107]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0739,2.503414,0.384051,0.073851,0.083262,0.058125
2,2.1646,1.925456,0.534372,0.137764,0.15621,0.131718
3,1.7397,1.590624,0.591201,0.241389,0.196628,0.192703
4,1.3876,1.379046,0.653529,0.304337,0.265769,0.264644
5,1.1608,1.252556,0.684693,0.352104,0.320205,0.31985
6,0.961,1.202357,0.696609,0.392709,0.365802,0.362126
7,0.8101,1.203011,0.683776,0.414235,0.343203,0.355138
8,0.7355,1.159855,0.701192,0.406064,0.392893,0.389383
9,0.6311,1.167883,0.698442,0.422664,0.391569,0.396941
10,0.5628,1.154422,0.701192,0.425888,0.394864,0.398052


TrainOutput(global_step=350, training_loss=1.3227099609375, metrics={'train_runtime': 66.8441, 'train_samples_per_second': 652.414, 'train_steps_per_second': 5.236, 'total_flos': 0.0, 'train_loss': 1.3227099609375, 'epoch': 10.0})

In [108]:
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [109]:
trainer.evaluate(test_data)

{'eval_loss': 1.0533905029296875,
 'eval_accuracy': 0.712,
 'eval_precision': 0.414032074537255,
 'eval_recall': 0.4614954277729287,
 'eval_f1': 0.4219846375382348,
 'eval_runtime': 4.6833,
 'eval_samples_per_second': 106.761,
 'eval_steps_per_second': 0.854,
 'epoch': 10.0}

In [24]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_fine.pth")

In [25]:
base.count_parameters(model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [37]:
cpu_benchmark = base.BenchMarkRunner(model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e347187eb0>
self.infer_speed_comp()
  4.36 ms
  1 measurement, 1000 runs , 6 threads


In [39]:
gpu_benchmark = base.BenchMarkRunner(model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e345fb7f10>
self.infer_speed_comp()
  1.88 ms
  1 measurement, 1000 runs , 6 threads


In [40]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50)

In [41]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine", lr=.001,  epochs=10, batch_size=128, lambda_param=.4, temp=2)

In [42]:
base.reset_seed()

In [43]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [44]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6998,2.159288,0.384051,0.063069,0.081779,0.052825
2,1.8978,1.693547,0.505958,0.11498,0.133844,0.10829
3,1.5812,1.450519,0.562786,0.193427,0.173613,0.15687
4,1.3257,1.280039,0.637947,0.273468,0.23218,0.222331
5,1.1359,1.16361,0.668194,0.275144,0.266553,0.258865
6,0.9779,1.101211,0.678277,0.301448,0.286788,0.2827
7,0.8609,1.09632,0.671861,0.340059,0.283197,0.283167
8,0.7997,1.036376,0.698442,0.365521,0.318395,0.3189
9,0.7313,1.018743,0.699358,0.35007,0.321879,0.323286
10,0.6784,1.006209,0.703025,0.381331,0.328636,0.329858


TrainOutput(global_step=350, training_loss=1.268863983154297, metrics={'train_runtime': 64.6098, 'train_samples_per_second': 674.975, 'train_steps_per_second': 5.417, 'total_flos': 0.0, 'train_loss': 1.268863983154297, 'epoch': 10.0})

In [45]:
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [46]:
trainer.evaluate(test_data)

{'eval_loss': 0.9847412705421448,
 'eval_accuracy': 0.706,
 'eval_precision': 0.34694630717536823,
 'eval_recall': 0.3712546955537928,
 'eval_f1': 0.3264487866139811,
 'eval_runtime': 4.7634,
 'eval_samples_per_second': 104.967,
 'eval_steps_per_second': 0.84,
 'epoch': 10.0}

In [47]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_fine.pth")

In [48]:
base.count_parameters(student_model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [49]:
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e3644016f0>
self.infer_speed_comp()
  4.59 ms
  1 measurement, 1000 runs , 6 threads


In [50]:
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e1cbc23d00>
self.infer_speed_comp()
  1.88 ms
  1 measurement, 1000 runs , 6 threads


In [51]:
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50)

In [52]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base-aug_fine", logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_fine", lr=.001,  epochs=10, batch_size=128)

In [53]:
base.reset_seed()

In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
)

In [55]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0459,1.023053,0.758937,0.635869,0.536293,0.552683
2,0.177,1.187909,0.786434,0.691892,0.641475,0.65062
3,0.0557,1.274335,0.80385,0.723567,0.669486,0.686155
4,0.024,1.442288,0.794684,0.68519,0.649686,0.651868
5,0.0108,1.473755,0.812099,0.730327,0.689053,0.696772
6,0.005,1.585668,0.805683,0.728329,0.681206,0.692194
7,0.0037,1.6244,0.814849,0.785707,0.684487,0.713712
8,0.0016,1.612765,0.814849,0.761628,0.692895,0.713338
9,0.0007,1.629979,0.817599,0.780827,0.704461,0.72828
10,0.0004,1.663172,0.815765,0.771773,0.693528,0.71878


TrainOutput(global_step=5250, training_loss=0.13248115424598966, metrics={'train_runtime': 123.4803, 'train_samples_per_second': 5432.445, 'train_steps_per_second': 42.517, 'total_flos': 0.0, 'train_loss': 0.13248115424598966, 'epoch': 10.0})

In [56]:
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [57]:
trainer.evaluate(test_data)

{'eval_loss': 1.3135536909103394,
 'eval_accuracy': 0.846,
 'eval_precision': 0.7058719827930602,
 'eval_recall': 0.7214950186575256,
 'eval_f1': 0.6899065260384348,
 'eval_runtime': 4.4088,
 'eval_samples_per_second': 113.409,
 'eval_steps_per_second': 0.907,
 'epoch': 10.0}

In [58]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_fine.pth")

In [59]:
base.count_parameters(model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [60]:
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e1a4f0e530>
self.infer_speed_comp()
  4.94 ms
  1 measurement, 1000 runs , 6 threads


In [61]:
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e1a4f36b00>
self.infer_speed_comp()
  1.94 ms
  1 measurement, 1000 runs , 6 threads


In [16]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50)

In [17]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill-aug_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_fine", lr=0.0015, weight_decay=0.01, warmup_steps=45, adam_beta1=.95, epochs=30, batch_size=128, lambda_param=.9, temp=2)

In [18]:
base.reset_seed()

In [19]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8312,0.532684,0.759853,0.40233,0.389453,0.384507
2,0.1825,0.457369,0.814849,0.600572,0.528745,0.548273
3,0.1039,0.428102,0.824931,0.67166,0.584802,0.611524
4,0.0826,0.424568,0.826764,0.735698,0.650434,0.674225
5,0.0717,0.427975,0.828598,0.772014,0.660684,0.695346
6,0.0669,0.407494,0.83868,0.746627,0.650151,0.680647
7,0.065,0.396718,0.842346,0.810542,0.679114,0.724032
8,0.0609,0.401809,0.828598,0.78886,0.66302,0.704743
9,0.0582,0.394696,0.842346,0.794766,0.696065,0.731332
10,0.0558,0.389493,0.846929,0.824274,0.697782,0.738985


TrainOutput(global_step=8925, training_loss=0.11357396358201484, metrics={'train_runtime': 222.9195, 'train_samples_per_second': 9027.473, 'train_steps_per_second': 70.653, 'total_flos': 0.0, 'train_loss': 0.11357396358201484, 'epoch': 17.0})

In [21]:
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [22]:
trainer.evaluate(test_data)

{'eval_loss': 0.2879835367202759,
 'eval_accuracy': 0.84,
 'eval_precision': 0.7659675748814134,
 'eval_recall': 0.7009694424186768,
 'eval_f1': 0.7021264683561136,
 'eval_runtime': 6.3233,
 'eval_samples_per_second': 79.073,
 'eval_steps_per_second': 0.633,
 'epoch': 17.0}

In [69]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_fine.pth")

In [70]:
base.count_parameters(student_model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [71]:
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e34752c070>
self.infer_speed_comp()
  5.52 ms
  1 measurement, 1000 runs , 6 threads


In [72]:
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75e346820b50>
self.infer_speed_comp()
  2.04 ms
  1 measurement, 1000 runs , 6 threads


In [73]:
import time
from torch.utils.data import  DataLoader

base.count_parameters(model)
torch.cuda.synchronize() 
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
device = "cuda"
model.to(device)

train_data.set_format(type="torch", columns=["input_ids"], device="cuda")
test_loader = DataLoader(train_data, batch_size=1, shuffle=False)

timings = []



for i, batch in enumerate(test_loader):
    if i >= 1000:
        break
    torch.cuda.synchronize()
    starter.record()
    with torch.no_grad():
        _ = model(**batch)
    ender.record()
    torch.cuda.synchronize()
    timings.append(starter.elapsed_time(ender))

print(f"Average Inference Time on GPU: {sum(timings) / len(timings):.3f} ms")





timings = []
device = "cpu"
model.to(device)
train_data.set_format(type="torch", columns=["input_ids"], device="cpu")
test_loader = DataLoader(train_data, batch_size=1, shuffle=False)
for i, batch in enumerate(test_loader):
    if i >= 1000:
        break
    start_time = time.perf_counter()
    with torch.no_grad():
        _ = model(**batch)
    end_time = time.perf_counter()
    timings.append((end_time - start_time)*1000)


print(f"Average Inference Time on CPU: {sum(timings) / len(timings):.3f} ms")

model size: 16.539MB.
Total Trainable Params: 1705250.
Average Inference Time on GPU: 1.722 ms
Average Inference Time on CPU: 4.301 ms
