In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import copy
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the GloVe 6B 300d dataset through kagglehub; the call returns the local
# directory holding the files, which is printed for reference.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [None]:
# Path of the GloVe embedding text file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset name; it is interpolated into every data, results, logs and model path below.
DATASET = "trec"

In [None]:
# Root folder of the pre-computed splits for DATASET. "~" is expanded explicitly
# so the paths do not depend on the loader resolving it.
data_root = os.path.expanduser(f"~/data/{DATASET}")

# Original (non-augmented) train / eval / test splits.
# NOTE(review): the "-logits" suffix suggests each row carries teacher logits
# for distillation — confirm against the script that exported these folders.
train_data = load_from_disk(f"{data_root}/train-logits_fine")
eval_data = load_from_disk(f"{data_root}/eval-logits_fine")
test_data = load_from_disk(f"{data_root}/test-logits_fine")

# Augmented training split.
all_train_data = load_from_disk(f"{data_root}/train-logits-augmented_fine")

# Combined corpus (eval + test + augmented train, in that order) used to build
# the vocabulary. The splits loaded above are reused instead of being read from
# disk a second time.
# NOTE(review): train_data is not part of this concatenation; presumably the
# augmented split already contains its sentences — confirm, because tokens are
# later looked up in the vocabulary with plain dict indexing.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Word-level tokenizer that lower-cases its input.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Use the CUDA device when one is visible to PyTorch, otherwise fall back to CPU.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")
if not cuda_ok:
    print("GPU is not available, using CPU.")
else:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))

GPU is available and will be used: NVIDIA H100 PCIe


In [6]:
def tokenize_sentences(dataset):
    """Return one token list per row of `dataset`, tokenizing its "sentence" field.

    Uses the notebook-level `tokenizer`; rows are processed in dataset order.
    """
    return [tokenizer.tokenize(row["sentence"]) for row in dataset]


train_data_tokens = tokenize_sentences(train_data)
eval_data_tokens = tokenize_sentences(eval_data)
test_data_tokens = tokenize_sentences(test_data)

all_train_data_tokens = tokenize_sentences(all_train_data)

# all_data is the concatenation eval + test + augmented train, so its tokens are
# exactly the three lists above in that order; reuse them instead of tokenizing
# every one of those sentences a second time.
all_data_tokens = eval_data_tokens + test_data_tokens + all_train_data_tokens
# Guard against all_data being redefined with a different composition.
assert len(all_data_tokens) == len(all_data), (
    "all_data no longer matches eval + test + augmented train"
)

In [7]:
# Build the vocabulary from the tokens of the combined corpus via the project
# helper. NOTE(review): ordering and any filtering or special tokens are decided
# inside base.get_vocab and are not visible here.
vocab = base.get_vocab(all_data_tokens)

In [None]:
# Map every vocabulary entry to its position in `vocab` (0-based).
# NOTE(review): indices start at 0 while num_tokens reserves two extra rows —
# confirm which ids base.padd uses for padding so they do not collide with a word.
word_index = {word: position for position, word in enumerate(vocab)}

In [9]:
# Load the pre-trained GloVe vectors from GLOVE_FILE through the project helper;
# the result is consumed by base.get_embedding_matrix.
# NOTE(review): presumably a word -> vector mapping — confirm in base.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [None]:
# Width of each embedding vector; matches the 300d GloVe file loaded above.
embedding_dim = 300
# Number of rows of the embedding table: the vocabulary plus two extra slots.
# NOTE(review): the two extra rows are presumably reserved for padding/unknown
# tokens — confirm against base.padd and base.get_embedding_matrix.
num_tokens = 2 + len(vocab)
print(len(vocab))

8766


In [11]:
# Assemble the (num_tokens x embedding_dim) embedding table for the model from
# the GloVe vectors, using word_index to place each word; the helper reports how
# many words were converted and how many were missing.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token with its integer id from word_index, sentence by sentence.
# A token missing from word_index raises KeyError (plain dict lookup).
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Target sequence length handed to base.padd for every split.
# NOTE(review): whether longer sequences are truncated, and which id is used for
# padding, is decided inside base.padd — confirm there.
PAD_LENGTH = 60

train_padded_data = [base.padd(ids, PAD_LENGTH) for ids in train_data_index]
eval_padded_data = [base.padd(ids, PAD_LENGTH) for ids in eval_data_index]
test_padded_data = [base.padd(ids, PAD_LENGTH) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, PAD_LENGTH) for ids in all_train_data_index]

In [14]:
# Attach the padded id sequences to each split as an "input_ids" column.
# add_column returns a new dataset, so the four names are rebound to the results.
train_data, eval_data, test_data, all_train_data = [
    split.add_column("input_ids", padded_ids)
    for split, padded_ids in (
        (train_data, train_padded_data),
        (eval_data, eval_padded_data),
        (test_data, test_padded_data),
        (all_train_data, all_train_padded_data),
    )
]

In [None]:
# Two torch-formatted copies of the training split that expose only "input_ids":
# one yields CUDA tensors, the other CPU tensors. with_format returns a formatted
# copy, so train_data itself keeps its current format.
train_data_gpu = train_data.with_format(type="torch", columns=["input_ids"], device="cuda")
train_data_cpu = train_data.with_format(type="torch", columns=["input_ids"], device="cpu")

# Single-example, fixed-order loaders passed to base.BenchMarkRunner below.
gpu_data_loader = DataLoader(train_data_gpu, batch_size=1, shuffle=False)
cpu_data_loader = DataLoader(train_data_cpu, batch_size=1, shuffle=False)


In [61]:
# Baseline BiLSTM classifier built on the GloVe embedding table, with 50 outputs.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
)

In [62]:
# Show the layer structure of the freshly built model.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)


In [63]:
# Training configuration for the baseline run on the original training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_fine",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_fine",
    epochs=15,
    batch_size=128,
    lr=0.005,
    weight_decay=0.002,
    warmup_steps=2,
)

In [64]:
# Re-seed through the project helper right before building the trainer.
# NOTE(review): presumably this makes the run reproducible; the seed value and
# the libraries it covers live in base.reset_seed.
base.reset_seed()

In [65]:
# Stop training once the monitored metric has failed to improve for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline trainer: original training split, validated on the eval split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [66]:
# Train the baseline model; the per-epoch table and TrainOutput are shown below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5796,1.895323,0.516957,0.201717,0.179139,0.166014
2,1.4973,1.379462,0.666361,0.317662,0.299932,0.293096
3,1.0004,1.157564,0.71769,0.385714,0.371105,0.369859
4,0.626,1.080071,0.727773,0.550041,0.480927,0.49295
5,0.3528,1.094388,0.75802,0.599743,0.536896,0.554457
6,0.1714,1.213981,0.778185,0.635408,0.606574,0.606317
7,0.08,1.314857,0.783685,0.672996,0.622436,0.63562
8,0.0352,1.415231,0.784601,0.687267,0.628458,0.642874
9,0.0173,1.298808,0.79835,0.700102,0.647338,0.660204
10,0.0072,1.38002,0.800183,0.700119,0.642714,0.658627


TrainOutput(global_step=490, training_loss=0.4553474649862975, metrics={'train_runtime': 65.6674, 'train_samples_per_second': 996.156, 'train_steps_per_second': 7.995, 'total_flos': 0.0, 'train_loss': 0.4553474649862975, 'epoch': 14.0})

In [67]:
# Put the model in evaluation mode (disables dropout) before testing and benchmarking.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [68]:
# Score the baseline on the test split; metrics are reported under the "eval_" prefix.
trainer.evaluate(test_data)

{'eval_loss': 1.0148743391036987,
 'eval_accuracy': 0.822,
 'eval_precision': 0.7099063905070399,
 'eval_recall': 0.6618411811215629,
 'eval_f1': 0.666502175489911,
 'eval_runtime': 3.3628,
 'eval_samples_per_second': 148.688,
 'eval_steps_per_second': 1.19,
 'epoch': 14.0}

In [69]:
# Persist only the weights (state_dict) of the baseline model.
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_fine.pth"
# torch.save does not create missing parent folders; make sure the target
# directory exists so a finished training run is not lost to a missing path.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model.state_dict(), weights_path)

In [70]:
# Report the model size and the per-module trainable parameter counts.
base.count_parameters(model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [71]:
# Time inference of the baseline model on CPU using the single-example CPU loader.
# The last argument (1000) shows up as the number of runs in the printed measurement.
cpu_benchmark = base.BenchMarkRunner(model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75183187f0a0>
self.infer_speed_comp()
  6.04 ms
  1 measurement, 1000 runs , 4 threads


In [72]:
# Time inference of the baseline model on CUDA using the single-example GPU loader.
# The last argument (1000) shows up as the number of runs in the printed measurement.
gpu_benchmark = base.BenchMarkRunner(model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75183182bdf0>
self.infer_speed_comp()
  2.16 ms
  1 measurement, 1000 runs , 4 threads


In [93]:
# Student network for the distillation run; same architecture as the baseline.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
)

In [94]:
# Training configuration for distillation on the original training split.
# NOTE(review): lambda_param and temp are presumably the distillation loss weight
# and softmax temperature, and remove_unused_columns=False presumably keeps the
# teacher-logit column in each batch — confirm in base.DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_fine",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine",
    remove_unused_columns=False,
    epochs=20,
    batch_size=128,
    lr=0.005,
    weight_decay=0.009,
    warmup_steps=4,
    lambda_param=0.6,
    temp=6,
)

In [95]:
# Re-seed through the project helper before the distillation run.
# NOTE(review): seed value and covered libraries are defined in base.reset_seed.
base.reset_seed()

In [96]:
# Stop training once the monitored metric has failed to improve for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation trainer: student trained on the original training split.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [97]:
# Train the distilled student; the per-epoch table and TrainOutput are shown below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5899,1.181036,0.474794,0.106221,0.123948,0.097965
2,1.0301,0.888245,0.631531,0.234494,0.25494,0.235581
3,0.7275,0.693991,0.718607,0.368497,0.360014,0.354463
4,0.5368,0.619837,0.752521,0.398146,0.394519,0.391347
5,0.3848,0.557465,0.773602,0.466319,0.443512,0.44769
6,0.2869,0.557899,0.769936,0.501812,0.452929,0.465692
7,0.2208,0.513788,0.797434,0.591454,0.522804,0.543003
8,0.1674,0.510017,0.799267,0.635664,0.5588,0.580133
9,0.1283,0.495336,0.807516,0.661863,0.570878,0.601575
10,0.1036,0.48207,0.815765,0.711728,0.635348,0.660211


TrainOutput(global_step=665, training_loss=0.3081107985704465, metrics={'train_runtime': 88.9483, 'train_samples_per_second': 980.569, 'train_steps_per_second': 7.87, 'total_flos': 0.0, 'train_loss': 0.3081107985704465, 'epoch': 19.0})

In [98]:
# Put the student in evaluation mode (disables dropout) before testing and benchmarking.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [99]:
# Score the distilled student on the test split; metrics use the "eval_" prefix.
trainer.evaluate(test_data)

{'eval_loss': 0.41698408126831055,
 'eval_accuracy': 0.822,
 'eval_precision': 0.7468531755825939,
 'eval_recall': 0.6641065816458944,
 'eval_f1': 0.6775623521158944,
 'eval_runtime': 3.4288,
 'eval_samples_per_second': 145.826,
 'eval_steps_per_second': 1.167,
 'epoch': 19.0}

In [80]:
# Persist only the weights (state_dict) of the distilled student.
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_fine.pth"
# torch.save does not create missing parent folders; make sure the target
# directory exists so a finished training run is not lost to a missing path.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(student_model.state_dict(), weights_path)

In [81]:
# Report the student's size and per-module trainable parameter counts.
base.count_parameters(student_model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [82]:
# Time inference of the distilled student on CPU using the single-example CPU loader.
# The last argument (1000) shows up as the number of runs in the printed measurement.
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75183187c1f0>
self.infer_speed_comp()
  3.33 ms
  1 measurement, 1000 runs , 4 threads


In [83]:
# Time inference of the distilled student on CUDA using the single-example GPU loader.
# The last argument (1000) shows up as the number of runs in the printed measurement.
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x7518319d5e40>
self.infer_speed_comp()
  1.83 ms
  1 measurement, 1000 runs , 4 threads


In [84]:
# Fresh baseline network for the augmented-data run; rebinds `model`, so the
# earlier baseline is only available through its saved weights from here on.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
)

In [85]:
# Training configuration for the baseline run on the augmented training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_fine",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_fine",
    epochs=20,
    batch_size=128,
    lr=0.005,
    weight_decay=0.009,
    warmup_steps=49,
)

In [86]:
# Re-seed through the project helper before the augmented baseline run.
# NOTE(review): seed value and covered libraries are defined in base.reset_seed.
base.reset_seed()

In [87]:
# Stop training once the monitored metric has failed to improve for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline trainer on the augmented training split, validated on the eval split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [88]:
# Train the augmented-data baseline; the per-epoch table and TrainOutput are shown below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6644,1.046777,0.802933,0.745181,0.692226,0.704679
2,0.0605,1.180362,0.806599,0.751741,0.699872,0.704943
3,0.0297,1.505155,0.805683,0.713317,0.686981,0.687996
4,0.0253,1.47218,0.820348,0.785614,0.720416,0.73406
5,0.0251,1.732541,0.808433,0.793205,0.708483,0.734408
6,0.0133,1.702512,0.814849,0.748111,0.677796,0.697886
7,0.011,1.795035,0.820348,0.810556,0.714336,0.741775
8,0.013,1.755565,0.819432,0.781014,0.710225,0.72996
9,0.0107,1.878143,0.819432,0.813092,0.71392,0.745391
10,0.0083,1.901936,0.815765,0.795357,0.717573,0.742134


TrainOutput(global_step=8891, training_loss=0.05143660313466874, metrics={'train_runtime': 202.976, 'train_samples_per_second': 6588.365, 'train_steps_per_second': 51.533, 'total_flos': 0.0, 'train_loss': 0.05143660313466874, 'epoch': 17.0})

In [89]:
# Put the augmented-data baseline in evaluation mode (disables dropout).
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [92]:
# Score the augmented-data baseline on the test split; metrics use the "eval_" prefix.
trainer.evaluate(test_data)

{'eval_loss': 1.759264349937439,
 'eval_accuracy': 0.836,
 'eval_precision': 0.7059850941559144,
 'eval_recall': 0.6690233050598904,
 'eval_f1': 0.6735333580584322,
 'eval_runtime': 3.1694,
 'eval_samples_per_second': 157.76,
 'eval_steps_per_second': 1.262,
 'epoch': 17.0}

In [46]:
# Persist only the weights (state_dict) of the augmented-data baseline.
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_fine.pth"
# torch.save does not create missing parent folders; make sure the target
# directory exists so a finished training run is not lost to a missing path.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model.state_dict(), weights_path)

In [47]:
# Report the augmented-data baseline's size and per-module trainable parameter counts.
base.count_parameters(model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [48]:
# CPU latency benchmark for the augmented-data baseline.
# Benchmarks `model` — the network trained, saved and counted in this section —
# rather than `student_model`, which belongs to the distillation experiments.
# The last argument (1000) shows up as the number of runs in the printed measurement.
cpu_benchmark = base.BenchMarkRunner(model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75183182a3e0>
self.infer_speed_comp()
  6.05 ms
  1 measurement, 1000 runs , 4 threads


In [49]:
# CUDA latency benchmark for the augmented-data baseline.
# Benchmarks `model` — the network trained, saved and counted in this section —
# rather than `student_model`, which belongs to the distillation experiments.
# The last argument (1000) shows up as the number of runs in the printed measurement.
gpu_benchmark = base.BenchMarkRunner(model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x7518319d5c00>
self.infer_speed_comp()
  2.14 ms
  1 measurement, 1000 runs , 4 threads


In [50]:
# Fresh student network for distillation on the augmented split; rebinds
# `student_model`, replacing the student from the earlier distillation run.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
)

In [51]:
# Training configuration for distillation on the augmented training split.
# NOTE(review): lambda_param and temp are presumably the distillation loss weight
# and softmax temperature, and remove_unused_columns=False presumably keeps the
# teacher-logit column in each batch — confirm in base.DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_fine",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_fine",
    remove_unused_columns=False,
    epochs=20,
    batch_size=128,
    lr=0.0045,
    weight_decay=0.01,
    warmup_steps=42,
    lambda_param=0.5,
    temp=2,
)

In [52]:
# Re-seed through the project helper before the augmented distillation run.
# NOTE(review): seed value and covered libraries are defined in base.reset_seed.
base.reset_seed()

In [53]:
# Stop training once the monitored metric has failed to improve for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation trainer: student trained on the augmented training split.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [54]:
# Train the student on the augmented split; the per-epoch table and TrainOutput are shown below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6783,0.621786,0.83593,0.819613,0.710978,0.747358
2,0.1502,0.602439,0.829514,0.749696,0.674669,0.699273
3,0.117,0.609918,0.84143,0.825495,0.721298,0.753385
4,0.1065,0.58391,0.83593,0.830465,0.709159,0.751163
5,0.1024,0.62372,0.834097,0.805652,0.713997,0.745858
6,0.0998,0.63539,0.824015,0.836893,0.710372,0.755673
7,0.0985,0.625105,0.830431,0.804662,0.706594,0.73845
8,0.0955,0.607409,0.834097,0.821349,0.725821,0.757971
9,0.0917,0.613063,0.83593,0.827569,0.718591,0.757031
10,0.09,0.568588,0.846013,0.836843,0.738641,0.771468


TrainOutput(global_step=10460, training_loss=0.12244620314063807, metrics={'train_runtime': 241.4593, 'train_samples_per_second': 5538.325, 'train_steps_per_second': 43.32, 'total_flos': 0.0, 'train_loss': 0.12244620314063807, 'epoch': 20.0})

In [55]:
# Put the augmented-data student in evaluation mode (disables dropout).
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [56]:
# Score the augmented-data student on the test split; metrics use the "eval_" prefix.
trainer.evaluate(test_data)

{'eval_loss': 0.46995213627815247,
 'eval_accuracy': 0.86,
 'eval_precision': 0.8016732618616697,
 'eval_recall': 0.7517333803440076,
 'eval_f1': 0.7579899237628692,
 'eval_runtime': 3.8775,
 'eval_samples_per_second': 128.949,
 'eval_steps_per_second': 1.032,
 'epoch': 20.0}

In [57]:
# Persist only the weights (state_dict) of the augmented-data student.
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_fine.pth"
# torch.save does not create missing parent folders; make sure the target
# directory exists so a finished training run is not lost to a missing path.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(student_model.state_dict(), weights_path)

In [58]:
# Report the augmented-data student's size and per-module trainable parameter counts.
base.count_parameters(student_model)

model size: 16.539MB.
Total Trainable Params: 1705250.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [59]:
# Time inference of the augmented-data student on CPU using the single-example CPU loader.
# The last argument (1000) shows up as the number of runs in the printed measurement.
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x75183187e3e0>
self.infer_speed_comp()
  5.39 ms
  1 measurement, 1000 runs , 4 threads


In [60]:
# Time inference of the augmented-data student on CUDA using the single-example GPU loader.
# The last argument (1000) shows up as the number of runs in the printed measurement.
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x751831909b70>
self.infer_speed_comp()
  2.17 ms
  1 measurement, 1000 runs , 4 threads
