In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import os
import copy

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the Kaggle-hosted GloVe dataset; dataset_download returns the local
# directory holding the files, which is printed for reference.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
# GloVe vectors file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset name interpolated into the data/results/logs/models paths below.
DATASET = "trec"

In [4]:
# Load the pre-built splits from disk. The directory names suggest each
# example carries teacher logits for distillation — TODO confirm against the
# dataset-building code.
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_coarse")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_coarse")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_coarse")

# Augmented training set, used by the "-aug" experiments.
all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_coarse")

# Union of eval + test + augmented-train, used only to build the vocabulary.
# The splits loaded above are reused instead of being read from disk a second
# time; the concatenation order (eval, test, augmented train) is unchanged.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing word-level tokenizer applied to every sentence below.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick the compute device once: CUDA when it is usable, otherwise the CPU,
# and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# Tokenize the "sentence" field of every example, one token list per example.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

# Augmented training set.
all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]

# Combined corpus (eval + test + augmented train) that feeds the vocabulary.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [7]:
# Vocabulary built by the base helper from the combined corpus tokens
# (eval + test + augmented-train sentences).
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map each vocabulary entry to its position in `vocab` (0-based).
word_index = {word: position for position, word in enumerate(vocab)}

In [9]:
# Load the pretrained GloVe vectors from GLOVE_FILE via the base helper
# (the recorded cell output reports how many vectors were found).
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
# Two more rows than the vocabulary size — presumably reserved for
# padding / unknown tokens; TODO confirm in base.get_embedding_matrix.
num_tokens = len(vocab) + 2
# Assumed to match the vector width of the GloVe file (glove.6B.300d).
embedding_dim = 300

8766


In [11]:
# Embedding weights for the vocabulary, assembled by the base helper from the
# GloVe lookup; the recorded output reports converted words vs. misses.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token with its vocabulary index, keeping one id list per
# sentence. A token missing from word_index raises KeyError.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Run every id sequence through base.padd with a target length of 60.
train_padded_data = [base.padd(ids, 60) for ids in train_data_index]
eval_padded_data = [base.padd(ids, 60) for ids in eval_data_index]
test_padded_data = [base.padd(ids, 60) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, 60) for ids in all_train_data_index]

In [14]:
# Attach the padded id sequences to each split as an `input_ids` column.
# NOTE: this rebinds the split names, so re-running the cell without first
# reloading the datasets would attempt to add `input_ids` a second time.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [15]:
# Per-device views of the training split for the latency benchmarks at the end
# of the notebook: only `input_ids` is exposed, as torch tensors on the given
# device, one example per batch and in a fixed order.
# with_format returns a new, independently formatted dataset, so `train_data`
# itself keeps its current format.
train_data_gpu = train_data.with_format(type="torch", columns=["input_ids"], device="cuda")
gpu_data_loader = DataLoader(train_data_gpu, batch_size=1, shuffle=False)

train_data_cpu = train_data.with_format(type="torch", columns=["input_ids"], device="cpu")
cpu_data_loader = DataLoader(train_data_cpu, batch_size=1, shuffle=False)

In [20]:
# Baseline BiLSTM classifier with 6 output classes, built on the GloVe
# embedding matrix.
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [21]:
# Show the layer layout of the baseline model.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)


In [22]:
# Hyper-parameters for the baseline run on the original training split.
# NOTE(review): the paths start with a literal `~`; confirm that
# base.get_training_args expands it (the checkpoint-saving cells call
# os.path.expanduser explicitly).
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_coarse",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_coarse",
    lr=.0045,
    epochs=20,
    warmup_steps=3,
    batch_size=128,
)

In [23]:
# Reset the RNG seeds via the base helper before training.
# NOTE(review): `model` was constructed in an earlier cell, before this call —
# confirm whether its weight initialisation is meant to be covered by the seed.
base.reset_seed()

In [None]:
# Baseline trainer: original training split, validated on the eval split,
# with early stopping (patience 4).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [67]:
# Train the baseline; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0233,0.791474,0.739688,0.671167,0.623271,0.634305
2,0.5148,0.541716,0.80385,0.849865,0.708108,0.715164
3,0.317,0.481392,0.839597,0.82037,0.806192,0.810741
4,0.183,0.530672,0.857012,0.856434,0.821006,0.835248
5,0.1142,0.564497,0.864345,0.819539,0.845392,0.830031
6,0.0875,0.669544,0.858845,0.875485,0.815237,0.835854
7,0.057,0.608056,0.865261,0.876751,0.828236,0.845916
8,0.0302,0.693445,0.863428,0.883666,0.824832,0.847313
9,0.0086,0.732775,0.88176,0.891874,0.841376,0.860853
10,0.0048,0.731493,0.878093,0.88749,0.829321,0.85021


TrainOutput(global_step=525, training_loss=0.15651094475672359, metrics={'train_runtime': 82.6432, 'train_samples_per_second': 1055.38, 'train_steps_per_second': 8.47, 'total_flos': 0.0, 'train_loss': 0.15651094475672359, 'epoch': 15.0})

In [68]:
# Put the baseline in evaluation mode; the cell displays the module repr.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [69]:
# Evaluate the baseline on the test split; the metrics dict is displayed.
trainer.evaluate(test_data)

{'eval_loss': 0.3320004940032959,
 'eval_accuracy': 0.936,
 'eval_precision': 0.9126378531352071,
 'eval_recall': 0.9142122991351692,
 'eval_f1': 0.9132113724742396,
 'eval_runtime': 3.4625,
 'eval_samples_per_second': 144.404,
 'eval_steps_per_second': 1.155,
 'epoch': 15.0}

In [70]:
# Persist the baseline weights under ~/models/<DATASET>/.
# The target directory is created first because torch.save does not create
# missing parent directories.
base_ckpt_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_coarse.pth"
os.makedirs(os.path.dirname(base_ckpt_path), exist_ok=True)
torch.save(model.state_dict(), base_ckpt_path)

In [16]:
# Student network for distillation, built with the same constructor arguments
# as the baseline.
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [72]:
# Hyper-parameters for distillation on the original training split.
# remove_unused_columns=False keeps the extra dataset columns available to the
# trainer; lambda_param / temp are distillation settings whose exact meaning
# is defined in base — TODO confirm there.
# NOTE(review): the paths start with a literal `~`; confirm that
# base.get_training_args expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_coarse",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse",
    lr=.004,
    epochs=20,
    warmup_steps=3,
    weight_decay=.004,
    batch_size=128,
    lambda_param=.1,
    temp=3,
)

In [73]:
# Reset the RNG seeds via the base helper before the distillation run.
# NOTE(review): `student_model` was constructed before this call — confirm
# whether its weight initialisation is meant to be covered by the seed.
base.reset_seed()

In [None]:
# Distillation trainer: student on the original training split, validated on
# the eval split, with early stopping (patience 4).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [75]:
# Train the student; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3507,0.936377,0.751604,0.663102,0.639479,0.642371
2,0.6619,0.752703,0.802933,0.839918,0.753306,0.770711
3,0.4294,0.602888,0.83593,0.878919,0.775714,0.804253
4,0.2756,0.609786,0.853346,0.869071,0.798225,0.822076
5,0.1668,0.514417,0.87626,0.874335,0.837975,0.852335
6,0.0874,0.500827,0.888176,0.896385,0.837412,0.858682
7,0.0548,0.524546,0.88176,0.892463,0.832326,0.853963
8,0.0372,0.523046,0.873511,0.883808,0.825844,0.846594
9,0.0285,0.505586,0.889093,0.898646,0.838648,0.86022
10,0.0255,0.511702,0.883593,0.892654,0.83335,0.854802


TrainOutput(global_step=455, training_loss=0.2449749713415628, metrics={'train_runtime': 58.8677, 'train_samples_per_second': 1481.627, 'train_steps_per_second': 11.891, 'total_flos': 0.0, 'train_loss': 0.2449749713415628, 'epoch': 13.0})

In [76]:
# Put the student in evaluation mode; the cell displays the module repr.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [77]:
# Evaluate the distilled student on the test split; the metrics dict is displayed.
trainer.evaluate(test_data)

{'eval_loss': 0.25151944160461426,
 'eval_accuracy': 0.944,
 'eval_precision': 0.9533480866346036,
 'eval_recall': 0.9174022803907276,
 'eval_f1': 0.9328088314332649,
 'eval_runtime': 3.3127,
 'eval_samples_per_second': 150.935,
 'eval_steps_per_second': 1.207,
 'epoch': 13.0}

In [78]:
# Persist the distilled student weights under ~/models/<DATASET>/.
# The target directory is created first because torch.save does not create
# missing parent directories.
distill_ckpt_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_coarse.pth"
os.makedirs(os.path.dirname(distill_ckpt_path), exist_ok=True)
torch.save(student_model.state_dict(), distill_ckpt_path)

In [79]:
# Fresh baseline classifier for the augmented-data run (rebinds `model`).
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [80]:
# Hyper-parameters for the baseline run on the augmented training set.
# No batch_size is passed here, so the helper's default applies.
# NOTE(review): the paths start with a literal `~`; confirm that
# base.get_training_args expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_coarse",
    lr=.0045,
    weight_decay=.01,
    warmup_steps=22,
    epochs=20,
)

In [81]:
# Reset the RNG seeds via the base helper before the augmented baseline run.
# NOTE(review): `model` was constructed before this call — confirm whether its
# weight initialisation is meant to be covered by the seed.
base.reset_seed()

In [None]:
# Baseline trainer on the augmented training set, validated on the eval split,
# with early stopping (patience 4).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [83]:
# Train the augmented baseline; the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4324,0.500473,0.87901,0.889518,0.838336,0.85799
2,0.0881,0.554305,0.872594,0.847169,0.843123,0.84467
3,0.0422,0.679658,0.885426,0.865688,0.842676,0.852786
4,0.0272,0.739544,0.878093,0.887266,0.827964,0.849042
5,0.0207,0.740285,0.868928,0.881039,0.822143,0.842561


TrainOutput(global_step=1525, training_loss=0.12212660304835585, metrics={'train_runtime': 34.64, 'train_samples_per_second': 22471.128, 'train_steps_per_second': 176.097, 'total_flos': 0.0, 'train_loss': 0.12212660304835585, 'epoch': 5.0})

In [84]:
# Put the augmented baseline in evaluation mode; the cell displays the module repr.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [85]:
# Evaluate the augmented baseline on the test split; the metrics dict is displayed.
trainer.evaluate(test_data)

{'eval_loss': 0.2321968674659729,
 'eval_accuracy': 0.932,
 'eval_precision': 0.94400097533244,
 'eval_recall': 0.9078598893954773,
 'eval_f1': 0.9235188389664111,
 'eval_runtime': 11.4073,
 'eval_samples_per_second': 43.832,
 'eval_steps_per_second': 0.351,
 'epoch': 5.0}

In [86]:
# Persist the augmented-baseline weights under ~/models/<DATASET>/.
# The target directory is created first because torch.save does not create
# missing parent directories.
base_aug_ckpt_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_coarse.pth"
os.makedirs(os.path.dirname(base_aug_ckpt_path), exist_ok=True)
torch.save(model.state_dict(), base_aug_ckpt_path)

In [16]:
# Fresh student network for the augmented-data distillation run (rebinds
# `student_model`).
# NOTE(review): this cell carries the same execution count as the earlier
# student-construction cell; make sure it is re-run so the augmented run does
# not continue from the previously distilled weights.
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [88]:
# Hyper-parameters for distillation on the augmented training set.
# remove_unused_columns=False keeps the extra dataset columns available to the
# trainer; lambda_param / temp are distillation settings whose exact meaning
# is defined in base — TODO confirm there.
# NOTE(review): the paths start with a literal `~`; confirm that
# base.get_training_args expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_coarse",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_coarse",
    lr=.004,
    warmup_steps=16,
    epochs=20,
    batch_size=128,
    lambda_param=1,
    temp=4,
)

In [89]:
# Reset the RNG seeds via the base helper before the augmented distillation run.
# NOTE(review): `student_model` was constructed before this call — confirm
# whether its weight initialisation is meant to be covered by the seed.
base.reset_seed()

In [None]:
# Distillation trainer on the augmented training set, validated on the eval
# split, with early stopping (patience 4).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [91]:
# Train the student on the augmented set; the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3594,0.957379,0.888176,0.898635,0.856489,0.873129
2,0.2844,0.740575,0.905591,0.910574,0.869363,0.886071
3,0.1713,0.727673,0.900092,0.907159,0.856445,0.875885
4,0.1324,0.701846,0.903758,0.907874,0.859341,0.877629
5,0.1123,0.719359,0.895509,0.904026,0.861697,0.878369
6,0.0989,0.697087,0.904675,0.908794,0.860201,0.878677


TrainOutput(global_step=1830, training_loss=0.3597863702826161, metrics={'train_runtime': 44.5602, 'train_samples_per_second': 17468.513, 'train_steps_per_second': 136.894, 'total_flos': 0.0, 'train_loss': 0.3597863702826161, 'epoch': 6.0})

In [92]:
# Put the student in evaluation mode; the cell displays the module repr.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [93]:
# Evaluate the augmented-data student on the test split; the metrics dict is displayed.
trainer.evaluate(test_data)

{'eval_loss': 0.43750298023223877,
 'eval_accuracy': 0.946,
 'eval_precision': 0.9579632596131878,
 'eval_recall': 0.9199566299563388,
 'eval_f1': 0.9356390559837172,
 'eval_runtime': 3.5555,
 'eval_samples_per_second': 140.626,
 'eval_steps_per_second': 1.125,
 'epoch': 6.0}

In [94]:
# Persist the distilled (augmented-data) student weights under ~/models/<DATASET>/.
# Fix: this cell previously saved `model` — the augmented *baseline* — under the
# distilled checkpoint name; the network trained by the DistilTrainer in this
# section is `student_model`.
# The target directory is created first because torch.save does not create
# missing parent directories.
distill_aug_ckpt_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_coarse.pth"
os.makedirs(os.path.dirname(distill_aug_ckpt_path), exist_ok=True)
torch.save(student_model.state_dict(), distill_aug_ckpt_path)

In [17]:
# Report the model size and per-module parameter counts of the student network.
base.count_parameters(student_model)

model size: 16.472MB.
Total Trainable Params: 1687606.


Unnamed: 0,Modules,Parameters
0,lstm.weight_ih_l0,360000
1,lstm.weight_hh_l0,360000
2,lstm.bias_ih_l0,1200
3,lstm.bias_hh_l0,1200
4,lstm.weight_ih_l0_reverse,360000
5,lstm.weight_hh_l0_reverse,360000
6,lstm.bias_ih_l0_reverse,1200
7,lstm.bias_hh_l0_reverse,1200
8,fc1.weight,240000
9,fc1.bias,400


In [18]:
# Time student inference on the CPU using the batch_size=1 CPU loader.
# The trailing 1000 is presumably the number of timed runs (the recorded
# output reports 1000 runs) — confirm in base.BenchMarkRunner.
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x77fc59f93a00>
self.infer_speed_comp()
  3.35 ms
  1 measurement, 1000 runs , 4 threads


In [19]:
# Time student inference on the GPU using the batch_size=1 CUDA loader.
# The trailing 1000 is presumably the number of timed runs (the recorded
# output reports 1000 runs) — confirm in base.BenchMarkRunner.
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x77fc59ff5d80>
self.infer_speed_comp()
  1.72 ms
  1 measurement, 1000 runs , 4 threads
