In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import os
import copy

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the GloVe 6B 300-d dataset through kagglehub. The returned value is the
# local directory that holds the dataset files; it is printed for reference.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [None]:
# Path of the GloVe text file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset name; used as a path component for the data, results, logs and model files.
DATASET = "trec"

In [4]:
# Directory that holds every on-disk split of the selected dataset.
data_root = f"~/data/{DATASET}"

# Original train / eval / test splits.
train_data = load_from_disk(f"{data_root}/train-logits_fine")
eval_data = load_from_disk(f"{data_root}/eval-logits_fine")
test_data = load_from_disk(f"{data_root}/test-logits_fine")

# Augmented training split.
all_train_data = load_from_disk(f"{data_root}/train-logits-augmented_fine")

# Corpus the vocabulary is built from: eval + test + augmented train. The splits
# loaded above are reused instead of being read from disk a second time.
# NOTE(review): train_data is not part of this corpus, so every token of the
# original train split must also occur in one of these three — presumably the
# augmented split contains the original sentences; confirm.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing word tokenizer applied to every sentence.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick the compute device: CUDA when torch reports one, otherwise the CPU,
# and print which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# Tokenise the "sentence" field of every example, one token list per example.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

# Augmented training split.
all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]

# Combined corpus that the vocabulary is derived from.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [7]:
# Build the vocabulary from the token lists of the combined corpus (all_data_tokens).
vocab = base.get_vocab(all_data_tokens)

In [None]:
# Map each vocabulary entry to its position in `vocab` (positions start at 0).
word_index = {token: position for position, token in enumerate(vocab)}

In [9]:
# Read the pre-trained GloVe vectors from GLOVE_FILE; running the cell prints how
# many word vectors were found.
# NOTE(review): presumably a word -> vector mapping; confirm in base.get_embeddings_indeces.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
# Show the vocabulary size.
print(len(vocab))
# Number of rows in the embedding table: two more than the vocabulary size.
# NOTE(review): the two extra rows are presumably reserved for padding / unknown
# tokens — confirm against base.get_embedding_matrix and base.padd.
num_tokens = len(vocab) + 2
# Width of each embedding vector; matches the 300-d GloVe file in GLOVE_FILE.
embedding_dim = 300

8766


In [11]:
# Assemble the (num_tokens x embedding_dim) embedding matrix from the GloVe vectors
# for the words in word_index; running the cell prints how many words were
# converted and how many were misses.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token by its integer id from word_index. A token missing from
# word_index raises KeyError.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

# Augmented training split.
all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Run every id sequence through base.padd with a length argument of 60.
# NOTE(review): word_index ids start at 0 — verify that the fill value used by
# base.padd does not collide with a real vocabulary id.
train_padded_data = [base.padd(ids, 60) for ids in train_data_index]
eval_padded_data = [base.padd(ids, 60) for ids in eval_data_index]
test_padded_data = [base.padd(ids, 60) for ids in test_data_index]

# Augmented training split.
all_train_padded_data = [base.padd(ids, 60) for ids in all_train_data_index]

In [14]:
# Attach the padded id sequences to each split as an "input_ids" column.
# NOTE(review): each name is rebound to the extended dataset, so re-running this
# cell without re-running the loading cell will likely fail because the column
# already exists.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

# Augmented training split.
all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [15]:
# Single-example, unshuffled loaders over the training split, used by the
# inference benchmarks. Each is built on its own copy of train_data that
# exposes only "input_ids" as torch tensors on the named device, so train_data
# itself keeps its format.
train_data_gpu = train_data.with_format(type="torch", columns=["input_ids"], device="cuda")
gpu_data_loader = DataLoader(train_data_gpu, batch_size=1, shuffle=False)

train_data_cpu = train_data.with_format(type="torch", columns=["input_ids"], device="cpu")
cpu_data_loader = DataLoader(train_data_cpu, batch_size=1, shuffle=False)

In [16]:
# Baseline BiLSTM classifier initialised from the GloVe embedding matrix.
# freeze_embed=False is passed, and the embedding weights are listed among the
# trainable parameters in the parameter report further down.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [17]:
# Print the module tree of the baseline model as a sanity check of its layer sizes.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)


In [18]:
# Training configuration for the baseline (non-distilled) BiLSTM run.
# NOTE(review): "~" is passed unexpanded in both directories — confirm that
# base.get_training_args (or whatever consumes these paths) expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-embedd_fine",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-embedd_fine",
    lr=.0045,
    weight_decay=0.001,
    warmup_steps=2,
    epochs=20,
)

In [None]:
# Reset the random seed(s) right before the baseline training run.
# NOTE(review): `model` is constructed in an earlier cell, so its weight
# initialisation is not covered by this reseed — confirm that is intended.
base.reset_seed()

In [20]:
# Baseline run: plain Trainer on the original training split, validated on the
# eval split, with early stopping at a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [21]:
# Train the baseline model; the cell output is the per-epoch metrics table
# followed by the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6393,1.913013,0.491292,0.159073,0.16506,0.136266
2,1.4513,1.342894,0.670944,0.313783,0.318221,0.304136
3,0.7944,1.06909,0.750687,0.494381,0.455978,0.455935
4,0.3421,1.166491,0.779102,0.619054,0.53783,0.555848
5,0.1218,1.288387,0.773602,0.662064,0.6605,0.641713
6,0.0487,1.370785,0.780018,0.653505,0.64059,0.630419
7,0.0173,1.401151,0.777269,0.697924,0.683736,0.672125
8,0.0082,1.453418,0.789184,0.688555,0.688764,0.671748
9,0.0021,1.444767,0.8011,0.696339,0.706155,0.690961
10,0.0009,1.469408,0.799267,0.704121,0.705084,0.692603


TrainOutput(global_step=525, training_loss=0.36190190801840455, metrics={'train_runtime': 71.1507, 'train_samples_per_second': 1225.849, 'train_steps_per_second': 9.838, 'total_flos': 0.0, 'train_loss': 0.36190190801840455, 'epoch': 15.0})

In [22]:
# Switch the baseline model to evaluation mode (this disables its dropout layer);
# eval() returns the module, which is why the cell displays its repr.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [23]:
# Score the baseline model on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 1.3303099870681763,
 'eval_accuracy': 0.808,
 'eval_precision': 0.6370581076815878,
 'eval_recall': 0.6616504636649941,
 'eval_f1': 0.6334041083600327,
 'eval_runtime': 3.8279,
 'eval_samples_per_second': 130.621,
 'eval_steps_per_second': 1.045,
 'epoch': 15.0}

In [24]:
# Persist the baseline model's weights under the user's home directory.
baseline_weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-embedd_fine.pth"
torch.save(model.state_dict(), baseline_weights_path)

In [25]:
# Student for the distillation run; built with the same arguments as the baseline model.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [26]:
# Training configuration for the distillation run on the original training split.
# remove_unused_columns=False keeps the dataset's extra columns available to the
# trainer (presumably the stored teacher logits — confirm in base.DistilTrainer).
# NOTE(review): "~" is passed unexpanded in both directories — confirm that
# base.get_training_args (or whatever consumes these paths) expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-embedd_fine",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-embedd_fine",
    lr=.005,
    weight_decay=.007,
    epochs=20,
    lambda_param=.4,
    temp=6.5,
)

In [27]:
# Reset the random seed(s) right before the distillation training run.
# NOTE(review): `student_model` is constructed in an earlier cell, so its weight
# initialisation is not covered by this reseed — confirm that is intended.
base.reset_seed()

In [28]:
# Distillation run: base.DistilTrainer trains the student on the original
# training split, validated on the eval split, with early stopping at a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [29]:
# Train the distilled student; the cell output is the per-epoch metrics table
# followed by the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7812,1.400432,0.520623,0.15679,0.170152,0.15141
2,1.0556,0.960257,0.686526,0.315995,0.308748,0.304696
3,0.6322,0.765697,0.756187,0.448733,0.410224,0.407158
4,0.368,0.705245,0.776352,0.530282,0.504098,0.505777
5,0.2222,0.66052,0.797434,0.618927,0.617819,0.610857
6,0.1423,0.639446,0.80385,0.720509,0.683139,0.689776
7,0.1047,0.628252,0.807516,0.755707,0.701242,0.712092
8,0.0857,0.627174,0.802933,0.719695,0.677646,0.685046
9,0.0771,0.611835,0.812099,0.778042,0.709264,0.727893
10,0.0729,0.614817,0.814849,0.774777,0.70791,0.725702


TrainOutput(global_step=420, training_loss=0.3899246766453698, metrics={'train_runtime': 51.2518, 'train_samples_per_second': 1701.794, 'train_steps_per_second': 13.658, 'total_flos': 0.0, 'train_loss': 0.3899246766453698, 'epoch': 12.0})

In [30]:
# Switch the distilled student to evaluation mode (this disables its dropout layer);
# eval() returns the module, which is why the cell displays its repr.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [31]:
# Score the distilled student on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 0.5733435750007629,
 'eval_accuracy': 0.832,
 'eval_precision': 0.7266360455109132,
 'eval_recall': 0.7010663597369853,
 'eval_f1': 0.6982661452601435,
 'eval_runtime': 3.0744,
 'eval_samples_per_second': 162.633,
 'eval_steps_per_second': 1.301,
 'epoch': 12.0}

In [32]:
# Persist the distilled student's weights under the user's home directory.
distill_weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-embedd_fine.pth"
torch.save(student_model.state_dict(), distill_weights_path)

In [33]:
# Fresh baseline classifier for the augmented-data run. This rebinds `model`,
# replacing the first baseline instance.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [34]:
# Training configuration for the baseline run on the augmented training split.
# NOTE(review): "~" is passed unexpanded in both directories — confirm that
# base.get_training_args (or whatever consumes these paths) expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-embedd-aug_fine",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-embedd-aug_fine",
    lr=0.0045,
    weight_decay=0.003,
    warmup_steps=10,
    epochs=20,
)

In [35]:
# Reset the random seed(s) right before the augmented-data baseline run.
# NOTE(review): `model` is constructed in an earlier cell, so its weight
# initialisation is not covered by this reseed — confirm that is intended.
base.reset_seed()

In [36]:
# Augmented baseline run: plain Trainer on the augmented training split,
# validated on the eval split, with early stopping at a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [37]:
# Train the baseline model on the augmented split; the cell output is the
# per-epoch metrics table followed by the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.476,1.164467,0.813932,0.813513,0.771636,0.777388
2,0.0259,1.344479,0.804766,0.767075,0.71837,0.725638
3,0.013,1.619465,0.819432,0.771845,0.756855,0.750266
4,0.0172,1.80422,0.808433,0.773598,0.729012,0.736796


TrainOutput(global_step=2112, training_loss=0.13303653873277432, metrics={'train_runtime': 36.8322, 'train_samples_per_second': 36694.003, 'train_steps_per_second': 286.706, 'total_flos': 0.0, 'train_loss': 0.13303653873277432, 'epoch': 4.0})

In [38]:
# Switch the augmented-data baseline to evaluation mode (this disables its dropout
# layer); eval() returns the module, which is why the cell displays its repr.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [39]:
# Score the augmented-data baseline on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 1.2820587158203125,
 'eval_accuracy': 0.81,
 'eval_precision': 0.6862247210506169,
 'eval_recall': 0.6979972371801639,
 'eval_f1': 0.6713877154091303,
 'eval_runtime': 3.2713,
 'eval_samples_per_second': 152.844,
 'eval_steps_per_second': 1.223,
 'epoch': 4.0}

In [40]:
# Persist the augmented-data baseline's weights under the user's home directory.
baseline_aug_weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-embedd-aug_fine.pth"
torch.save(model.state_dict(), baseline_aug_weights_path)

In [41]:
# Fresh student for the augmented-data distillation run. This rebinds
# `student_model`, replacing the first student instance.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [42]:
# Training configuration for the distillation run on the augmented training split.
# remove_unused_columns=False keeps the dataset's extra columns available to the
# trainer (presumably the stored teacher logits — confirm in base.DistilTrainer).
# NOTE(review): "~" is passed unexpanded in both directories — confirm that
# base.get_training_args (or whatever consumes these paths) expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-embedd-aug_fine",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-embedd-aug_fine",
    lr=0.0045,
    weight_decay=0.002,
    warmup_steps=45,
    epochs=20,
    lambda_param=.75,
    temp=3,
)

In [43]:
# Reset the random seed(s) right before the augmented-data distillation run.
# NOTE(review): `student_model` is constructed in an earlier cell, so its weight
# initialisation is not covered by this reseed — confirm that is intended.
base.reset_seed()

In [44]:
# Augmented distillation run: base.DistilTrainer trains the student on the
# augmented training split, validated on the eval split, with early stopping
# at a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [45]:
# Train the student on the augmented split; the cell output is the per-epoch
# metrics table followed by the TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3972,0.431343,0.829514,0.759043,0.682606,0.707899
2,0.0866,0.446704,0.824015,0.823294,0.74653,0.771076
3,0.0753,0.436021,0.832264,0.818126,0.746909,0.767275
4,0.0715,0.424255,0.837764,0.825741,0.759657,0.777921
5,0.0677,0.440074,0.832264,0.850235,0.766339,0.794497
6,0.0662,0.444335,0.832264,0.845453,0.767867,0.793365
7,0.0648,0.43599,0.828598,0.842785,0.76018,0.789763
8,0.0627,0.442105,0.824015,0.841279,0.746768,0.779926


TrainOutput(global_step=4224, training_loss=0.11150466131441521, metrics={'train_runtime': 75.4135, 'train_samples_per_second': 17921.468, 'train_steps_per_second': 140.028, 'total_flos': 0.0, 'train_loss': 0.11150466131441521, 'epoch': 8.0})

In [46]:
# Switch the augmented-data student to evaluation mode (this disables its dropout
# layer); eval() returns the module, which is why the cell displays its repr.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [47]:
# Score the augmented-data student on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 0.36612752079963684,
 'eval_accuracy': 0.85,
 'eval_precision': 0.7502428182961576,
 'eval_recall': 0.7270851677785881,
 'eval_f1': 0.7186729878444921,
 'eval_runtime': 4.1422,
 'eval_samples_per_second': 120.709,
 'eval_steps_per_second': 0.966,
 'epoch': 8.0}

In [48]:
# Persist the weights of the student distilled on the augmented split.
# The state dict must come from `student_model`: `model` is the augmented-data
# baseline, whose weights are already stored in bilstm-base-embedd-aug_fine.pth,
# and saving it here would put baseline weights under the distilled model's name.
distill_aug_weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-embedd-aug_fine.pth"
torch.save(student_model.state_dict(), distill_aug_weights_path)

In [49]:
# Report the size and per-module trainable parameter counts of the current
# student (the one trained on the augmented split).
base.count_parameters(student_model)

model size: 16.539MB.
Total Trainable Params: 4335650.


Unnamed: 0,Modules,Parameters
0,embedding.weight,2630400
1,lstm.weight_ih_l0,360000
2,lstm.weight_hh_l0,360000
3,lstm.bias_ih_l0,1200
4,lstm.bias_hh_l0,1200
5,lstm.weight_ih_l0_reverse,360000
6,lstm.weight_hh_l0_reverse,360000
7,lstm.bias_ih_l0_reverse,1200
8,lstm.bias_hh_l0_reverse,1200
9,fc1.weight,240000


In [50]:
# Benchmark inference of the current student using the single-example CPU loader.
# The value 1000 is forwarded to BenchMarkRunner; the printed measurement reports 1000 runs.
# NOTE(review): the student was last used by a trainer on a GPU-enabled machine —
# confirm that BenchMarkRunner moves the model to the requested device.
cpu_benchmark = base.BenchMarkRunner(student_model, cpu_data_loader, "cpu", 1000)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x78034cfaaf20>
self.infer_speed_comp()
  3.46 ms
  1 measurement, 1000 runs , 4 threads


In [51]:
# Benchmark inference of the current student using the single-example CUDA loader.
# The value 1000 is forwarded to BenchMarkRunner; the printed measurement reports 1000 runs.
gpu_benchmark = base.BenchMarkRunner(student_model, gpu_data_loader, "cuda", 1000)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x7804f50a9c30>
self.infer_speed_comp()
  1.73 ms
  1 measurement, 1000 runs , 4 threads
