In [2]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Download the GloVe 6B 300-d vectors through kagglehub and print the local
# directory they were placed in; GLOVE_FILE is built from this path.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [4]:
# Path of the GloVe text file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset name; interpolated into every data/results/logs/models path below.
DATASET = "trec"

In [5]:
# Base splits. NOTE(review): the "-logits" directory names suggest every row
# also carries teacher logits for distillation — confirm against the export step.
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_fine")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_fine")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_fine")

# Augmented training split used by the "-aug" experiments.
all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_fine")

# Corpus the vocabulary is built from: eval + test + augmented train.
# The already-loaded datasets are reused instead of reading the same three
# directories from disk a second time.
# NOTE(review): the plain train split is not included here; the later
# word_index lookup assumes all of its tokens also occur in the augmented
# split — confirm.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing whitespace/punctuation tokenizer applied to every sentence.
tokenizer = BasicTokenizer(do_lower_case=True)

In [6]:
# Pick the compute device once: CUDA when present, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice (and the GPU model when one is used).
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [7]:
# Tokenize the "sentence" field of every row, one token list per example.
train_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in train_data]
eval_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in eval_data]
test_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in test_data]

# Augmented training split.
all_train_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in all_train_data]

# Combined corpus; its tokens are the input to the vocabulary builder.
all_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in all_data]

In [8]:
# Build the vocabulary from the tokens of the combined corpus (all_data).
# The exact ordering/deduplication rules live in base.get_vocab.
vocab = base.get_vocab(all_data_tokens)

In [9]:
# Map every vocabulary entry to its position in `vocab` (first entry -> 0).
# NOTE(review): indices start at 0 while num_tokens reserves two extra rows;
# confirm which index base.padd uses for padding so it cannot collide with a
# real word.
word_index = {word: position for position, word in enumerate(vocab)}

In [10]:
# Read the pretrained GloVe vectors from GLOVE_FILE.
# presumably returns a word -> vector mapping; see base.get_embeddings_indeces.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [11]:
print(len(vocab))
# Embedding table gets two rows more than the vocabulary size.
# NOTE(review): presumably reserved for padding / unknown tokens — confirm
# against base.padd and base.get_embedding_matrix.
num_tokens = len(vocab) + 2
# Must match the dimensionality of the vectors in GLOVE_FILE (glove.6B.300d).
embedding_dim = 300

8766


In [12]:
# Assemble the (num_tokens x embedding_dim) initial embedding weights from the
# GloVe vectors, using word_index to place each word's row. How words without
# a pretrained vector are filled is decided inside base.get_embedding_matrix.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [13]:
# Replace every token by its vocabulary index. A token missing from
# word_index raises KeyError, so every split must be covered by the vocabulary.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

# Augmented training split.
all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [14]:
# Run every index sequence through base.padd with a target length of 60.
# (Padding value and handling of longer inputs are defined in base.padd.)
train_padded_data = [base.padd(ids, 60) for ids in train_data_index]
eval_padded_data = [base.padd(ids, 60) for ids in eval_data_index]
test_padded_data = [base.padd(ids, 60) for ids in test_data_index]

# Augmented training split.
all_train_padded_data = [base.padd(ids, 60) for ids in all_train_data_index]

In [15]:
def _set_input_ids(dataset, padded_ids):
    """Return `dataset` with an "input_ids" column holding `padded_ids`.

    An existing "input_ids" column is dropped first, so this cell can be
    re-run without add_column failing on a duplicate column name.

    Args:
        dataset: a datasets.Dataset to extend.
        padded_ids: one padded index sequence per row of `dataset`.

    Returns:
        A new dataset; the input object is not modified.
    """
    if "input_ids" in dataset.column_names:
        dataset = dataset.remove_columns("input_ids")
    return dataset.add_column("input_ids", padded_ids)


# Attach the model inputs to every split under the name the trainers expect.
train_data = _set_input_ids(train_data, train_padded_data)
eval_data = _set_input_ids(eval_data, eval_padded_data)
test_data = _set_input_ids(test_data, test_padded_data)

all_train_data = _set_input_ids(all_train_data, all_train_padded_data)

In [15]:
# Baseline BiLSTM classifier initialised from the GloVe embedding matrix.
# output_dim=50: presumably the number of fine-grained TREC labels — TODO confirm.
# freeze_embed=False: presumably leaves the embedding layer trainable — confirm in base.
# NOTE(review): this runs before base.reset_seed(), so any random weight
# initialisation here is not covered by that reset — confirm this is intended.
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50, freeze_embed=False)

In [16]:
# Sanity check: show the layer layout of the freshly built model.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)


In [17]:
# Experiment 1 (baseline, original train split): training configuration.
# Remaining settings (evaluation/save strategy, best-model selection, ...)
# come from the defaults inside base.get_training_args.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base_fine_embedd", logging_dir=f"~/logs/{DATASET}/bilstm-base_fine_embedd", lr=.001,  epochs=10, batch_size=128)

In [18]:
# Reset the random seeds right before the trainer is built.
# NOTE(review): `model` was instantiated in an earlier cell, so its weight
# initialisation is not covered by this reset — confirm that is intended.
base.reset_seed()

In [19]:
# Baseline run: standard Hugging Face Trainer on the original train split,
# validated on eval_data after which early stopping may end training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has not improved for 3 evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [20]:
# Train the baseline; the cell displays the per-epoch metrics and the final TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0565,2.436181,0.423465,0.062694,0.095029,0.067
2,2.0935,1.847499,0.545371,0.165677,0.158087,0.135725
3,1.6178,1.520944,0.617782,0.255894,0.223985,0.216745
4,1.2109,1.332834,0.656279,0.318186,0.2745,0.27281
5,0.9269,1.200623,0.688359,0.370112,0.325136,0.327787
6,0.6972,1.156946,0.705775,0.43998,0.388992,0.391736
7,0.5335,1.19404,0.703025,0.439554,0.379934,0.393342
8,0.4254,1.14095,0.71494,0.435646,0.435435,0.43119
9,0.3472,1.166825,0.719523,0.473302,0.445483,0.451222
10,0.2853,1.154827,0.71769,0.471534,0.440157,0.445367


TrainOutput(global_step=350, training_loss=1.1194278717041015, metrics={'train_runtime': 50.7084, 'train_samples_per_second': 860.016, 'train_steps_per_second': 6.902, 'total_flos': 0.0, 'train_loss': 1.1194278717041015, 'epoch': 10.0})

In [21]:
# Switch to evaluation mode (dropout disabled); the call returns the module,
# which is why the cell output shows the model layout.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [22]:
# Score the baseline on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 1.1103469133377075,
 'eval_accuracy': 0.728,
 'eval_precision': 0.4749934121396973,
 'eval_recall': 0.48748620936716064,
 'eval_f1': 0.45324427900232056,
 'eval_runtime': 3.1349,
 'eval_samples_per_second': 159.497,
 'eval_steps_per_second': 1.276,
 'epoch': 10.0}

In [23]:
# Persist the baseline weights. "~" is expanded explicitly because the path is
# handed to torch.save as a plain string; the target directory must already exist.
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_fine_embedd.pth")

In [24]:
# Student for the distillation run: same architecture and hyperparameters as
# the baseline model so the two experiments differ only in the training loss.
# NOTE(review): built before base.reset_seed(), so weight initialisation is
# not covered by that reset — confirm this is intended.
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50, freeze_embed=False)

In [25]:
# Experiment 2 (distillation, original train split): training configuration.
# remove_unused_columns=False keeps the extra dataset columns available to the
# trainer — presumably the stored teacher logits; confirm in base.DistilTrainer.
# lambda_param / temp: presumably the loss-mixing weight and the softmax
# temperature of the distillation loss — confirm in base.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_embedd", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_embedd", lr=.001,  epochs=10, batch_size=128, lambda_param=.4, temp=2)

In [26]:
# Reset the random seeds before the distillation trainer is built.
# NOTE(review): `student_model` already exists at this point, so its weight
# initialisation is not covered by this reset — confirm that is intended.
base.reset_seed()

In [27]:
# Distillation run on the original train split. No teacher model is passed
# here; presumably the teacher signal is read from the dataset columns —
# confirm in base.DistilTrainer.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    # Stop once the monitored metric has not improved for 3 evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [28]:
# Train the distilled student; displays per-epoch metrics and the final TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7434,2.190063,0.374885,0.063283,0.078708,0.051833
2,1.8885,1.70226,0.492209,0.093788,0.12623,0.101897
3,1.5212,1.40056,0.585701,0.160858,0.181285,0.158776
4,1.2118,1.215174,0.647113,0.241764,0.233264,0.217399
5,0.9881,1.107085,0.68011,0.296947,0.294702,0.28531
6,0.813,1.024184,0.703941,0.339447,0.324904,0.317369
7,0.6855,1.025453,0.704858,0.396733,0.329421,0.337294
8,0.6089,0.965684,0.722273,0.391613,0.351299,0.348777
9,0.5324,0.959173,0.721357,0.413331,0.365,0.371822
10,0.4922,0.951961,0.721357,0.40093,0.36336,0.367079


TrainOutput(global_step=350, training_loss=1.1484937231881278, metrics={'train_runtime': 109.8886, 'train_samples_per_second': 396.856, 'train_steps_per_second': 3.185, 'total_flos': 0.0, 'train_loss': 1.1484937231881278, 'epoch': 10.0})

In [29]:
# Switch the student to evaluation mode (dropout disabled); returns the module itself.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [30]:
# Score the distilled student on the held-out test split; returns the metrics dict.
trainer.evaluate(test_data)

{'eval_loss': 0.9413951635360718,
 'eval_accuracy': 0.722,
 'eval_precision': 0.3996453701382308,
 'eval_recall': 0.4473198892622083,
 'eval_f1': 0.3991111870917891,
 'eval_runtime': 4.6147,
 'eval_samples_per_second': 108.351,
 'eval_steps_per_second': 0.867,
 'epoch': 10.0}

In [31]:
# Persist the distilled student's weights; the target directory must already exist.
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_fine_embedd.pth")

In [50]:
# Fresh baseline model for the augmented-data run. This rebinds `model`, so
# the network trained in the first experiment is only available via its
# saved state dict from here on.
# NOTE(review): built before base.reset_seed(), so weight initialisation is
# not covered by that reset — confirm this is intended.
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50, freeze_embed=False)

In [None]:
# Experiment 3 (baseline, augmented train split): training configuration.
# Differs from experiment 1 in learning rate, weight decay, warm-up, Adam
# beta1 and the epoch budget (30).
# NOTE(review): this cell has no execution count, so the outputs recorded
# below may come from an earlier version of these arguments — re-run to confirm.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base-aug_fine_embedd", logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_fine_embedd", lr=0.0015, weight_decay=0.01, warmup_steps=45, adam_beta1=.95, epochs=30, batch_size=128)

In [52]:
# Reset the random seeds before the augmented-data baseline trainer is built.
# NOTE(review): `model` already exists at this point, so its weight
# initialisation is not covered by this reset — confirm that is intended.
base.reset_seed()

In [53]:
# Baseline run on the augmented train split (all_train_data); validation
# still uses the original eval split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    # Patience of 5 evaluations here, versus 3 in the other three runs.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)]
)

In [54]:
# Train the augmented-data baseline; displays per-epoch metrics and the final TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8855,1.169522,0.777269,0.643539,0.648662,0.629898
2,0.0436,1.269554,0.802016,0.746281,0.729575,0.719434
3,0.0133,1.378293,0.813016,0.7579,0.745359,0.732024
4,0.0103,1.447381,0.805683,0.739511,0.732891,0.725578
5,0.0072,1.450472,0.812099,0.767619,0.737152,0.738159
6,0.0077,1.698676,0.80385,0.74337,0.71237,0.714853
7,0.0051,1.635764,0.808433,0.719606,0.72581,0.710666
8,0.0032,1.694029,0.804766,0.772608,0.725948,0.729005
9,0.0039,1.72215,0.807516,0.743356,0.723619,0.717455
10,0.0016,1.673578,0.811182,0.745635,0.739458,0.730623


TrainOutput(global_step=5250, training_loss=0.09814934185573033, metrics={'train_runtime': 111.7299, 'train_samples_per_second': 18011.288, 'train_steps_per_second': 140.965, 'total_flos': 0.0, 'train_loss': 0.09814934185573033, 'epoch': 10.0})

In [55]:
# Switch to evaluation mode (dropout disabled); returns the module itself.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [56]:
# Score the augmented-data baseline on the held-out test split; returns the metrics dict.
trainer.evaluate(test_data)

{'eval_loss': 1.349766731262207,
 'eval_accuracy': 0.802,
 'eval_precision': 0.6860133764348323,
 'eval_recall': 0.6866811753501773,
 'eval_f1': 0.6607193638496234,
 'eval_runtime': 4.1044,
 'eval_samples_per_second': 121.822,
 'eval_steps_per_second': 0.975,
 'epoch': 10.0}

In [39]:
# Persist the augmented-data baseline weights; the target directory must already exist.
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_fine_embedd.pth")

In [48]:
# Fresh student for the augmented-data distillation run. This rebinds
# `student_model`; the earlier student is only available via its saved state dict.
# NOTE(review): built before base.reset_seed(), so weight initialisation is
# not covered by that reset — confirm this is intended.
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50, freeze_embed=False)

In [49]:
# Experiment 4 (distillation, augmented train split): same optimiser settings
# as the augmented baseline, plus the distillation parameters.
# remove_unused_columns=False keeps the extra dataset columns available to the
# trainer — presumably the stored teacher logits; confirm in base.DistilTrainer.
# lambda_param=.9 / temp=2: presumably loss-mixing weight and softmax
# temperature — confirm in base.
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill-aug_fine_embedd", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_fine_embedd", lr=0.0015, weight_decay=0.01, warmup_steps=45, adam_beta1=.95, epochs=30, batch_size=128, lambda_param=.9, temp=2)

In [50]:
# Reset the random seeds before the augmented-data distillation trainer is built.
# NOTE(review): `student_model` already exists at this point, so its weight
# initialisation is not covered by this reset — confirm that is intended.
base.reset_seed()

In [51]:
# Distillation run on the augmented train split; validation still uses the
# original eval split. No teacher model is passed; presumably the teacher
# signal is read from the dataset columns — confirm in base.DistilTrainer.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    # NOTE(review): patience is 3 here but 5 in the augmented baseline run;
    # confirm the mismatch is intended, since it affects how the two compare.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [52]:
# Train the augmented-data student; displays per-epoch metrics and the final TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7071,0.492144,0.787351,0.491361,0.474795,0.473572
2,0.1091,0.443025,0.820348,0.646857,0.604719,0.617351
3,0.0753,0.425821,0.824015,0.748452,0.672298,0.695759
4,0.0668,0.418902,0.827681,0.745506,0.676751,0.696104
5,0.0619,0.425407,0.821265,0.759629,0.676359,0.703837
6,0.0584,0.429843,0.824931,0.746868,0.684409,0.700482
7,0.0556,0.414237,0.833181,0.758684,0.702429,0.718426
8,0.0541,0.42557,0.831347,0.736853,0.68283,0.698321
9,0.0521,0.413955,0.828598,0.784349,0.694357,0.725718
10,0.0508,0.401972,0.84143,0.791208,0.709334,0.735189


TrainOutput(global_step=12075, training_loss=0.08045230920763984, metrics={'train_runtime': 307.9629, 'train_samples_per_second': 6534.554, 'train_steps_per_second': 51.143, 'total_flos': 0.0, 'train_loss': 0.08045230920763984, 'epoch': 23.0})

In [54]:
# Switch the student to evaluation mode (dropout disabled); returns the module itself.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [55]:
# Score the augmented-data student on the held-out test split; returns the metrics dict.
trainer.evaluate(test_data)

{'eval_loss': 0.30593571066856384,
 'eval_accuracy': 0.846,
 'eval_precision': 0.7545350981094542,
 'eval_recall': 0.7052223789244695,
 'eval_f1': 0.7065513632758179,
 'eval_runtime': 16.4272,
 'eval_samples_per_second': 30.437,
 'eval_steps_per_second': 0.243,
 'epoch': 23.0}

In [47]:
# Persist the weights of the student distilled on the augmented split.
# This must be `student_model`: `model` is the augmented *baseline* network,
# and saving it here would store baseline weights under the distill-aug name.
# The target directory must already exist.
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_fine_embedd.pth")

In [25]:
import itertools
import time
from torch.utils.data import DataLoader

# Number of single-sentence forward passes that are timed per device.
BENCH_SAMPLES = 1000
# Untimed forward passes run first so one-off start-up costs (CUDA context,
# kernel selection, allocator growth) do not skew the average.
BENCH_WARMUP = 10


def _mean_latency_ms(net, dataset, device, n_samples=BENCH_SAMPLES, n_warmup=BENCH_WARMUP):
    """Average per-example forward-pass latency of `net` on `device`, in ms.

    Args:
        net: the model to time; it is moved to `device` and put in eval mode.
        dataset: a datasets.Dataset with an "input_ids" column. It is read
            through `with_format`, which returns a formatted view and leaves
            the caller's dataset (and its other columns) untouched, so the
            training cells can still be re-run afterwards.
        device: "cuda" or "cpu".
        n_samples: maximum number of timed batches (batch size 1).
        n_warmup: number of untimed batches executed first.

    Returns:
        Mean latency in milliseconds, or NaN when the dataset is empty.
    """
    on_gpu = torch.device(device).type == "cuda"
    net.to(device)
    net.eval()

    view = dataset.with_format(type="torch", columns=["input_ids"], device=device)
    loader = DataLoader(view, batch_size=1, shuffle=False)

    timings = []
    with torch.no_grad():
        # Warm-up on the first few examples; results are discarded.
        for batch in itertools.islice(loader, n_warmup):
            net(**batch)

        if on_gpu:
            starter = torch.cuda.Event(enable_timing=True)
            ender = torch.cuda.Event(enable_timing=True)

        # A fresh pass over the loader: the timed examples start at row 0 again.
        for batch in itertools.islice(loader, n_samples):
            if on_gpu:
                # CUDA kernels run asynchronously: synchronise before starting
                # and before reading the event timer.
                torch.cuda.synchronize()
                starter.record()
                net(**batch)
                ender.record()
                torch.cuda.synchronize()
                timings.append(starter.elapsed_time(ender))
            else:
                start_time = time.perf_counter()
                net(**batch)
                timings.append((time.perf_counter() - start_time) * 1000)

    return sum(timings) / len(timings) if timings else float("nan")


# Report model size / parameter count (printed by base.count_parameters).
base.count_parameters(model)

# Latency is measured on train_data rows (input only, labels are not used);
# the test split has fewer rows than BENCH_SAMPLES.
if torch.cuda.is_available():
    gpu_ms = _mean_latency_ms(model, train_data, "cuda")
    print(f"Average Inference Time on GPU: {gpu_ms:.3f} ms")
else:
    print("GPU is not available, skipping the GPU benchmark.")

cpu_ms = _mean_latency_ms(model, train_data, "cpu")
print(f"Average Inference Time on CPU: {cpu_ms:.3f} ms")

model size: 16.539MB.
Total Trainable Params: 4335650.
Average Inference Time on GPU: 1.030 ms
Average Inference Time on CPU: 9.659 ms
