In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the GloVe dataset through kagglehub; the call returns the local
# directory holding the files, which is printed for reference.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
# Path of the GloVe text file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset name; interpolated into every data/results/logs/models path below.
DATASET = "trec"

In [4]:
# Load the pre-built splits for DATASET from disk.
# (The "-logits" suffix suggests they carry stored teacher logits -- TODO confirm.)
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_fine")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_fine")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_fine")

# Augmented variant of the training split.
all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_fine")

# Corpus used to build the vocabulary: eval + test + augmented train, in that
# order. The splits loaded above are reused instead of being read from disk a
# second time.
# NOTE(review): train_data is not part of all_data; the later word_index lookups
# on its tokens rely on the augmented split covering its vocabulary -- confirm.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Basic whitespace/punctuation tokenizer with lower-casing enabled.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Choose the compute device once, then report which one was selected.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA H100 PCIe


In [6]:
# Tokenize the "sentence" field of every example in each split; the result is
# one list of token strings per example.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]

# Tokens of the combined corpus, used only to build the vocabulary.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [7]:
# Build the vocabulary from the tokens of the combined corpus (all_data).
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map each vocabulary entry to its position in `vocab` (0-based).
word_index = {token: position for position, token in enumerate(vocab)}

In [9]:
# Read the GloVe file via the project helper; presumably yields a
# word -> vector lookup (TODO confirm the return type in base).
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
# Report the vocabulary size.
print(len(vocab))
# Number of embedding rows: vocabulary size plus two extra rows.
# NOTE(review): the two extra rows are presumably reserved for padding/unknown
# tokens -- confirm against base.get_embedding_matrix and base.padd.
num_tokens = len(vocab) + 2
# Width of each embedding vector; matches the 300d GloVe file configured above.
embedding_dim = 300

8766


In [11]:
# Build the initial embedding matrix from the GloVe lookup for the words in
# word_index; the helper reports how many words were converted vs. missed.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token by its vocabulary id, sentence by sentence.
# A token missing from word_index raises KeyError.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Second argument handed to base.padd for every id sequence
# (presumably the target sequence length -- confirm in base.padd).
MAX_SEQ_LEN = 60

train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in train_data_index]
eval_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in eval_data_index]
test_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in all_train_data_index]

In [14]:
# Attach the padded id sequences to each split as an "input_ids" column.
# NOTE(review): each name is rebound to the dataset that already has the column,
# so re-running this cell without reloading the data is likely to fail on the
# duplicate column -- confirm before re-executing.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [15]:
# Baseline BiLSTM classifier initialised from the GloVe-based embedding matrix;
# freeze_embed=False is passed, i.e. embedding freezing is not requested.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=50,
    freeze_embed=False,
)

In [16]:
# Print the module structure to sanity-check the layer sizes.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)


In [18]:
# Training configuration for the baseline BiLSTM run.
# NOTE(review): output_dir/logging_dir contain a literal "~"; confirm that
# base.get_training_args expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_fine_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_fine_embedd",
    lr=.005,
    weight_decay=0.001,
    warmup_steps=2,
    epochs=20,
    batch_size=128,
)

In [19]:
# Reset the random seed(s) via the project helper before building the trainer.
# NOTE(review): `model` was instantiated in an earlier cell, so this reseed does
# not cover its weight initialisation -- confirm whether that is intended.
base.reset_seed()

In [20]:
# Early stopping with a patience of 3 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline run: train on the original training split, validate on eval_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [21]:
# Run training for the baseline model; the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5926,1.848806,0.534372,0.1897,0.188172,0.169641
2,1.4058,1.276808,0.679193,0.319556,0.330704,0.314073
3,0.7476,1.100861,0.745188,0.520187,0.466819,0.472995
4,0.3496,1.160507,0.75527,0.617434,0.548593,0.558517
5,0.1314,1.296833,0.774519,0.653317,0.62119,0.620714
6,0.0406,1.399598,0.788268,0.68875,0.675803,0.669047
7,0.0174,1.482527,0.780018,0.700052,0.667438,0.661827
8,0.0056,1.487703,0.784601,0.712193,0.686995,0.678266
9,0.0028,1.505378,0.791934,0.725982,0.6629,0.677721
10,0.0022,1.53908,0.793767,0.729023,0.67981,0.68593


TrainOutput(global_step=455, training_loss=0.4075402093948899, metrics={'train_runtime': 60.9205, 'train_samples_per_second': 1431.702, 'train_steps_per_second': 11.49, 'total_flos': 0.0, 'train_loss': 0.4075402093948899, 'epoch': 13.0})

In [22]:
# Put the baseline model in evaluation mode; eval() returns the module itself,
# which is what the cell displays.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [23]:
# Score the baseline model on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 1.3100125789642334,
 'eval_accuracy': 0.81,
 'eval_precision': 0.6784269701338591,
 'eval_recall': 0.6670404908702179,
 'eval_f1': 0.653112216645774,
 'eval_runtime': 3.3063,
 'eval_samples_per_second': 151.226,
 'eval_steps_per_second': 1.21,
 'epoch': 13.0}

In [24]:
# Persist the baseline model's weights under the user's home directory.
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_fine_embedd.pth"
torch.save(model.state_dict(), weights_path)

In [25]:
# Fresh BiLSTM with the same architecture as the baseline, to be trained with
# the distillation trainer.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=50,
    freeze_embed=False,
)

In [27]:
# Training configuration for the distillation run on the original training split.
# remove_unused_columns=False keeps dataset columns that the model's forward
# signature does not name. lambda_param and temp are distillation settings
# (presumably loss mixing weight and softmax temperature -- confirm in base).
# warmup_steps is not passed here, unlike the baseline configuration.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_fine_embedd",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_embedd",
    lr=.005,
    weight_decay=.008,
    epochs=20,
    batch_size=128,
    lambda_param=.5,
    temp=6.5,
)

In [28]:
# Reset the random seed(s) before the distillation run.
# NOTE(review): `student_model` was instantiated in an earlier cell, so this
# reseed does not cover its weight initialisation -- confirm whether intended.
base.reset_seed()

In [29]:
# Early stopping with a patience of 3 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation run on the original training split. No teacher model is passed,
# so the teacher signal presumably comes from columns of the dataset (TODO confirm
# in base.DistilTrainer).
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [30]:
# Run distillation training for the student; the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.586,1.180131,0.569203,0.165335,0.186143,0.163274
2,0.9358,0.89588,0.673694,0.32379,0.303174,0.294209
3,0.5862,0.691576,0.758937,0.421281,0.418588,0.404726
4,0.3531,0.629786,0.767186,0.497468,0.458946,0.465979
5,0.2234,0.589186,0.791017,0.57357,0.536611,0.53932
6,0.1559,0.566985,0.802016,0.666566,0.610506,0.618351
7,0.1108,0.553171,0.808433,0.684647,0.630072,0.645062
8,0.0921,0.550189,0.810266,0.748492,0.656978,0.685277
9,0.0794,0.541531,0.807516,0.734982,0.662601,0.68534
10,0.0727,0.542373,0.812099,0.738007,0.670371,0.689537


TrainOutput(global_step=560, training_loss=0.2864444660288947, metrics={'train_runtime': 72.097, 'train_samples_per_second': 1209.759, 'train_steps_per_second': 9.709, 'total_flos': 0.0, 'train_loss': 0.2864444660288947, 'epoch': 16.0})

In [31]:
# Put the student in evaluation mode; eval() returns the module itself, which is
# what the cell displays.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [32]:
# Score the distilled student on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 0.49377110600471497,
 'eval_accuracy': 0.842,
 'eval_precision': 0.7307214920902012,
 'eval_recall': 0.7151819267016707,
 'eval_f1': 0.7106890772345754,
 'eval_runtime': 3.3712,
 'eval_samples_per_second': 148.315,
 'eval_steps_per_second': 1.187,
 'epoch': 16.0}

In [33]:
# Persist the distilled student's weights under the user's home directory.
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_fine_embedd.pth"
torch.save(student_model.state_dict(), weights_path)

In [34]:
# Fresh baseline BiLSTM (same architecture as before) for the run on the
# augmented training split; rebinding `model` discards the earlier baseline.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=50,
    freeze_embed=False,
)

In [35]:
# Training configuration for the baseline run on the augmented training split.
# NOTE(review): output_dir/logging_dir contain a literal "~"; confirm that
# base.get_training_args expands it.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_fine_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_fine_embedd",
    lr=0.0035,
    weight_decay=0.003,
    warmup_steps=6,
    epochs=20,
    batch_size=128,
)

In [36]:
# Reset the random seed(s) before the augmented baseline run.
# NOTE(review): `model` was instantiated in an earlier cell, so this reseed does
# not cover its weight initialisation -- confirm whether that is intended.
base.reset_seed()

In [37]:
# Early stopping with a patience of 3 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline run on the augmented training split; validation still uses eval_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [38]:
# Train the baseline on the augmented split; the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5044,1.117437,0.807516,0.769121,0.709247,0.723681
2,0.0214,1.404991,0.802933,0.743122,0.74463,0.728021
3,0.014,1.625707,0.79835,0.773361,0.720528,0.726245
4,0.0136,1.573713,0.811182,0.759771,0.707243,0.718495
5,0.0066,1.807786,0.813016,0.805168,0.752777,0.762375
6,0.0074,1.848962,0.815765,0.794529,0.751498,0.757117
7,0.0041,2.041687,0.812099,0.765724,0.728152,0.734704
8,0.0065,1.950236,0.806599,0.733352,0.713455,0.712038


TrainOutput(global_step=4184, training_loss=0.07225541932860935, metrics={'train_runtime': 115.0389, 'train_samples_per_second': 11624.592, 'train_steps_per_second': 90.926, 'total_flos': 0.0, 'train_loss': 0.07225541932860935, 'epoch': 8.0})

In [39]:
# Put the augmented-data baseline in evaluation mode; eval() returns the module
# itself, which is what the cell displays.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [40]:
# Score the augmented-data baseline on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 1.6040947437286377,
 'eval_accuracy': 0.838,
 'eval_precision': 0.7170619750720718,
 'eval_recall': 0.7312740111641489,
 'eval_f1': 0.7027150612991261,
 'eval_runtime': 3.4479,
 'eval_samples_per_second': 145.017,
 'eval_steps_per_second': 1.16,
 'epoch': 8.0}

In [41]:
# Persist the augmented-data baseline's weights under the user's home directory.
weights_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_fine_embedd.pth"
torch.save(model.state_dict(), weights_path)

In [42]:
# Fresh student BiLSTM (same architecture) for distillation on the augmented
# training split; rebinding `student_model` discards the earlier student.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=50,
    freeze_embed=False,
)

In [43]:
# Training configuration for the distillation run on the augmented split.
# remove_unused_columns=False keeps dataset columns that the model's forward
# signature does not name. lambda_param and temp are distillation settings
# (presumably loss mixing weight and softmax temperature -- confirm in base).
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_fine_embedd",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_fine_embedd",
    lr=0.0045,
    weight_decay=0.002,
    warmup_steps=49,
    epochs=20,
    batch_size=128,
    lambda_param=.9,
    temp=2,
)

In [44]:
# Reset the random seed(s) before the augmented distillation run.
# NOTE(review): `student_model` was instantiated in an earlier cell, so this
# reseed does not cover its weight initialisation -- confirm whether intended.
base.reset_seed()

In [45]:
# Early stopping with a patience of 3 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation run on the augmented training split; validation uses eval_data.
# No teacher model is passed, so the teacher signal presumably comes from
# columns of the dataset (TODO confirm in base.DistilTrainer).
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [46]:
# Run distillation training on the augmented split; the returned TrainOutput is
# displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.462,0.485102,0.813016,0.680918,0.612705,0.632058
2,0.0872,0.47944,0.825848,0.794887,0.670816,0.710328
3,0.0725,0.493578,0.819432,0.790792,0.713239,0.736147
4,0.0656,0.474859,0.826764,0.773054,0.690754,0.713957
5,0.061,0.468091,0.826764,0.821256,0.73242,0.75978
6,0.0575,0.472743,0.832264,0.825034,0.744926,0.770607
7,0.0548,0.475578,0.829514,0.846457,0.743583,0.778712
8,0.0531,0.471309,0.833181,0.850078,0.740529,0.775706
9,0.0503,0.447396,0.835014,0.847293,0.747636,0.780225
10,0.0482,0.45321,0.831347,0.836824,0.74772,0.775548


TrainOutput(global_step=6276, training_loss=0.0919562218825934, metrics={'train_runtime': 185.1744, 'train_samples_per_second': 7221.734, 'train_steps_per_second': 56.487, 'total_flos': 0.0, 'train_loss': 0.0919562218825934, 'epoch': 12.0})

In [47]:
# Put the augmented-data student in evaluation mode; eval() returns the module
# itself, which is what the cell displays.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [48]:
# Score the augmented-data student on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 0.31923750042915344,
 'eval_accuracy': 0.838,
 'eval_precision': 0.7880327087102402,
 'eval_recall': 0.728432640970928,
 'eval_f1': 0.7320033045657414,
 'eval_runtime': 3.5403,
 'eval_samples_per_second': 141.233,
 'eval_steps_per_second': 1.13,
 'epoch': 12.0}

In [49]:
# Persist the weights of the student distilled on the augmented split.
# This must save `student_model`: `model` at this point is the augmented-data
# baseline, whose weights are already stored in bilstm-base-aug_fine_embedd.pth.
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_fine_embedd.pth")