In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "trec"

In [4]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_coarse")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_coarse")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_coarse")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_coarse")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits_coarse", f"~/data/{DATASET}/test-logits_coarse", f"~/data/{DATASET}/train-logits-augmented_coarse"]])
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [7]:
vocab = base.get_vocab(all_data_tokens)

In [8]:
word_index = dict(zip(vocab, range(len(vocab))))

In [9]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

8766


In [11]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [13]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [14]:
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [15]:
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6, freeze_embed=False)

In [16]:
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)


In [17]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base_coarse_embedd", logging_dir=f"~/logs/{DATASET}/bilstm-base_coarse_embedd", lr=.001,  epochs=10, batch_size=128)

In [18]:
base.reset_seed()

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2338,0.811678,0.715857,0.628563,0.609063,0.613781
2,0.6077,0.566749,0.7956,0.676606,0.680306,0.677765
3,0.3874,0.551974,0.809349,0.693454,0.693037,0.687825
4,0.2888,0.563791,0.824931,0.706724,0.702814,0.701053
5,0.2009,0.491679,0.852429,0.859317,0.791095,0.810715
6,0.1413,0.574411,0.846929,0.835949,0.787457,0.801078
7,0.0805,0.554179,0.849679,0.862203,0.804529,0.825238
8,0.0421,0.585277,0.861595,0.832184,0.814428,0.822224
9,0.0271,0.622442,0.855179,0.854621,0.809675,0.826876
10,0.0191,0.63673,0.849679,0.83895,0.805064,0.818681


TrainOutput(global_step=350, training_loss=0.30289041110447473, metrics={'train_runtime': 139.6954, 'train_samples_per_second': 312.179, 'train_steps_per_second': 2.505, 'total_flos': 0.0, 'train_loss': 0.30289041110447473, 'epoch': 10.0})

In [21]:
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [22]:
trainer.evaluate(test_data)

{'eval_loss': 0.3329053819179535,
 'eval_accuracy': 0.9,
 'eval_precision': 0.8943149032477361,
 'eval_recall': 0.8802283162823269,
 'eval_f1': 0.8866233419881647,
 'eval_runtime': 4.1066,
 'eval_samples_per_second': 121.755,
 'eval_steps_per_second': 0.974,
 'epoch': 10.0}

In [23]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_coarse_embedd.pth")

In [24]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6, freeze_embed=False)

In [25]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse_embedd", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse_embedd", lr=.001,  epochs=10, batch_size=128, lambda_param=.75, temp=5)

In [26]:
base.reset_seed()

In [27]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8523,2.672727,0.68286,0.578581,0.582942,0.576538
2,1.9608,1.648873,0.773602,0.646717,0.664364,0.653106
3,1.2175,1.368568,0.821265,0.694613,0.700527,0.694911
4,0.8676,1.264726,0.836847,0.714829,0.713952,0.711406
5,0.6589,1.085741,0.849679,0.710985,0.725634,0.717291
6,0.4806,1.010492,0.861595,0.720762,0.734513,0.727213
7,0.3481,0.946195,0.868011,0.726349,0.739592,0.73275
8,0.2756,0.945951,0.868011,0.728152,0.740343,0.733552
9,0.2278,0.933968,0.867094,0.810358,0.747705,0.749853
10,0.2034,0.955012,0.862511,0.848744,0.762081,0.775653


TrainOutput(global_step=350, training_loss=1.009255609512329, metrics={'train_runtime': 72.9245, 'train_samples_per_second': 598.016, 'train_steps_per_second': 4.799, 'total_flos': 0.0, 'train_loss': 1.009255609512329, 'epoch': 10.0})

In [29]:
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [30]:
trainer.evaluate(test_data)

{'eval_loss': 0.7882834672927856,
 'eval_accuracy': 0.908,
 'eval_precision': 0.9207660670802178,
 'eval_recall': 0.834202483449011,
 'eval_f1': 0.8589702187403443,
 'eval_runtime': 3.9348,
 'eval_samples_per_second': 127.071,
 'eval_steps_per_second': 1.017,
 'epoch': 10.0}

In [31]:
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_coarse_embedd.pth")

In [32]:
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6, freeze_embed=False)

In [33]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-base-aug_coarse_embedd", logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_coarse_embedd", lr=.001,  epochs=10, batch_size=128)

In [34]:
base.reset_seed()

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4403,0.546446,0.854262,0.855422,0.810439,0.825858
2,0.078,0.598654,0.874427,0.86051,0.837758,0.845682
3,0.0338,0.676794,0.861595,0.862652,0.827101,0.839423
4,0.0191,0.790577,0.866178,0.865805,0.829853,0.842501
5,0.0108,0.873009,0.864345,0.873268,0.819774,0.837186


TrainOutput(global_step=1525, training_loss=0.11640459748565174, metrics={'train_runtime': 112.4509, 'train_samples_per_second': 3460.89, 'train_steps_per_second': 27.123, 'total_flos': 0.0, 'train_loss': 0.11640459748565174, 'epoch': 5.0})

In [37]:
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [38]:
trainer.evaluate(test_data)

{'eval_loss': 0.4405682384967804,
 'eval_accuracy': 0.896,
 'eval_precision': 0.892155015738569,
 'eval_recall': 0.9033938908373247,
 'eval_f1': 0.8961492278185662,
 'eval_runtime': 4.2079,
 'eval_samples_per_second': 118.825,
 'eval_steps_per_second': 0.951,
 'epoch': 5.0}

In [39]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_coarse_embedd.pth")

In [40]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6, freeze_embed=False)

In [41]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill-aug_coarse_embedd", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_coarse_embedd", lr=.001,  epochs=10, batch_size=128, lambda_param=.75, temp=5)

In [42]:
base.reset_seed()

In [43]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

In [44]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1924,0.905777,0.867094,0.836467,0.759292,0.764748
2,0.2468,0.736972,0.891842,0.899552,0.851978,0.869712
3,0.1537,0.763595,0.88451,0.889441,0.846548,0.860873
4,0.1199,0.702606,0.890009,0.896325,0.849178,0.866385
5,0.1028,0.670004,0.898258,0.904181,0.855402,0.873959
6,0.0916,0.644857,0.899175,0.905682,0.86523,0.881508
7,0.0835,0.655698,0.898258,0.905054,0.864076,0.880592
8,0.0768,0.653196,0.898258,0.904253,0.865518,0.880834
9,0.0716,0.649641,0.898258,0.904024,0.855758,0.874106


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.


TrainOutput(global_step=2745, training_loss=0.23768656371069735, metrics={'train_runtime': 310.2437, 'train_samples_per_second': 1254.433, 'train_steps_per_second': 9.831, 'total_flos': 0.0, 'train_loss': 0.23768656371069735, 'epoch': 9.0})

In [45]:
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [46]:
trainer.evaluate(test_data)

{'eval_loss': 0.4989459216594696,
 'eval_accuracy': 0.936,
 'eval_precision': 0.9445246437132847,
 'eval_recall': 0.9274722826896467,
 'eval_f1': 0.9346960632677659,
 'eval_runtime': 3.9288,
 'eval_samples_per_second': 127.264,
 'eval_steps_per_second': 1.018,
 'epoch': 9.0}

In [47]:
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_coarse_embedd.pth")