In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the GloVe 6B 300d dataset through kagglehub and print the value it
# returns (the recorded run shows a directory inside the local kagglehub cache).
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [None]:
# Notebook-wide configuration.
# GLOVE_FILE: the GloVe vectors file inside the directory returned by kagglehub.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# DATASET: interpolated into every data / results / logs / models path below.
DATASET = "trec"

In [4]:
# Load the three base splits and the augmented training split for DATASET.
# NOTE(review): the "-logits" suffix suggests each split also stores teacher
# logits for distillation — confirm against the script that produced them.
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_coarse")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_coarse")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_coarse")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_coarse")

# Union of eval + test + augmented train; in the visible cells it is only used
# to build the vocabulary. Reuse the splits loaded above instead of reading the
# same three directories from disk a second time (same order as before).
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing tokenizer used to split every sentence into tokens.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick the compute device once and report which one was chosen.
# Note: `device` is not referenced again in the visible cells.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# Tokenize the "sentence" field of every example, one token list per example.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]

# Tokens of the combined dataset; the vocabulary is built from these.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [7]:
# Build the vocabulary from the tokens of all_data (eval + test + augmented train).
# NOTE(review): train_data is not part of all_data; the later word_index[...] lookup
# raises KeyError unless every token of train_data also occurs in the augmented
# training split — presumably it is a superset; confirm.
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map each vocabulary entry to its position in `vocab` (ids start at 0).
word_index = {token: position for position, token in enumerate(vocab)}

In [9]:
# Read the pre-trained GloVe vectors from GLOVE_FILE
# (the recorded run reports 400000 word vectors).
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
# Report the vocabulary size (8766 in the recorded run).
print(len(vocab))
# Number of rows of the embedding matrix: vocabulary size plus two extra rows.
# NOTE(review): the two extra rows are presumably reserved for padding / unknown
# ids, but word_index starts at 0 — confirm against base.padd and
# base.get_embedding_matrix that the padding id does not collide with a real word.
num_tokens = len(vocab) + 2
# Vector width; matches the 300d GloVe file referenced by GLOVE_FILE.
embedding_dim = 300

8766


In [11]:
# Assemble the embedding matrix for the vocabulary from the GloVe vectors
# (the recorded run reports 8551 words converted and 215 misses).
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token by its vocabulary id; a token missing from word_index
# raises KeyError.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Length argument handed to base.padd for every split (previously the literal 60
# repeated four times).
# NOTE(review): presumably the fixed sequence length each example is padded to;
# whether base.padd also truncates longer inputs is not visible here — confirm.
MAX_SEQ_LEN = 60

train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in train_data_index]
eval_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in eval_data_index]
test_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in all_train_data_index]

In [14]:
def _with_input_ids(dataset, padded_ids):
    """Return `dataset` with an `input_ids` column holding `padded_ids`.

    An existing `input_ids` column is dropped first, so re-running this cell
    replaces the column instead of failing on a duplicate column name.
    """
    if "input_ids" in dataset.column_names:
        dataset = dataset.remove_columns("input_ids")
    return dataset.add_column("input_ids", padded_ids)


# Attach the padded id sequences to each split under the "input_ids" column.
train_data = _with_input_ids(train_data, train_padded_data)
eval_data = _with_input_ids(eval_data, eval_padded_data)
test_data = _with_input_ids(test_data, test_padded_data)

all_train_data = _with_input_ids(all_train_data, all_train_padded_data)

In [15]:
# Baseline BiLSTM classifier built on the GloVe embedding matrix, 6 output classes.
# freeze_embed=False is passed, so the embeddings are presumably fine-tuned — confirm in base.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
    freeze_embed=False,
)

In [16]:
# Show the module structure to sanity-check the layer sizes.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)


In [17]:
# Training configuration for the baseline run on the original training split.
# NOTE(review): output_dir / logging_dir start with "~"; whether
# base.get_training_args expands it is not visible here — confirm.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_coarse_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_coarse_embedd",
    lr=1e-3,
    epochs=10,
    batch_size=128,
)

In [18]:
# Reset the random seed(s) via the project helper before building the trainer.
# NOTE(review): the model was constructed before this call, so its weight
# initialisation is presumably not covered by this reset — confirm what
# base.reset_seed seeds and whether it should run before model creation.
base.reset_seed()

In [19]:
# Baseline trainer: original training split, validated on eval_data,
# early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [20]:
# Train the baseline model; the per-epoch metrics table is rendered below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2506,0.857616,0.68561,0.595424,0.583163,0.583347
2,0.6345,0.579901,0.791934,0.673079,0.676358,0.673359
3,0.3921,0.533116,0.812099,0.69027,0.693964,0.688408
4,0.2819,0.514561,0.831347,0.704016,0.710143,0.704868
5,0.1902,0.48651,0.846013,0.850758,0.767319,0.786812
6,0.1224,0.554185,0.854262,0.808573,0.811417,0.808842
7,0.0687,0.559728,0.861595,0.849448,0.815577,0.828978
8,0.0392,0.593935,0.859762,0.830696,0.833512,0.831674
9,0.023,0.655703,0.856095,0.847305,0.810184,0.825159
10,0.0166,0.653781,0.863428,0.853453,0.825502,0.837353


TrainOutput(global_step=350, training_loss=0.30190385052136015, metrics={'train_runtime': 177.481, 'train_samples_per_second': 245.716, 'train_steps_per_second': 1.972, 'total_flos': 0.0, 'train_loss': 0.30190385052136015, 'epoch': 10.0})

In [21]:
# Put the baseline model into evaluation mode before scoring the test split
# (the module contains a Dropout layer, see the repr below).
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [22]:
# Score the baseline model on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 0.42177897691726685,
 'eval_accuracy': 0.886,
 'eval_precision': 0.8668023696075048,
 'eval_recall': 0.8678893594143776,
 'eval_f1': 0.8671094147896995,
 'eval_runtime': 4.1474,
 'eval_samples_per_second': 120.558,
 'eval_steps_per_second': 0.964,
 'epoch': 10.0}

In [23]:
# Persist the trained baseline weights. The target directory is created first so
# the save does not fail on a machine where ~/models/<DATASET> does not exist yet.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_coarse_embedd.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Student network for the distillation run; same architecture and
# hyper-parameters as the baseline classifier.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
    freeze_embed=False,
)

In [None]:
# Training configuration for the distillation run on the original training split.
# NOTE(review): remove_unused_columns=False presumably keeps the extra dataset
# columns (e.g. teacher logits) available to the trainer; lambda_param and temp
# look like the distillation weight and temperature — confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_coarse_embedd",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse_embedd",
    lr=1e-3,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [None]:
# Reset the random seed(s) before the distillation run.
# NOTE(review): student_model was created before this call — confirm whether its
# initialisation should be covered by the reset as well.
base.reset_seed()

In [None]:
# Distillation trainer for the student on the original training split.
# No teacher model is passed here; the teacher signal presumably comes from
# columns stored in the datasets — confirm in base.DistilTrainer.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [28]:
# Train the distilled student. The reported losses are on a different scale from
# the baseline run — presumably because DistilTrainer uses a combined
# distillation loss; confirm in base before comparing loss values across runs.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8707,2.693134,0.64528,0.579706,0.544875,0.54336
2,2.0196,1.672698,0.782768,0.675972,0.666575,0.668807
3,1.2215,1.364676,0.820348,0.696495,0.699867,0.695209
4,0.8592,1.295775,0.829514,0.707543,0.707515,0.704878
5,0.6459,1.093244,0.853346,0.712827,0.728843,0.720308
6,0.4598,1.026738,0.858845,0.719117,0.732024,0.724823
7,0.337,0.988894,0.868928,0.72946,0.739841,0.733685
8,0.2709,0.971776,0.868011,0.729009,0.738607,0.732946
9,0.2248,0.962097,0.857012,0.718957,0.730273,0.724298
10,0.1991,0.967958,0.862511,0.807061,0.742943,0.745741


TrainOutput(global_step=350, training_loss=1.0108567987169539, metrics={'train_runtime': 114.2892, 'train_samples_per_second': 381.576, 'train_steps_per_second': 3.062, 'total_flos': 0.0, 'train_loss': 1.0108567987169539, 'epoch': 10.0})

In [29]:
# Put the distilled student into evaluation mode before scoring the test split.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [30]:
# Score the distilled student on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 0.8290835022926331,
 'eval_accuracy': 0.904,
 'eval_precision': 0.9163667144077592,
 'eval_recall': 0.7812834211073395,
 'eval_f1': 0.7891125931238729,
 'eval_runtime': 4.808,
 'eval_samples_per_second': 103.993,
 'eval_steps_per_second': 0.832,
 'epoch': 10.0}

In [31]:
# Persist the distilled student weights. The target directory is created first so
# the save does not fail on a machine where ~/models/<DATASET> does not exist yet.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_coarse_embedd.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(student_model.state_dict(), checkpoint_path)

In [32]:
# Fresh baseline classifier for the run on the augmented training split.
# This rebinds `model`; the previously trained baseline is no longer reachable
# under that name.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
    freeze_embed=False,
)

In [33]:
# Training configuration for the baseline run on the augmented training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_coarse_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_coarse_embedd",
    lr=1e-3,
    epochs=10,
    batch_size=128,
)

In [None]:
# Reset the random seed(s) before the augmented baseline run.
base.reset_seed()

In [35]:
# Baseline trainer on the augmented training split; early-stopping patience is 4
# here (the non-augmented runs use 3).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [36]:
# Train the baseline on the augmented split; the per-epoch metrics table is rendered below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4418,0.515758,0.857929,0.853537,0.80478,0.820779
2,0.0768,0.607188,0.860678,0.847326,0.82888,0.833521
3,0.0315,0.669516,0.860678,0.851934,0.825322,0.835144
4,0.0178,0.806528,0.861595,0.873287,0.816259,0.83562
5,0.0117,0.788131,0.870761,0.877039,0.816121,0.834535
6,0.007,0.777795,0.875344,0.885985,0.836196,0.854757
7,0.0043,0.8838,0.879927,0.887174,0.831248,0.85024
8,0.0025,0.878168,0.875344,0.871533,0.827301,0.843697
9,0.0014,0.926619,0.87626,0.870804,0.828641,0.843832
10,0.0009,0.939382,0.873511,0.868124,0.826261,0.841308


TrainOutput(global_step=3050, training_loss=0.059561726998110284, metrics={'train_runtime': 254.7526, 'train_samples_per_second': 1527.678, 'train_steps_per_second': 11.972, 'total_flos': 0.0, 'train_loss': 0.059561726998110284, 'epoch': 10.0})

In [37]:
# Put the augmented-data baseline into evaluation mode before scoring the test split.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [38]:
# Score the augmented-data baseline on the held-out test split; returns the metrics dict shown below.
trainer.evaluate(test_data)

{'eval_loss': 0.5025595426559448,
 'eval_accuracy': 0.904,
 'eval_precision': 0.8864450161514458,
 'eval_recall': 0.9036840961210855,
 'eval_f1': 0.8939905382735702,
 'eval_runtime': 4.6607,
 'eval_samples_per_second': 107.28,
 'eval_steps_per_second': 0.858,
 'epoch': 10.0}

In [39]:
# Persist the augmented-data baseline weights. The target directory is created
# first so the save does not fail where ~/models/<DATASET> does not exist yet.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_coarse_embedd.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Fresh student network for the distillation run on the augmented training split.
# This rebinds `student_model`; the earlier student is no longer reachable under that name.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
    freeze_embed=False,
)

In [None]:
# Training configuration for the distillation run on the augmented training split
# (same lambda_param / temp as the non-augmented distillation run).
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_coarse_embedd",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_coarse_embedd",
    lr=1e-3,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [None]:
# Reset the random seed(s) before the augmented distillation run.
base.reset_seed()

In [43]:
# Distillation trainer for the student on the augmented training split;
# early-stopping patience is 4, matching the augmented baseline run.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [44]:
# Train the distilled student on the augmented split; the per-epoch metrics table is rendered below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1943,0.94947,0.874427,0.84132,0.766095,0.769762
2,0.2454,0.718364,0.887259,0.893595,0.848022,0.864694
3,0.1537,0.71262,0.892759,0.897511,0.85331,0.868519
4,0.1207,0.687321,0.891842,0.899605,0.86067,0.875963
5,0.1043,0.673547,0.893676,0.902083,0.862065,0.878082
6,0.0917,0.641931,0.895509,0.892825,0.86263,0.875504
7,0.0837,0.643228,0.899175,0.90612,0.866004,0.882135
8,0.077,0.648706,0.896425,0.891919,0.864335,0.875793
9,0.0722,0.637627,0.899175,0.906073,0.85692,0.875758
10,0.0679,0.634152,0.899175,0.89592,0.866022,0.878733


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--recall/11f90e583db35601050aed380d48e83202a896976b9608432fba9244fb447f24 (last modified on Fri Jan 10 23:14:00 2025) since it couldn't be found locally at evaluate-metric--recall, or remotely on the Hugging Face Hub.


TrainOutput(global_step=3050, training_loss=0.22109305397408907, metrics={'train_runtime': 433.5096, 'train_samples_per_second': 897.743, 'train_steps_per_second': 7.036, 'total_flos': 0.0, 'train_loss': 0.22109305397408907, 'epoch': 10.0})

In [45]:
# Put the augmented-data student into evaluation mode before scoring the test split.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [46]:
# Score the augmented-data student on the held-out test split; returns the metrics dict shown below.
# NOTE(review): the recorded eval_runtime (~48 s) is roughly ten times the other
# runs — possibly related to the metric-module cache fallback logged during
# training; confirm before comparing throughput numbers.
trainer.evaluate(test_data)

{'eval_loss': 0.4838023781776428,
 'eval_accuracy': 0.936,
 'eval_precision': 0.9450377120484253,
 'eval_recall': 0.9102791552025528,
 'eval_f1': 0.9246358480157321,
 'eval_runtime': 47.7663,
 'eval_samples_per_second': 10.468,
 'eval_steps_per_second': 0.084,
 'epoch': 10.0}

In [47]:
# Persist the distilled student trained on the augmented split.
# The state dict must come from `student_model` — the network handed to the
# DistilTrainer for this run. `model` is the base-aug baseline, so saving it here
# would store the baseline weights under the distill-aug file name.
# The target directory is created first so the save does not fail where
# ~/models/<DATASET> does not exist yet.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_coarse_embedd.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(student_model.state_dict(), checkpoint_path)