In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the GloVe dataset through kagglehub. The returned value is a local
# directory path (printed below) that is later used to locate the vectors file.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
# Path of the GloVe text file inside the directory returned by kagglehub.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset name; it is interpolated into the data/, results/, logs/ and models/ paths.
DATASET = "trec"

In [4]:
# All splits for this dataset live under one directory.
DATA_ROOT = f"~/data/{DATASET}"

# Plain train / validation / test splits.
# NOTE(review): the "-logits" suffix suggests each row also stores teacher logits
# used for distillation — confirm against the dataset schema.
train_data = load_from_disk(f"{DATA_ROOT}/train-logits_coarse")
eval_data = load_from_disk(f"{DATA_ROOT}/eval-logits_coarse")
test_data = load_from_disk(f"{DATA_ROOT}/test-logits_coarse")

# Augmented training split, used by the "-aug" experiments.
all_train_data = load_from_disk(f"{DATA_ROOT}/train-logits-augmented_coarse")

# Corpus the vocabulary is built from: eval + test + augmented train.
# The already-loaded datasets are reused instead of reading the same three
# directories from disk a second time.
# NOTE(review): the plain train split is not part of this corpus; the later
# word_index lookups on its tokens only succeed if the augmented split covers it.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing tokenizer applied to every sentence.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise,
# and report which one was selected.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
def tokenize_sentences(dataset):
    """Tokenize the "sentence" field of every row in `dataset`, keeping row order."""
    return [tokenizer.tokenize(row["sentence"]) for row in dataset]


# One token list per example, for each split.
train_data_tokens = tokenize_sentences(train_data)
eval_data_tokens = tokenize_sentences(eval_data)
test_data_tokens = tokenize_sentences(test_data)

all_train_data_tokens = tokenize_sentences(all_train_data)

# Tokens of the combined corpus; this is what the vocabulary is built from.
all_data_tokens = tokenize_sentences(all_data)

In [7]:
# Build the vocabulary from the tokens of the combined corpus (all_data, which is
# eval + test + augmented train). The exact container type is defined by
# base.get_vocab; below it is used with len() and iterated.
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map each vocabulary entry to its position in `vocab` (positions start at 0).
word_index = {word: position for position, word in enumerate(vocab)}

In [9]:
# Read the pretrained GloVe vectors from GLOVE_FILE (the helper reports how many
# vectors it found). Presumably a word -> vector lookup; confirm in
# base.get_embeddings_indeces.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
# Number of rows of the embedding table: the vocabulary plus two extra rows.
# NOTE(review): the two extra slots are presumably reserved for padding / unknown
# tokens — confirm how base.get_embedding_matrix and base.padd use them.
num_tokens = len(vocab) + 2
# Width of each embedding vector; matches the 300-d GloVe file selected above.
embedding_dim = 300

8766


In [11]:
# Build the embedding weights for the model from the GloVe vectors, sized by
# num_tokens and embedding_dim; the helper prints how many vocabulary words were
# converted and how many had no pretrained vector ("misses").
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
def to_indices(token_lists):
    """Replace every token by its id in `word_index`.

    A token that is missing from `word_index` raises KeyError.
    """
    return [[word_index[token] for token in tokens] for tokens in token_lists]


# Integer-encoded sentences for each split.
train_data_index = to_indices(train_data_tokens)
eval_data_index = to_indices(eval_data_tokens)
test_data_index = to_indices(test_data_tokens)

all_train_data_index = to_indices(all_train_data_tokens)

In [13]:
# Length argument handed to base.padd for every example. It was previously the
# literal 60 repeated on each line; naming it keeps the four splits in sync.
# NOTE(review): base.padd presumably pads (and possibly truncates) to this
# length — confirm in base.
MAX_SEQ_LEN = 60

train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in train_data_index]
eval_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in eval_data_index]
test_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in all_train_data_index]

In [14]:
def attach_input_ids(dataset, padded_ids):
    """Return `dataset` with an "input_ids" column holding `padded_ids`.

    The datasets are rebound to the same names, so re-running this cell would
    otherwise call `add_column` on a dataset that already has "input_ids" and
    fail on the duplicate column name. Dropping an existing column first makes
    the cell safe to re-run.
    """
    if "input_ids" in dataset.column_names:
        dataset = dataset.remove_columns("input_ids")
    return dataset.add_column("input_ids", padded_ids)


# Attach the padded id sequences as the model input column of each split.
train_data = attach_input_ids(train_data, train_padded_data)
eval_data = attach_input_ids(eval_data, eval_padded_data)
test_data = attach_input_ids(test_data, test_padded_data)

all_train_data = attach_input_ids(all_train_data, all_train_padded_data)

In [15]:
# Experiment 1 (baseline, plain train split): BiLSTM classifier initialised from
# the GloVe embedding matrix, with a 6-way output layer.
# NOTE(review): the model is built before base.reset_seed() is called further
# down, so its random weight initialisation may not be covered by that seed.
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [16]:
# Print the layer layout to sanity-check the configured sizes.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)


In [17]:
# Training configuration for the baseline run on the plain train split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_coarse",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_coarse",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [18]:
# Reset the random seed(s) right before building the trainer for this run.
# NOTE(review): `model` was created in an earlier cell, before this call; if
# reset_seed is meant to make weight initialisation reproducible, it must run
# before the model is constructed — confirm what base.reset_seed covers.
base.reset_seed()

In [19]:
# Baseline run: standard Trainer on the plain train split, validated on eval_data.
# The early-stopping callback is configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [20]:
# Train the baseline model; the returned TrainOutput is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.246,0.846388,0.698442,0.610574,0.59316,0.597197
2,0.693,0.614969,0.779102,0.660829,0.665606,0.662733
3,0.4994,0.584964,0.7956,0.677893,0.681942,0.671973
4,0.4223,0.570457,0.807516,0.696038,0.68907,0.688467
5,0.3664,0.519411,0.824015,0.693268,0.70488,0.698033
6,0.299,0.510217,0.83593,0.86462,0.723499,0.723825
7,0.227,0.518191,0.831347,0.813933,0.756563,0.770139
8,0.1884,0.509452,0.84143,0.841872,0.799367,0.815206
9,0.1534,0.543658,0.831347,0.829477,0.771924,0.790803
10,0.1256,0.539504,0.833181,0.838077,0.789803,0.808582


TrainOutput(global_step=350, training_loss=0.4220496640886579, metrics={'train_runtime': 74.5041, 'train_samples_per_second': 585.337, 'train_steps_per_second': 4.698, 'total_flos': 0.0, 'train_loss': 0.4220496640886579, 'epoch': 10.0})

In [21]:
# Switch the module to evaluation mode (this disables the Dropout layer listed in
# the model summary). The call returns the module, whose repr is displayed.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [22]:
# Score the baseline model on the test split; the result is a dict of eval_* metrics.
trainer.evaluate(test_data)

{'eval_loss': 0.26853400468826294,
 'eval_accuracy': 0.908,
 'eval_precision': 0.902279473701105,
 'eval_recall': 0.8847339910282831,
 'eval_f1': 0.8915560578488799,
 'eval_runtime': 4.7147,
 'eval_samples_per_second': 106.051,
 'eval_steps_per_second': 0.848,
 'epoch': 10.0}

In [23]:
# Persist only the weights (state_dict) of the baseline model.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_coarse.pth"
# torch.save does not create missing parent directories; create the folder first
# so a finished training run is not lost to a missing-path error.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [24]:
# Experiment 2 (distillation, plain train split): a fresh BiLSTM with the same
# configuration as the baseline, to be trained by base.DistilTrainer.
# NOTE(review): the model is built before base.reset_seed() is called further
# down, so its random weight initialisation may not be covered by that seed.
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [25]:
# Training configuration for the distillation run on the plain train split.
# remove_unused_columns=False keeps every dataset column available to the trainer
# (presumably so the stored teacher logits reach the loss — confirm in base).
# lambda_param and temp are distillation settings interpreted by base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_coarse",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse",
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [26]:
# Reset the random seed(s) right before building the trainer for this run.
# NOTE(review): `student_model` was created in an earlier cell, before this call;
# if reset_seed is meant to make weight initialisation reproducible, it must run
# before the model is constructed — confirm what base.reset_seed covers.
base.reset_seed()

In [27]:
# Distillation run on the plain train split: base.DistilTrainer receives the
# model through its `student_model` argument. No teacher model is passed here.
# The early-stopping callback is configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [28]:
# Train the distilled student; the returned TrainOutput is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9121,2.85321,0.635197,0.549034,0.536916,0.535826
2,2.2396,1.897054,0.759853,0.652839,0.646407,0.647711
3,1.5714,1.595143,0.792851,0.669355,0.678792,0.671043
4,1.2551,1.456243,0.815765,0.696535,0.695518,0.693243
5,1.0928,1.379534,0.822181,0.686632,0.704398,0.694558
6,0.9472,1.324449,0.821265,0.68624,0.705162,0.693458
7,0.7785,1.264702,0.827681,0.691225,0.708355,0.6988
8,0.6881,1.200637,0.839597,0.70428,0.717625,0.710522
9,0.6244,1.201634,0.836847,0.70323,0.714991,0.708736
10,0.5566,1.194637,0.843263,0.708686,0.719579,0.713868


TrainOutput(global_step=350, training_loss=1.3665848323277066, metrics={'train_runtime': 103.6091, 'train_samples_per_second': 420.909, 'train_steps_per_second': 3.378, 'total_flos': 0.0, 'train_loss': 1.3665848323277066, 'epoch': 10.0})

In [29]:
# Switch the student to evaluation mode (this disables the Dropout layer listed in
# the model summary). The call returns the module, whose repr is displayed.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [30]:
# Score the distilled student on the test split; the result is a dict of eval_* metrics.
trainer.evaluate(test_data)

{'eval_loss': 0.7435383200645447,
 'eval_accuracy': 0.912,
 'eval_precision': 0.7643928362599205,
 'eval_recall': 0.7710233665958136,
 'eval_f1': 0.7668866586081791,
 'eval_runtime': 3.712,
 'eval_samples_per_second': 134.699,
 'eval_steps_per_second': 1.078,
 'epoch': 10.0}

In [31]:
# Persist only the weights (state_dict) of the distilled student.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_coarse.pth"
# torch.save does not create missing parent directories; create the folder first
# so a finished training run is not lost to a missing-path error.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(student_model.state_dict(), checkpoint_path)

In [32]:
# Experiment 3 (baseline, augmented train split): a fresh BiLSTM that replaces the
# earlier `model` binding; same configuration as the other runs.
# NOTE(review): the model is built before base.reset_seed() is called further
# down, so its random weight initialisation may not be covered by that seed.
model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [33]:
# Training configuration for the baseline run on the augmented train split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_coarse",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [34]:
# Reset the random seed(s) right before building the trainer for this run.
# NOTE(review): `model` was created in an earlier cell, before this call; if
# reset_seed is meant to make weight initialisation reproducible, it must run
# before the model is constructed — confirm what base.reset_seed covers.
base.reset_seed()

In [35]:
# Baseline run on the augmented train split, validated on eval_data.
# The early-stopping callback is configured with a patience of 4 here
# (the runs on the plain train split use 3).
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [36]:
# Train the augmented-data baseline; the returned TrainOutput is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5694,0.493772,0.843263,0.85409,0.783481,0.80313
2,0.2017,0.504589,0.869844,0.861142,0.831558,0.843828
3,0.1003,0.574397,0.863428,0.857808,0.836195,0.844196
4,0.0561,0.685708,0.857012,0.870567,0.823639,0.839423
5,0.0292,0.743264,0.868011,0.854185,0.820964,0.83397
6,0.018,0.816032,0.866178,0.865469,0.828484,0.843224
7,0.0104,0.817278,0.868928,0.868481,0.831244,0.84604
8,0.005,0.903768,0.868928,0.867548,0.831212,0.845613
9,0.0023,0.951181,0.864345,0.861195,0.818557,0.834224
10,0.0011,0.973914,0.867094,0.863839,0.820522,0.836652


TrainOutput(global_step=3050, training_loss=0.09932779459679714, metrics={'train_runtime': 255.8929, 'train_samples_per_second': 1520.871, 'train_steps_per_second': 11.919, 'total_flos': 0.0, 'train_loss': 0.09932779459679714, 'epoch': 10.0})

In [37]:
# Switch the module to evaluation mode (this disables the Dropout layer listed in
# the model summary). The call returns the module, whose repr is displayed.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [38]:
# Score the augmented-data baseline on the test split; the result is a dict of eval_* metrics.
trainer.evaluate(test_data)

{'eval_loss': 0.3563491702079773,
 'eval_accuracy': 0.918,
 'eval_precision': 0.9325982090687973,
 'eval_recall': 0.8943902921739459,
 'eval_f1': 0.9104356943313895,
 'eval_runtime': 3.8063,
 'eval_samples_per_second': 131.36,
 'eval_steps_per_second': 1.051,
 'epoch': 10.0}

In [39]:
# Persist only the weights (state_dict) of the augmented-data baseline.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_coarse.pth"
# torch.save does not create missing parent directories; create the folder first
# so a finished training run is not lost to a missing-path error.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [40]:
# Experiment 4 (distillation, augmented train split): a fresh BiLSTM that replaces
# the earlier `student_model` binding; same configuration as the other runs.
# NOTE(review): the model is built before base.reset_seed() is called further
# down, so its random weight initialisation may not be covered by that seed.
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [41]:
# Training configuration for the distillation run on the augmented train split.
# remove_unused_columns=False keeps every dataset column available to the trainer
# (presumably so the stored teacher logits reach the loss — confirm in base).
# lambda_param and temp are distillation settings interpreted by base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_coarse",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_coarse",
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [42]:
# Reset the random seed(s) right before building the trainer for this run.
# NOTE(review): `student_model` was created in an earlier cell, before this call;
# if reset_seed is meant to make weight initialisation reproducible, it must run
# before the model is constructed — confirm what base.reset_seed covers.
base.reset_seed()

In [43]:
# Distillation run on the augmented train split: base.DistilTrainer receives the
# model through its `student_model` argument. No teacher model is passed here.
# The early-stopping callback is configured with a patience of 4.
early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [44]:
# Train the student on the augmented split; the returned TrainOutput is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5076,1.110688,0.851512,0.714904,0.726449,0.720364
2,0.5204,0.949013,0.860678,0.870576,0.817388,0.834654
3,0.2931,0.802245,0.877177,0.888141,0.838818,0.857485
4,0.2025,0.777252,0.882676,0.891843,0.842722,0.86125
5,0.1544,0.755722,0.890009,0.898726,0.856719,0.873812
6,0.1273,0.722156,0.886343,0.898055,0.853564,0.87168
7,0.1113,0.699527,0.890926,0.902004,0.857295,0.875529
8,0.0997,0.697561,0.890009,0.900507,0.856857,0.87465
9,0.0923,0.692361,0.893676,0.90295,0.85951,0.877275
10,0.0868,0.689877,0.893676,0.902606,0.859757,0.877219


TrainOutput(global_step=3050, training_loss=0.3195460322645844, metrics={'train_runtime': 315.7299, 'train_samples_per_second': 1232.636, 'train_steps_per_second': 9.66, 'total_flos': 0.0, 'train_loss': 0.3195460322645844, 'epoch': 10.0})

In [45]:
# Switch the student to evaluation mode (this disables the Dropout layer listed in
# the model summary). The call returns the module, whose repr is displayed.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [46]:
# Score the augmented-data student on the test split; the result is a dict of eval_* metrics.
trainer.evaluate(test_data)

Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Fri Jan 10 23:14:01 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.


{'eval_loss': 0.3294939696788788,
 'eval_accuracy': 0.956,
 'eval_precision': 0.9646380806832182,
 'eval_recall': 0.9263001312577118,
 'eval_f1': 0.9428458023192788,
 'eval_runtime': 111.1032,
 'eval_samples_per_second': 4.5,
 'eval_steps_per_second': 0.036,
 'epoch': 10.0}

In [47]:
# Persist only the weights (state_dict) of the student distilled on the augmented split.
# Fix: this cell used to save `model` — the augmented *baseline* — under the
# distilled-student filename, so the checkpoint did not contain the network that
# was just trained and evaluated. The student's weights are saved instead.
checkpoint_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_coarse.pth"
# torch.save does not create missing parent directories; create the folder first
# so a finished training run is not lost to a missing-path error.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(student_model.state_dict(), checkpoint_path)