In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Fetch the GloVe 6B 300-d dataset through kagglehub. The call returns the
# local directory holding the files (printed below so the location is visible).
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
# Notebook-wide configuration.
# GLOVE_FILE: embedding text file inside the kagglehub download directory.
# DATASET: dataset key interpolated into every data/results/logs/models path below.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "trec"

In [4]:
# Original train / validation / test splits stored on disk.
# (The "-logits" suffix presumably means each row carries teacher logits for
# distillation -- confirm against the script that produced these folders.)
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_coarse")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_coarse")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_coarse")

# Augmented variant of the training split.
all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_coarse")

# Union of every sentence the models will see; used further down to build the
# vocabulary. The already-loaded splits are reused instead of being read from
# disk a second time, which also removes the duplicated path literals.
# NOTE(review): the un-augmented train split is not part of this union, yet its
# tokens are later looked up with word_index[...]; this presumably relies on the
# augmented split containing every original training sentence -- confirm.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing whitespace/punctuation tokenizer shared by all splits.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick the compute device once, then report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# Tokenise the "sentence" field of every row; each result is a list with one
# token list per example, in dataset order.
train_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in train_data]
eval_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in eval_data]
test_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in test_data]

# Augmented training split.
all_train_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in all_train_data]

# Combined corpus (source of the vocabulary).
all_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in all_data]

In [7]:
# Build the vocabulary from the tokens of the combined corpus (all_data_tokens)
# using the project helper.
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Token -> integer id lookup; ids follow the iteration order of `vocab` and
# start at 0.
# NOTE(review): if base.padd pads with 0, that id is shared with the first
# vocabulary entry -- confirm how base.get_vocab / base.padd treat id 0.
word_index = {token: position for position, token in enumerate(vocab)}

In [9]:
# Read the GloVe file through the project helper, which reports how many word
# vectors it found. Presumably returns a word -> vector mapping -- confirm in base.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
# Vocabulary size, shown for reference.
print(len(vocab))
# Row count of the embedding table: two more than the vocabulary.
# NOTE(review): word_index only produces ids 0..len(vocab)-1, so the two extra
# rows are presumably reserved for padding / unknown tokens -- confirm in base.
num_tokens = len(vocab) + 2
# Must match the dimensionality of the GloVe vectors (glove.6B.300d).
embedding_dim = 300

8766


In [11]:
# Assemble the initial embedding weights from the GloVe lookup for every token
# in word_index; the helper prints how many tokens were found vs. missed.
# Presumably shaped (num_tokens, embedding_dim) -- confirm in base.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token by its vocabulary id. A token missing from word_index
# raises KeyError, so the vocabulary must cover all splits encoded here.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

# Augmented training split.
all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Length handed to base.padd for every id sequence. Named once here instead of
# repeating the literal 60 in each call, so all splits are guaranteed to use
# the same value. (Presumably the pad/truncate target length -- confirm in base.)
MAX_SEQ_LEN = 60

# Bring every id sequence to the common length via the project helper.
train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in train_data_index]
eval_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in eval_data_index]
test_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in test_data_index]

# Augmented training split.
all_train_padded_data = [base.padd(ids, MAX_SEQ_LEN) for ids in all_train_data_index]

In [14]:
# Attach the padded id sequences to each split as an "input_ids" column.
# add_column returns a new dataset, so each name is rebound to the result.
# NOTE(review): because the same names are reassigned, running this cell twice
# on the same kernel tries to add "input_ids" again -- re-run the loading cell
# first if this cell needs to be repeated.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [15]:
# Run 1 (baseline, original training split): BiLSTM classifier initialised
# from the GloVe embedding matrix, with six output classes.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
)

In [16]:
# Sanity check: show the layer layout of the freshly built classifier.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)


In [17]:
# Training configuration for the baseline run; results and logs go to
# per-dataset folders named after the run.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_coarse",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_coarse",
    lr=1e-3,
    epochs=10,
    batch_size=128,
)

In [18]:
# Reset the random seed(s) via the project helper right before training.
# NOTE(review): the model was instantiated in an earlier cell, i.e. before this
# call, so its random weight initialisation is not governed by this reseed.
base.reset_seed()

In [19]:
# Stop training once the monitored metric has failed to improve for three
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline trainer: original training split, validated on eval_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [20]:
# Train the baseline model; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2829,0.912075,0.649863,0.593692,0.548132,0.557292
2,0.7181,0.636597,0.780935,0.6593,0.669084,0.662579
3,0.5047,0.576275,0.800183,0.679035,0.684724,0.675586
4,0.418,0.562616,0.815765,0.698941,0.695457,0.693416
5,0.3616,0.526553,0.815765,0.691049,0.696795,0.692138
6,0.3089,0.513737,0.828598,0.816765,0.73766,0.74569
7,0.2283,0.53423,0.824931,0.832556,0.762076,0.778812
8,0.1925,0.508672,0.850596,0.866069,0.825261,0.841737
9,0.1513,0.539941,0.842346,0.859852,0.818203,0.834781
10,0.1207,0.536126,0.84418,0.861858,0.818562,0.836197


TrainOutput(global_step=350, training_loss=0.42870248794555665, metrics={'train_runtime': 104.5592, 'train_samples_per_second': 417.084, 'train_steps_per_second': 3.347, 'total_flos': 0.0, 'train_loss': 0.42870248794555665, 'epoch': 10.0})

In [21]:
# Switch the baseline model to evaluation mode (disables dropout). eval()
# returns the module itself, which is why the cell echoes its repr.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [22]:
# Score the baseline model on the held-out test split; the metrics dict is the
# cell result (keys carry the "eval_" prefix).
trainer.evaluate(test_data)

{'eval_loss': 0.2760658860206604,
 'eval_accuracy': 0.908,
 'eval_precision': 0.9032861049628674,
 'eval_recall': 0.8847890149398184,
 'eval_f1': 0.8930860894509288,
 'eval_runtime': 3.3417,
 'eval_samples_per_second': 149.623,
 'eval_steps_per_second': 1.197,
 'epoch': 10.0}

In [23]:
# Persist the baseline weights (state_dict only) under ~/models/<DATASET>/.
# The target directory must already exist; torch.save does not create it.
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_coarse.pth")

In [24]:
# Run 2 (distillation, original training split): a fresh student with the
# same architecture as the baseline.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
)

In [25]:
# Training configuration for the distillation run.
# remove_unused_columns=False keeps all dataset columns in the batches
# (presumably so the stored teacher logits reach the distillation loss).
# lambda_param / temp are presumably the loss mixing weight and the softmax
# temperature -- confirm in base.DistilTrainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_coarse",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse",
    lr=1e-3,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [26]:
# Reset the random seed(s) before the distillation run so it starts from the
# same seed as the other runs.
# NOTE(review): student_model was already instantiated before this call.
base.reset_seed()

In [27]:
# Stop training once the monitored metric has failed to improve for three
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation trainer: student on the original training split, validated on
# eval_data. Rebinds `trainer`, replacing the baseline trainer.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [28]:
# Train the student with the distillation trainer; the returned TrainOutput is
# shown as the cell result. Loss values are on the distillation-loss scale and
# are not directly comparable with the baseline run's losses.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9202,2.866678,0.644363,0.569422,0.540566,0.544017
2,2.2442,1.874293,0.767186,0.662347,0.653242,0.655423
3,1.5341,1.603264,0.791017,0.668389,0.677059,0.669121
4,1.2421,1.47217,0.809349,0.691329,0.690261,0.687741
5,1.08,1.362312,0.818515,0.683706,0.700937,0.69147
6,0.9268,1.313107,0.835014,0.697408,0.716188,0.705463
7,0.7698,1.240867,0.833181,0.699257,0.712311,0.705225
8,0.6758,1.19628,0.839597,0.703896,0.717324,0.710367
9,0.6111,1.205685,0.836847,0.70196,0.715702,0.708252
10,0.5469,1.194018,0.834097,0.700694,0.71236,0.706146


TrainOutput(global_step=350, training_loss=1.3551088878086635, metrics={'train_runtime': 69.5658, 'train_samples_per_second': 626.888, 'train_steps_per_second': 5.031, 'total_flos': 0.0, 'train_loss': 1.3551088878086635, 'epoch': 10.0})

In [29]:
# Switch the student to evaluation mode (disables dropout); eval() returns the
# module, hence the echoed repr.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [30]:
# Score the distilled student on the held-out test split; the metrics dict is
# the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.7540001273155212,
 'eval_accuracy': 0.906,
 'eval_precision': 0.757071471622322,
 'eval_recall': 0.7617816111635793,
 'eval_f1': 0.758196841110497,
 'eval_runtime': 4.1381,
 'eval_samples_per_second': 120.829,
 'eval_steps_per_second': 0.967,
 'epoch': 10.0}

In [31]:
# Persist the distilled student's weights (state_dict only) under
# ~/models/<DATASET>/. The target directory must already exist.
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_coarse.pth")

In [32]:
# Run 3 (baseline, augmented training split): rebinds `model` to a fresh
# classifier; the run-1 network is no longer referenced by this name.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
)

In [33]:
# Training configuration for the baseline run on augmented data; same
# hyper-parameters as run 1, separate results/logs folders.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_coarse",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_coarse",
    lr=1e-3,
    epochs=10,
    batch_size=128,
)

In [34]:
# Reset the random seed(s) before the augmented baseline run.
# NOTE(review): `model` was already instantiated before this call.
base.reset_seed()

In [35]:
# Stop training once the monitored metric has failed to improve for three
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Baseline trainer on the augmented training split; validation data is the
# same eval_data used by the other runs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [36]:
# Train the baseline on the augmented split; the returned TrainOutput is the
# cell result. Early stopping may end the run before the configured 10 epochs.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5635,0.49119,0.842346,0.856231,0.791902,0.811971
2,0.2006,0.508214,0.877177,0.871364,0.828497,0.844321
3,0.1007,0.570596,0.867094,0.8579,0.838175,0.84635
4,0.056,0.65567,0.868011,0.868462,0.830978,0.845113
5,0.0314,0.717148,0.872594,0.872505,0.832823,0.848256
6,0.0165,0.78784,0.865261,0.862929,0.828469,0.841386
7,0.0102,0.846776,0.868928,0.868299,0.830532,0.845225
8,0.0051,0.931736,0.871677,0.868341,0.833237,0.846759


TrainOutput(global_step=2440, training_loss=0.12299815904898721, metrics={'train_runtime': 164.0093, 'train_samples_per_second': 2372.915, 'train_steps_per_second': 18.597, 'total_flos': 0.0, 'train_loss': 0.12299815904898721, 'epoch': 8.0})

In [37]:
# Switch the augmented-data baseline to evaluation mode (disables dropout);
# eval() returns the module, hence the echoed repr.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [38]:
# Score the augmented-data baseline on the held-out test split; the metrics
# dict is the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.2950372099876404,
 'eval_accuracy': 0.93,
 'eval_precision': 0.9383741752762922,
 'eval_recall': 0.9066378156204484,
 'eval_f1': 0.9199569632344592,
 'eval_runtime': 4.0634,
 'eval_samples_per_second': 123.05,
 'eval_steps_per_second': 0.984,
 'epoch': 8.0}

In [39]:
# Persist the augmented-data baseline weights (state_dict only) under
# ~/models/<DATASET>/. The target directory must already exist.
torch.save(model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_coarse.pth")

In [40]:
# Run 4 (distillation, augmented training split): rebinds `student_model` to a
# fresh classifier with the same architecture as the other runs.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    fc_dim=400,
    hidden_dim=300,
    output_dim=6,
)

In [41]:
# Training configuration for distillation on augmented data; same
# hyper-parameters as run 2, separate results/logs folders.
# remove_unused_columns=False keeps all dataset columns in the batches
# (presumably so the stored teacher logits reach the distillation loss).
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_coarse",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_coarse",
    lr=1e-3,
    epochs=10,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [42]:
# Reset the random seed(s) before the augmented distillation run.
# NOTE(review): student_model was already instantiated before this call.
base.reset_seed()

In [43]:
# Stop training once the monitored metric has failed to improve for three
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Distillation trainer on the augmented training split, validated on
# eval_data. Rebinds `trainer`, replacing the run-3 trainer.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
    callbacks=[early_stopping],
)

In [44]:
# Train the student on the augmented split with the distillation trainer; the
# returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4854,1.126492,0.847846,0.714537,0.724797,0.717829
2,0.5285,0.908019,0.873511,0.882713,0.826952,0.845989
3,0.2913,0.768489,0.88451,0.891994,0.854442,0.868996
4,0.1979,0.788898,0.890009,0.897349,0.858085,0.873594
5,0.1517,0.71616,0.895509,0.892813,0.86114,0.874687
6,0.1254,0.721514,0.896425,0.902672,0.862502,0.878668
7,0.1098,0.71239,0.893676,0.899684,0.860932,0.876211
8,0.0985,0.699308,0.895509,0.902632,0.86177,0.878171
9,0.0907,0.689765,0.899175,0.90534,0.864618,0.881061
10,0.0851,0.688744,0.897342,0.903821,0.862958,0.879486


TrainOutput(global_step=3050, training_loss=0.3164315151777424, metrics={'train_runtime': 293.5936, 'train_samples_per_second': 1325.574, 'train_steps_per_second': 10.389, 'total_flos': 0.0, 'train_loss': 0.3164315151777424, 'epoch': 10.0})

In [45]:
# Switch the augmented-data student to evaluation mode (disables dropout);
# eval() returns the module, hence the echoed repr.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=6, bias=True)
)

In [46]:
# Score the augmented-data student on the held-out test split; the metrics
# dict is the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.30519503355026245,
 'eval_accuracy': 0.956,
 'eval_precision': 0.9643149351288102,
 'eval_recall': 0.9279410678765495,
 'eval_f1': 0.9434906356880663,
 'eval_runtime': 4.102,
 'eval_samples_per_second': 121.891,
 'eval_steps_per_second': 0.975,
 'epoch': 10.0}

In [47]:
# Persist the weights of the student distilled on the augmented split.
# This must be `student_model`: `model` still refers to the run-3 baseline, so
# saving it here would write the baseline's weights under the distill-aug
# filename. The target directory must already exist.
torch.save(student_model.state_dict(), f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_coarse.pth")