In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Download the GloVe dataset through kagglehub; the call returns the local
# directory of the downloaded dataset version, which is printed for reference.
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
# Path of the GloVe text file inside the downloaded dataset directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
# Dataset key; interpolated into the data, results, logs and models paths below.
DATASET = "trec"

In [4]:
# Load the pre-built splits from disk. The "-logits" suffix presumably means
# the examples already carry teacher logits for distillation -- TODO confirm.
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_fine")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_fine")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_fine")

# Augmented training split, used by the "-aug" experiments.
all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_fine")

# Combined corpus (eval + test + augmented train, in that order) used only to
# build the vocabulary. Reuse the datasets loaded above instead of reading the
# same three directories from disk a second time.
# NOTE(review): `train_data` is not part of this corpus; the later
# `word_index[...]` lookups assume every train token also occurs here
# (presumably because the augmented split contains the original one) -- verify.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing word-level tokenizer shared by all splits.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick the compute device once, then report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# Tokenize the "sentence" field of every example in each split.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

# Augmented training split.
all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]

# Combined corpus, used for vocabulary construction.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [7]:
# Build the vocabulary from the tokens of the combined corpus.
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map each vocabulary entry to its 0-based position in `vocab`.
word_index = {word: position for position, word in enumerate(vocab)}

In [9]:
# Read the pre-trained GloVe vectors from GLOVE_FILE (the helper prints how
# many word vectors it found).
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
# Number of rows in the embedding matrix: vocabulary size plus two extra rows.
# NOTE(review): the +2 presumably reserves rows for padding/unknown tokens,
# but `word_index` is 0-based -- confirm that the pad id used by `base.padd`
# cannot collide with a real word index.
num_tokens = len(vocab) + 2
# Dimensionality of the GloVe vectors (matches the 300d file in GLOVE_FILE).
embedding_dim = 300

8766


In [11]:
# Build the (num_tokens x embedding_dim) initial embedding matrix from the
# GloVe vectors; the helper prints how many vocabulary words were found/missed.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token by its vocabulary index (raises KeyError on unknown tokens).
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

# Augmented training split.
all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Run every index sequence through base.padd with a size argument of 60
# (presumably the fixed sequence length fed to the model -- confirm in base).
train_padded_data = [base.padd(sequence, 60) for sequence in train_data_index]
eval_padded_data = [base.padd(sequence, 60) for sequence in eval_data_index]
test_padded_data = [base.padd(sequence, 60) for sequence in test_data_index]

# Augmented training split.
all_train_padded_data = [base.padd(sequence, 60) for sequence in all_train_data_index]

In [14]:
# Attach the padded index sequences to each split as an "input_ids" column.
# NOTE(review): this rebinds the dataset names, so the cell is not idempotent;
# re-running it without re-running the loading cell will likely fail because
# the "input_ids" column already exists -- confirm against the datasets docs.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

# Augmented training split.
all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)

In [15]:
# Baseline BiLSTM classifier: GloVe-initialised embeddings that are fine-tuned
# (freeze_embed=False) and 50 output classes.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [16]:
# Show the layer structure of the freshly built model.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)


In [17]:
# Training configuration for the baseline run on the original training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base_fine_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base_fine_embedd",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [18]:
# Reset the random seed(s) right before training so runs are comparable
# (presumably seeds torch/numpy/random -- see base.reset_seed).
base.reset_seed()

In [19]:
# Baseline trainer: original training split, validation on eval_data,
# early stopping with a patience of 3 evaluations.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_data,
    train_dataset=train_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [20]:
# Train the baseline model; the returned TrainOutput is displayed as cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0829,2.47853,0.392301,0.057929,0.08469,0.058044
2,2.1135,1.892381,0.540788,0.141427,0.154633,0.13153
3,1.6469,1.545151,0.615949,0.248878,0.219639,0.211352
4,1.2446,1.377138,0.657195,0.31211,0.27287,0.267682
5,0.9587,1.242039,0.68561,0.404154,0.335245,0.340474
6,0.7201,1.177933,0.708524,0.437544,0.380006,0.387522
7,0.546,1.186077,0.710357,0.445521,0.398908,0.410581
8,0.4469,1.164008,0.721357,0.439881,0.422036,0.425925
9,0.3493,1.183514,0.722273,0.473095,0.433533,0.440763
10,0.2904,1.179597,0.72594,0.469336,0.434259,0.441378


TrainOutput(global_step=350, training_loss=1.1399329621451242, metrics={'train_runtime': 83.2939, 'train_samples_per_second': 523.568, 'train_steps_per_second': 4.202, 'total_flos': 0.0, 'train_loss': 1.1399329621451242, 'epoch': 10.0})

In [21]:
# Switch the model to evaluation mode (disables dropout); eval() returns the
# module itself, so its repr is displayed as the cell output.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [22]:
# Evaluate the baseline on the held-out test split; returns a dict of eval_* metrics.
trainer.evaluate(test_data)

{'eval_loss': 1.0430619716644287,
 'eval_accuracy': 0.742,
 'eval_precision': 0.5333965099104782,
 'eval_recall': 0.5098265118205139,
 'eval_f1': 0.4844533228344757,
 'eval_runtime': 4.857,
 'eval_samples_per_second': 102.944,
 'eval_steps_per_second': 0.824,
 'epoch': 10.0}

In [23]:
# Persist the baseline weights (state_dict only).
# torch.save does not create missing parent directories, so make sure the
# target directory exists first; otherwise the trained weights would be lost.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base_fine_embedd.pth"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model.state_dict(), save_path)

In [24]:
# Student network for distillation; same architecture and hyper-parameters as
# the baseline so the two runs are directly comparable.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [25]:
# Training configuration for the distillation run on the original split.
# remove_unused_columns=False keeps the extra dataset columns in each batch
# (presumably so DistilTrainer can read the stored teacher logits -- confirm).
# lambda_param / temp are presumably the distillation loss weight and the
# softmax temperature -- confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_fine_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_embedd",
    remove_unused_columns=False,
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.4,
    temp=2,
)

In [26]:
# Reset the random seed(s) so the distillation run starts from the same RNG
# state as the baseline run (presumably -- see base.reset_seed).
base.reset_seed()

In [27]:
# Distillation trainer on the original training split, validated on eval_data,
# early stopping with a patience of 3 evaluations.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    eval_dataset=eval_data,
    train_dataset=train_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [28]:
# Train the student model; the returned TrainOutput is displayed as cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7431,2.220762,0.371219,0.0441,0.076876,0.051174
2,1.9043,1.68253,0.505041,0.113401,0.131711,0.106919
3,1.5154,1.423556,0.573786,0.173883,0.170543,0.150719
4,1.2342,1.235636,0.650779,0.30547,0.241903,0.228783
5,1.0164,1.115374,0.679193,0.29121,0.283516,0.275814
6,0.8348,1.039087,0.702108,0.38633,0.319737,0.319049
7,0.7027,1.012158,0.708524,0.379321,0.335861,0.339772
8,0.61,0.962038,0.72319,0.372811,0.355939,0.355823
9,0.5365,0.957035,0.725023,0.417339,0.373162,0.378029
10,0.4942,0.95537,0.732356,0.417877,0.384735,0.386927


TrainOutput(global_step=350, training_loss=1.1591509355817522, metrics={'train_runtime': 108.2008, 'train_samples_per_second': 403.047, 'train_steps_per_second': 3.235, 'total_flos': 0.0, 'train_loss': 1.1591509355817522, 'epoch': 10.0})

In [29]:
# Switch the student to evaluation mode (disables dropout); the returned
# module repr is displayed as the cell output.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [30]:
# Evaluate the distilled student on the held-out test split; returns a dict of eval_* metrics.
trainer.evaluate(test_data)

{'eval_loss': 0.9259228110313416,
 'eval_accuracy': 0.73,
 'eval_precision': 0.3508543292541387,
 'eval_recall': 0.4107860004183365,
 'eval_f1': 0.3584356517688107,
 'eval_runtime': 4.3087,
 'eval_samples_per_second': 116.043,
 'eval_steps_per_second': 0.928,
 'epoch': 10.0}

In [31]:
# Persist the distilled student's weights (state_dict only).
# torch.save does not create missing parent directories, so make sure the
# target directory exists first; otherwise the trained weights would be lost.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill_fine_embedd.pth"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(student_model.state_dict(), save_path)

In [32]:
# Fresh baseline network for the augmented-data run (rebinds `model`, so the
# previously trained baseline is only available from its saved state_dict).
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [33]:
# Training configuration for the baseline run on the augmented training split.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug_fine_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug_fine_embedd",
    lr=0.001,
    epochs=10,
    batch_size=128,
)

In [34]:
# Reset the random seed(s) before the augmented baseline run
# (presumably seeds torch/numpy/random -- see base.reset_seed).
base.reset_seed()

In [35]:
# Baseline trainer on the augmented training split; validation still uses
# eval_data, early stopping with a patience of 3 evaluations.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_data,
    train_dataset=all_train_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [36]:
# Train the baseline on the augmented split; the returned TrainOutput is
# displayed as cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8719,1.100653,0.779102,0.641217,0.628317,0.618581
2,0.0671,1.242237,0.799267,0.736541,0.661474,0.679083
3,0.019,1.440925,0.783685,0.694763,0.650809,0.655739


KeyboardInterrupt: 

In [None]:
# Switch the augmented-data baseline to evaluation mode (disables dropout);
# the returned module repr is displayed as the cell output.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [None]:
# Evaluate the augmented-data baseline on the held-out test split; returns a
# dict of eval_* metrics.
trainer.evaluate(test_data)

{'eval_loss': 1.1257561445236206,
 'eval_accuracy': 0.826,
 'eval_precision': 0.6777421222188665,
 'eval_recall': 0.7079567236011463,
 'eval_f1': 0.6720153525980083,
 'eval_runtime': 3.6497,
 'eval_samples_per_second': 136.998,
 'eval_steps_per_second': 1.096,
 'epoch': 10.0}

In [None]:
# Persist the augmented-data baseline weights (state_dict only).
# torch.save does not create missing parent directories, so make sure the
# target directory exists first; otherwise the trained weights would be lost.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug_fine_embedd.pth"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model.state_dict(), save_path)

In [None]:
# Fresh student network for distillation on the augmented split (rebinds
# `student_model`; same architecture and hyper-parameters as before).
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
    freeze_embed=False,
)

In [None]:
# Training configuration for the distillation run on the augmented split.
# remove_unused_columns=False keeps the extra dataset columns in each batch
# (presumably so DistilTrainer can read the stored teacher logits -- confirm).
# lambda_param / temp are presumably the distillation loss weight and the
# softmax temperature -- confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug_fine_embedd",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug_fine_embedd",
    remove_unused_columns=False,
    lr=0.001,
    epochs=10,
    batch_size=128,
    lambda_param=0.4,
    temp=2,
)

In [None]:
# Reset the random seed(s) before the augmented distillation run
# (presumably seeds torch/numpy/random -- see base.reset_seed).
base.reset_seed()

In [None]:
# Distillation trainer on the augmented training split, validated on
# eval_data, early stopping with a patience of 3 evaluations.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    eval_dataset=eval_data,
    train_dataset=all_train_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [None]:
# Train the student on the augmented split; the returned TrainOutput is
# displayed as cell output.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Switch the augmented-data student to evaluation mode (disables dropout);
# the returned module repr is displayed as the cell output.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(8768, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=50, bias=True)
)

In [None]:
# Evaluate the augmented-data student on the held-out test split; returns a
# dict of eval_* metrics.
trainer.evaluate(test_data)

{'eval_loss': 0.31038418412208557,
 'eval_accuracy': 0.82,
 'eval_precision': 0.6734959934708443,
 'eval_recall': 0.6293883870941059,
 'eval_f1': 0.6251044469767673,
 'eval_runtime': 3.883,
 'eval_samples_per_second': 128.765,
 'eval_steps_per_second': 1.03,
 'epoch': 10.0}

In [None]:
# Persist the augmented-data *student* weights (state_dict only).
# Fix: the original saved `model` (the augmented baseline) under the
# distill-aug filename, so the distilled student was never written to disk
# and this file silently duplicated the bilstm-base-aug checkpoint.
# torch.save does not create missing parent directories, so make sure the
# target directory exists first.
save_path = f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug_fine_embedd.pth"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(student_model.state_dict(), save_path)