In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer, BertForSequenceClassification, AutoConfig, BertTokenizer
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import copy
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "trec"

In [4]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_coarse")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_coarse")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_coarse")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_coarse")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits_coarse", f"~/data/{DATASET}/test-logits_coarse", f"~/data/{DATASET}/train-logits-augmented_coarse"]])
tokenizer = BasicTokenizer(do_lower_case=True)
teacher_tokenizer = BertTokenizer.from_pretrained("carrassi-ni/bert-base-trec-question-classification")

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA H100 PCIe


In [6]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [7]:
vocab = base.get_vocab(all_data_tokens)

In [8]:
word_index = dict(zip(vocab, range(len(vocab))))

In [9]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

8766


In [11]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [13]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [14]:
train_teacher_data = base.prepare_dataset_teacher(train_data, teacher_tokenizer)
eval_teacher_data = base.prepare_dataset_teacher(eval_data, teacher_tokenizer)
test_teacher_data = base.prepare_dataset_teacher(test_data, teacher_tokenizer)

all_train_teacher_data = base.prepare_dataset_teacher(all_train_data, teacher_tokenizer)

In [15]:
train_data = train_data.add_column("input_ids", train_padded_data)
train_data = train_data.add_column("teacher_ids", train_teacher_data[0])
train_data = train_data.add_column("teacher_attention", train_teacher_data[1])

eval_data = eval_data.add_column("input_ids", eval_padded_data)
eval_data = eval_data.add_column("teacher_ids", eval_teacher_data[0])
eval_data = eval_data.add_column("teacher_attention", eval_teacher_data[1])

test_data = test_data.add_column("input_ids", test_padded_data)
test_data = test_data.add_column("teacher_ids", test_teacher_data[0])
test_data = test_data.add_column("teacher_attention", test_teacher_data[1])

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)
all_train_data = all_train_data.add_column("teacher_ids", all_train_teacher_data[0])
all_train_data = all_train_data.add_column("teacher_attention", all_train_teacher_data[1])

In [16]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [17]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse")

In [18]:
base.reset_seed()

In [19]:
train_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [20]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9648,3.862671,0.355637,0.149173,0.260871,0.176619
2,3.8087,3.715521,0.36022,0.144238,0.264381,0.177362
3,3.7106,3.654732,0.366636,0.316966,0.269417,0.191795
4,3.6647,3.612571,0.425298,0.285923,0.31531,0.258715
5,3.6266,3.595595,0.427131,0.281216,0.316818,0.262028


TrainOutput(global_step=175, training_loss=3.7550796944754463, metrics={'train_runtime': 25.2282, 'train_samples_per_second': 864.311, 'train_steps_per_second': 6.937, 'total_flos': 0.0, 'train_loss': 3.7550796944754463, 'epoch': 5.0})

In [22]:
base.reset_seed()

In [23]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)
teacher_model = BertForSequenceClassification.from_pretrained("carrassi-ni/bert-base-trec-question-classification", num_labels=6)
teacher_model.to(device)
teacher_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [24]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse_infer")

In [25]:
base.reset_seed()

In [26]:
train_data.reset_format()
eval_data.reset_format()   

In [27]:
trainer = base.DistilTrainerInferText(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9283,3.823031,0.307058,0.204172,0.224517,0.152854
2,3.7811,3.703598,0.343721,0.346648,0.251907,0.179733
3,3.7053,3.64718,0.419798,0.281584,0.311493,0.262607
4,3.6501,3.601635,0.451879,0.246299,0.337299,0.278132
5,3.6185,3.583049,0.453712,0.250104,0.339161,0.28174


TrainOutput(global_step=175, training_loss=3.736647469656808, metrics={'train_runtime': 25.3822, 'train_samples_per_second': 859.066, 'train_steps_per_second': 6.895, 'total_flos': 0.0, 'train_loss': 3.736647469656808, 'epoch': 5.0})

In [29]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [30]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse")

In [31]:
base.reset_seed()

In [32]:
all_train_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [33]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset= all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.057,2.575615,0.589368,0.538035,0.489129,0.491729
2,2.1153,2.060866,0.662695,0.554828,0.565544,0.554615
3,1.7407,1.825887,0.710357,0.603763,0.604536,0.602153
4,1.5536,1.715739,0.72594,0.615198,0.619013,0.615878
5,1.4755,1.685099,0.736022,0.621906,0.628333,0.62395


TrainOutput(global_step=1525, training_loss=1.9884228115394467, metrics={'train_runtime': 39.2084, 'train_samples_per_second': 4963.219, 'train_steps_per_second': 38.895, 'total_flos': 0.0, 'train_loss': 1.9884228115394467, 'epoch': 5.0})

In [35]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=6)

In [36]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse_infer")

In [37]:
base.reset_seed()

In [38]:
all_train_data.reset_format()
eval_data.reset_format()   

In [39]:
trainer = base.DistilTrainerInferText(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0568,2.578793,0.583868,0.538872,0.483194,0.487143
2,2.1186,2.064631,0.660862,0.553841,0.5641,0.553762
3,1.7463,1.830782,0.708524,0.60262,0.602858,0.600749
4,1.5583,1.720387,0.727773,0.616949,0.620347,0.617436
5,1.4795,1.689142,0.736939,0.622542,0.628647,0.62455


TrainOutput(global_step=1525, training_loss=1.9919109407018443, metrics={'train_runtime': 49.5141, 'train_samples_per_second': 3930.194, 'train_steps_per_second': 30.799, 'total_flos': 0.0, 'train_loss': 1.9919109407018443, 'epoch': 5.0})

In [63]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse")

In [43]:
train_data = train_data.remove_columns(["input_ids"])
train_data = train_data.rename_column("teacher_attention", "attention_mask")
train_data = train_data.rename_column("teacher_ids", "input_ids")

eval_data = eval_data.remove_columns(["input_ids"])
eval_data = eval_data.rename_column("teacher_attention", "attention_mask")
eval_data = eval_data.rename_column("teacher_ids", "input_ids")

train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")

In [65]:
base.reset_seed()

In [66]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [67]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9382,3.837287,0.27681,0.107154,0.202435,0.124772
2,3.8325,3.752981,0.336389,0.213873,0.247184,0.164454
3,3.7558,3.657221,0.395967,0.365622,0.295355,0.240001
4,3.6803,3.593544,0.436297,0.392907,0.33537,0.304363
5,3.6336,3.571911,0.446379,0.393985,0.346483,0.317883


TrainOutput(global_step=175, training_loss=3.7680834524972098, metrics={'train_runtime': 24.892, 'train_samples_per_second': 875.986, 'train_steps_per_second': 7.03, 'total_flos': 3250492282800.0, 'train_loss': 3.7680834524972098, 'epoch': 5.0})

In [47]:
base.reset_seed()

In [48]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse_infer")

In [50]:
trainer = base.DistilTrainerInfer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [51]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9432,3.819921,0.286893,0.321482,0.212421,0.142836
2,3.7978,3.674825,0.460128,0.381106,0.35635,0.328353
3,3.684,3.559548,0.483043,0.392773,0.378551,0.35851
4,3.5957,3.497603,0.487626,0.544527,0.387375,0.371732
5,3.5515,3.478526,0.491292,0.535266,0.392566,0.381309


TrainOutput(global_step=175, training_loss=3.7144506399972097, metrics={'train_runtime': 25.9137, 'train_samples_per_second': 841.445, 'train_steps_per_second': 6.753, 'total_flos': 3250492282800.0, 'train_loss': 3.7144506399972097, 'epoch': 5.0})

In [52]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse")

In [54]:
base.reset_seed()

In [55]:
all_train_data = all_train_data.remove_columns(["input_ids"])
all_train_data = all_train_data.rename_column("teacher_attention", "attention_mask")
all_train_data = all_train_data.rename_column("teacher_ids", "input_ids")

all_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")

In [56]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [57]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.2653,2.972102,0.555454,0.517429,0.445443,0.438753
2,2.3138,2.22392,0.722273,0.627722,0.613703,0.617018
3,1.7236,1.878752,0.76352,0.656433,0.648971,0.651068
4,1.4187,1.723236,0.781852,0.666621,0.665619,0.665197
5,1.2782,1.682092,0.782768,0.665374,0.666529,0.665358


TrainOutput(global_step=1525, training_loss=1.9999078269082993, metrics={'train_runtime': 39.4718, 'train_samples_per_second': 4930.104, 'train_steps_per_second': 38.635, 'total_flos': 29009208816000.0, 'train_loss': 1.9999078269082993, 'epoch': 5.0})

In [58]:
base.reset_seed()

In [59]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_coarse_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_coarse_infer")

In [61]:
trainer = base.DistilTrainerInfer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [62]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.213,2.877418,0.598533,0.620367,0.491311,0.510665
2,2.2231,2.116112,0.72319,0.626496,0.619247,0.62139
3,1.6429,1.799599,0.762603,0.657653,0.652383,0.653547
4,1.3278,1.65799,0.767186,0.664202,0.657342,0.657944
5,1.1914,1.612124,0.771769,0.662429,0.661861,0.661051


TrainOutput(global_step=1525, training_loss=1.9196197809938524, metrics={'train_runtime': 52.2044, 'train_samples_per_second': 3727.657, 'train_steps_per_second': 29.212, 'total_flos': 29009208816000.0, 'train_loss': 1.9196197809938524, 'epoch': 5.0})