In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer, BertForSequenceClassification, AutoConfig, BertTokenizer
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import copy
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Download the GloVe 6B 300-d vectors through kagglehub and print the local
# directory they were stored in.
glove_handle = "thanakomsn/glove6b300dtxt"
my_glove = kagglehub.dataset_download(glove_handle)
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
# Dataset key interpolated into every data / results / logs path in this notebook.
DATASET = "trec"
# Path of the 300-d GloVe text file inside the kagglehub download directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"

In [93]:
# Folder that holds every split of the selected dataset.
data_root = f"~/data/{DATASET}"

# Splits stored with a "logits" column (selected later via set_format).
train_data = load_from_disk(f"{data_root}/train-logits_fine")
eval_data = load_from_disk(f"{data_root}/eval-logits_fine")
test_data = load_from_disk(f"{data_root}/test-logits_fine")

# Augmented training split used by the "all_train" experiments.
all_train_data = load_from_disk(f"{data_root}/train-logits-augmented_fine")

# Corpus used to build the student vocabulary. The three splits were just
# loaded above, so reuse them (same order as before: eval, test, augmented
# train) instead of reading each one from disk a second time.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Word-level tokenizer for the student input; BERT tokenizer for the teacher input.
tokenizer = BasicTokenizer(do_lower_case=True)
teacher_tokenizer = BertTokenizer.from_pretrained("carrassi-ni/bert-base-trec-question-classification")

In [64]:
# Prefer CUDA when a device is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA H100 PCIe


In [6]:
# Word-level tokenization of the "sentence" field, one token list per row.
train_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in train_data]
eval_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in eval_data]
test_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in test_data]

# Augmented training split.
all_train_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in all_train_data]

# Combined corpus; its tokens feed the vocabulary construction.
all_data_tokens = [tokenizer.tokenize(row["sentence"]) for row in all_data]

In [7]:
# Build the student vocabulary from the tokens of the combined corpus
# (all_data_tokens). The construction itself lives in base.get_vocab and is
# not visible in this notebook.
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map every vocabulary entry to its position in `vocab` (positions start at 0).
# NOTE(review): num_tokens reserves two extra embedding rows - confirm that
# index 0 is not also what base.padd uses as the padding value.
word_index = {word: position for position, word in enumerate(vocab)}

In [9]:
# Read the GloVe file into a lookup structure via the project helper; the
# helper reports how many word vectors it found. Presumably a word -> vector
# mapping - TODO confirm in base.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
# Report the vocabulary size, then fix the dimensions of the embedding table.
print(len(vocab))
embedding_dim = 300  # matches the 300-d GloVe file selected in GLOVE_FILE
num_tokens = 2 + len(vocab)  # two rows more than the vocabulary; presumably padding/OOV slots - TODO confirm in base

8766


In [11]:
# Assemble the embedding matrix handed to the BiLSTM student from the
# vocabulary index and the GloVe lookup. The helper prints how many words were
# converted and how many were missed; how misses are initialised is decided
# inside base.get_embedding_matrix (not visible here).
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 8551 words (215) misses


In [12]:
# Replace every token by its integer id from word_index. A token missing from
# the vocabulary raises KeyError.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

# Augmented training split.
all_train_data_index = [[word_index[token] for token in sentence] for sentence in all_train_data_tokens]

In [13]:
# Run every id sequence through base.padd with a length argument of 60
# (presumably pads/truncates to 60 ids - TODO confirm in base).
train_padded_data = [base.padd(ids, 60) for ids in train_data_index]
eval_padded_data = [base.padd(ids, 60) for ids in eval_data_index]
test_padded_data = [base.padd(ids, 60) for ids in test_data_index]

# Augmented training split.
all_train_padded_data = [base.padd(ids, 60) for ids in all_train_data_index]

In [14]:
# Encode each split for the teacher with the BERT tokenizer. Each result is
# indexed later as [0] -> teacher ids and [1] -> teacher attention mask.
(
    train_teacher_data,
    eval_teacher_data,
    test_teacher_data,
    all_train_teacher_data,
) = [
    base.prepare_dataset_teacher(split, teacher_tokenizer)
    for split in (train_data, eval_data, test_data, all_train_data)
]

In [94]:
def _with_model_inputs(dataset, student_ids, teacher_encoding):
    """Return `dataset` with the student ids and the teacher ids/attention mask added as columns."""
    return (
        dataset.add_column("input_ids", student_ids)
        .add_column("teacher_ids", teacher_encoding[0])
        .add_column("teacher_attention", teacher_encoding[1])
    )


# NOTE: add_column fails if the column already exists, so this cell needs
# freshly loaded splits (re-run the loading cell before re-running this one).
train_data = _with_model_inputs(train_data, train_padded_data, train_teacher_data)
eval_data = _with_model_inputs(eval_data, eval_padded_data, eval_teacher_data)
test_data = _with_model_inputs(test_data, test_padded_data, test_teacher_data)
all_train_data = _with_model_inputs(all_train_data, all_train_padded_data, all_train_teacher_data)

In [16]:
# Experiment 1 student: BiLSTM classifier over the GloVe embedding matrix,
# with a 50-way output.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
)

In [17]:
# Training arguments for the BiLSTM student on the original training split.
# remove_unused_columns=False keeps the extra dataset columns available to the
# custom trainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_fine",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine",
    remove_unused_columns=False,
)

In [18]:
# Re-seed through the project helper before building the trainer (presumably
# for run-to-run reproducibility; the seeding itself lives in base).
# NOTE(review): the student above was constructed before this call, so its
# weight initialisation is not covered by this seed.
base.reset_seed()

In [19]:
# Expose only the student ids, stored logits and labels, as CPU torch tensors.
for _split in (train_data, eval_data):
    _split.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [20]:
# Distillation trainer without a teacher model argument; presumably it learns
# from the stored "logits" column - TODO confirm in base.DistilTrainer.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [21]:
# Train the BiLSTM student on the original split. As the cell's last
# expression, the returned TrainOutput is displayed under the per-epoch table.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4505,2.373377,0.176902,0.003538,0.02,0.006012
2,2.27,2.15912,0.176902,0.003538,0.02,0.006012
3,2.1539,2.117269,0.176902,0.003538,0.02,0.006012
4,2.1178,2.103396,0.176902,0.003538,0.02,0.006012
5,2.1251,2.098362,0.176902,0.003538,0.02,0.006012


TrainOutput(global_step=175, training_loss=2.2234579031808037, metrics={'train_runtime': 22.1668, 'train_samples_per_second': 983.679, 'train_steps_per_second': 7.895, 'total_flos': 0.0, 'train_loss': 2.2234579031808037, 'epoch': 5.0})

In [22]:
# Re-seed through the project helper before the next student is constructed
# (presumably for reproducible initialisation; seeding lives in base).
base.reset_seed()

In [66]:
# Fresh BiLSTM student for the experiment that queries the teacher during training.
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=50)

# The teacher architecture/config comes from this Hub checkpoint. Its
# classification head is resized to 50 labels (hence ignore_mismatched_sizes),
# and the locally saved state dict is loaded on top afterwards.
teacher_checkpoint = "ndavid/autotrain-trec-fine-bert-739422530"
config = AutoConfig.from_pretrained(teacher_checkpoint)
config.max_length = 20  # revert to default to skip warning
config.num_labels = 50
teacher_model = BertForSequenceClassification.from_pretrained(teacher_checkpoint, config=config, ignore_mismatched_sizes=True)

model_path = os.path.join(os.path.expanduser("~"), "models", DATASET, "teacher_fine.pth")
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs; it avoids executing arbitrary pickled code and
# silences the torch.load FutureWarning.
state_dict = torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
teacher_model.load_state_dict(state_dict)

# Move the teacher to the selected device and switch it to inference mode.
teacher_model.to(device)
teacher_model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ndavid/autotrain-trec-fine-bert-739422530 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([47, 768]) in the checkpoint and torch.Size([50, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([47]) in the checkpoint and torch.Size([50]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(model_path, map_location=torch.device('cpu'))


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [67]:
# Training arguments for the BiLSTM student trained with the teacher model
# (original training split).
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer",
    remove_unused_columns=False,
)

In [68]:
# Re-seed through the project helper right before the trainer is built
# (presumably so data order / dropout are reproducible; seeding lives in base).
base.reset_seed()

In [69]:
# Drop the torch/column restriction set earlier so every column is visible
# again to the trainer that receives the teacher model.
for _split in (train_data, eval_data):
    _split.reset_format()

In [70]:
# Distillation trainer that receives the teacher model itself; presumably it
# runs the teacher on each batch instead of reading stored logits - TODO
# confirm in base.DistilTrainerInferText.
trainer = base.DistilTrainerInferText(
    args=training_args,
    student_model=student_model,
    teacher_model=teacher_model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [28]:
# Train the BiLSTM student with the teacher model on the original split; the
# returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1918,2.134382,0.176902,0.003538,0.02,0.006012
2,2.0411,1.933704,0.154904,0.016379,0.032953,0.015893
3,1.9049,1.874159,0.197067,0.007594,0.032456,0.011759
4,1.8728,1.862509,0.191567,0.011374,0.036397,0.01497
5,1.8766,1.85833,0.205316,0.009184,0.036547,0.013548


TrainOutput(global_step=175, training_loss=1.9774344744001116, metrics={'train_runtime': 23.4669, 'train_samples_per_second': 929.182, 'train_steps_per_second': 7.457, 'total_flos': 0.0, 'train_loss': 1.9774344744001116, 'epoch': 5.0})

In [119]:
# Fresh BiLSTM student for the run on the augmented training split.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
)

In [120]:
# Training arguments for the BiLSTM student on the AUGMENTED training split.
# The directories carry an "_augmented" suffix so this run's checkpoints and
# logs do not land in the same folders as the run on the original split,
# which uses ".../bilstm-distill_fine".
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_fine_augmented",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_augmented",
)

In [121]:
# Re-seed through the project helper before building the trainer (presumably
# for reproducibility; seeding lives in base).
# NOTE(review): the student above was constructed before this call.
base.reset_seed()

In [122]:
# Expose only the student ids, stored logits and labels, as CPU torch tensors,
# for the augmented training split and the evaluation split.
for _split in (all_train_data, eval_data):
    _split.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [123]:
# Same trainer class as the first BiLSTM run, now fed the augmented training split.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [124]:
# Train the BiLSTM student on the augmented split; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8222,1.541512,0.411549,0.046698,0.088173,0.058124
2,1.3731,1.335844,0.492209,0.111465,0.124481,0.09973
3,1.2253,1.251113,0.538038,0.142554,0.15819,0.136669
4,1.1473,1.202749,0.550871,0.157393,0.167889,0.147884
5,1.1109,1.188447,0.560953,0.164303,0.174087,0.154892


TrainOutput(global_step=2615, training_loss=1.335760789801924, metrics={'train_runtime': 54.295, 'train_samples_per_second': 6157.47, 'train_steps_per_second': 48.163, 'total_flos': 0.0, 'train_loss': 1.335760789801924, 'epoch': 5.0})

In [125]:
# Re-seed through the project helper before the next student is constructed
# (presumably for reproducible initialisation; seeding lives in base).
base.reset_seed()

In [131]:
# Fresh BiLSTM student for the teacher-model run on the augmented split.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=50,
)

In [132]:
# Training arguments for the BiLSTM student trained with the teacher model on
# the AUGMENTED split. The "_augmented" suffix keeps this run's checkpoints and
# logs apart from the run on the original split, which uses
# ".../bilstm-distill_fine_infer".
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer_augmented",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer_augmented",
)

In [133]:
# Drop the torch/column restriction so every column is visible again to the
# trainer that receives the teacher model.
for _split in (all_train_data, eval_data):
    _split.reset_format()

In [134]:
# Teacher-model trainer, now fed the augmented training split.
trainer = base.DistilTrainerInferText(
    args=training_args,
    student_model=student_model,
    teacher_model=teacher_model,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [135]:
# Train the BiLSTM student with the teacher model on the augmented split; the
# returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6646,1.454043,0.420715,0.058872,0.098306,0.070845
2,1.3378,1.316299,0.482126,0.131999,0.126327,0.106951
3,1.2319,1.257713,0.536205,0.184151,0.164032,0.152147
4,1.1752,1.224481,0.55912,0.201576,0.177484,0.167082
5,1.1493,1.214771,0.566453,0.197191,0.181364,0.170022


TrainOutput(global_step=2615, training_loss=1.3117448445715822, metrics={'train_runtime': 75.3864, 'train_samples_per_second': 4434.75, 'train_steps_per_second': 34.688, 'total_flos': 0.0, 'train_loss': 1.3117448445715822, 'epoch': 5.0})

In [82]:
# Switch the student to a small pretrained BERT (2 layers, hidden size 128,
# 2 heads per the checkpoint name) with a new 50-label classification head.
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=50,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
# Training arguments for the small-BERT student on the original training split.
# The student here is BERT, not the BiLSTM, so the run gets its own
# "bert-tiny-..." directories instead of writing into the BiLSTM run's
# ".../bilstm-distill_fine" folders.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-tiny-distill_fine",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bert-tiny-distill_fine",
)

In [84]:
# Re-seed through the project helper before building the trainer (presumably
# for reproducibility; seeding lives in base).
# NOTE(review): the BERT student (including its newly initialised classifier
# head) was created before this call.
base.reset_seed()

In [85]:
# Switch both splits from the BiLSTM inputs to the BERT inputs: drop the
# word-index ids, then promote the teacher-tokenizer ids/mask to the standard
# "input_ids"/"attention_mask" column names.
# NOTE: not idempotent - a second run fails because the source columns are gone.
train_data = (
    train_data.remove_columns(["input_ids"])
    .rename_column("teacher_attention", "attention_mask")
    .rename_column("teacher_ids", "input_ids")
)

eval_data = (
    eval_data.remove_columns(["input_ids"])
    .rename_column("teacher_attention", "attention_mask")
    .rename_column("teacher_ids", "input_ids")
)

# Expose the BERT inputs, stored logits and labels as CPU torch tensors.
for _split in (train_data, eval_data):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")

In [86]:
# Distillation trainer without a teacher model argument, now with the small
# BERT as student and the BERT-tokenized original split.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [87]:
# Train the small-BERT student on the original split; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4366,2.356579,0.176902,0.003538,0.02,0.006012
2,2.3341,2.28904,0.176902,0.003538,0.02,0.006012
3,2.2813,2.247475,0.176902,0.003538,0.02,0.006012
4,2.246,2.222223,0.176902,0.003538,0.02,0.006012
5,2.2335,2.213092,0.176902,0.003538,0.02,0.006012


TrainOutput(global_step=175, training_loss=2.3062732369559154, metrics={'train_runtime': 23.1155, 'train_samples_per_second': 943.306, 'train_steps_per_second': 7.571, 'total_flos': 3295047747600.0, 'train_loss': 2.3062732369559154, 'epoch': 5.0})

In [88]:
# Re-seed through the project helper before the next BERT student is loaded
# (presumably so the new classifier head is initialised reproducibly).
base.reset_seed()

In [89]:
# Fresh small-BERT student (50 labels) for the teacher-model run on the original split.
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=50,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
# Training arguments for the small-BERT student trained with the teacher model
# (original split). The student is BERT, so the run uses its own "bert-tiny-..."
# directories rather than the BiLSTM run's ".../bilstm-distill_fine_infer".
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-tiny-distill_fine_infer",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bert-tiny-distill_fine_infer",
)

In [91]:
# Trainer that receives the teacher model and the already BERT-tokenized
# split; presumably it runs the teacher on each batch - TODO confirm in
# base.DistilTrainerInfer.
trainer = base.DistilTrainerInfer(
    args=training_args,
    student_model=student_model,
    teacher_model=teacher_model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [92]:
# Train the small-BERT student with the teacher model on the original split;
# the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1505,2.089359,0.051329,0.005879,0.020191,0.003905
2,2.0608,2.023119,0.043996,0.010778,0.019713,0.002667
3,2.0107,1.986471,0.044913,0.015098,0.020518,0.00256
4,1.9809,1.967636,0.041247,0.010808,0.020104,0.001759
5,1.9727,1.961234,0.043996,0.01681,0.020415,0.002366


TrainOutput(global_step=175, training_loss=2.03512202671596, metrics={'train_runtime': 26.4685, 'train_samples_per_second': 823.81, 'train_steps_per_second': 6.612, 'total_flos': 3295047747600.0, 'train_loss': 2.03512202671596, 'epoch': 5.0})

In [52]:
# Fresh small-BERT student (50 labels) for the run on the augmented split.
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=50,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
# Training arguments for the small-BERT student on the AUGMENTED split. The
# student is BERT and the data is augmented, so the run gets its own
# directories instead of the BiLSTM run's ".../bilstm-distill_fine".
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-tiny-distill_fine_augmented",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bert-tiny-distill_fine_augmented",
)

In [54]:
# Re-seed through the project helper before building the trainer (presumably
# for reproducibility; seeding lives in base).
# NOTE(review): the BERT student was created before this call.
base.reset_seed()

In [55]:
# Switch the augmented split to the BERT inputs: drop the word-index ids, then
# promote the teacher-tokenizer ids/mask to "input_ids"/"attention_mask".
# NOTE: not idempotent - a second run fails because the source columns are gone.
all_train_data = (
    all_train_data.remove_columns(["input_ids"])
    .rename_column("teacher_attention", "attention_mask")
    .rename_column("teacher_ids", "input_ids")
)

# Expose the BERT inputs, stored logits and labels as CPU torch tensors.
all_train_data.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "logits", "labels"],
    device="cpu",
)

In [56]:
# Distillation trainer without a teacher model argument: small-BERT student,
# augmented training split.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [57]:
# Train the small-BERT student on the augmented split; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9302,1.662366,0.448213,0.107548,0.117002,0.096661
2,1.3657,1.356978,0.553621,0.181809,0.189507,0.171213
3,1.1053,1.235595,0.580202,0.203831,0.21119,0.18861
4,0.9694,1.181589,0.6022,0.244138,0.23774,0.222202
5,0.9096,1.169394,0.616865,0.264583,0.254345,0.243203


TrainOutput(global_step=2615, training_loss=1.256040718268254, metrics={'train_runtime': 66.2553, 'train_samples_per_second': 5045.937, 'train_steps_per_second': 39.469, 'total_flos': 50520539462400.0, 'train_loss': 1.256040718268254, 'epoch': 5.0})

In [58]:
# Re-seed through the project helper before the next BERT student is loaded
# (presumably so the new classifier head is initialised reproducibly).
base.reset_seed()

In [59]:
# Fresh small-BERT student (50 labels) for the teacher-model run on the augmented split.
student_model = BertForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    num_labels=50,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
# Training arguments for the small-BERT student trained with the teacher model
# on the AUGMENTED split. The run gets its own directories instead of the
# BiLSTM run's ".../bilstm-distill_fine_infer".
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bert-tiny-distill_fine_infer_augmented",
    remove_unused_columns=False,
    logging_dir=f"~/logs/{DATASET}/bert-tiny-distill_fine_infer_augmented",
)

In [61]:
# Teacher-model trainer: small-BERT student, augmented training split.
trainer = base.DistilTrainerInfer(
    args=training_args,
    student_model=student_model,
    teacher_model=teacher_model,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics,
)

In [62]:
# Train the small-BERT student with the teacher model on the augmented split;
# the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7422,1.539939,0.427131,0.133057,0.129187,0.114794
2,1.3121,1.301025,0.562786,0.233558,0.207617,0.197182
3,1.1146,1.216223,0.5967,0.253616,0.238101,0.231129
4,1.0211,1.17909,0.611366,0.246654,0.246109,0.237689
5,0.9806,1.168793,0.613199,0.256938,0.249865,0.241859


TrainOutput(global_step=2615, training_loss=1.2341336135426626, metrics={'train_runtime': 90.0998, 'train_samples_per_second': 3710.553, 'train_steps_per_second': 29.023, 'total_flos': 50520539462400.0, 'train_loss': 1.2341336135426626, 'epoch': 5.0})