In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer, BertForSequenceClassification, AutoConfig, BertTokenizer
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import copy
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "dbpedia"

In [4]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits", f"~/data/{DATASET}/test-logits", f"~/data/{DATASET}/train-logits-augmented"]])
tokenizer = BasicTokenizer(do_lower_case=True)
teacher_tokenizer = BertTokenizer.from_pretrained("fabriceyhc/bert-base-uncased-dbpedia_14")

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA H100 PCIe


In [6]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [7]:
vocab = base.get_vocab(all_data_tokens)

In [8]:
word_index = dict(zip(vocab, range(len(vocab))))

In [9]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

691158


In [11]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 212978 words (478180) misses


In [12]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [13]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [None]:
train_teacher_data = base.prepare_dataset_teacher(train_data, teacher_tokenizer)
eval_teacher_data = base.prepare_dataset_teacher(eval_data, teacher_tokenizer)
test_teacher_data = base.prepare_dataset_teacher(test_data, teacher_tokenizer)

all_train_teacher_data = base.prepare_dataset_teacher(all_train_data, teacher_tokenizer)

Tokenizing the provided dataset:   0%|          | 0/448000 [00:00<?, ? examples/s]

Tokenizing the provided dataset:   0%|          | 0/112000 [00:00<?, ? examples/s]

Tokenizing the provided dataset:   0%|          | 0/70000 [00:00<?, ? examples/s]

Tokenizing the provided dataset:   0%|          | 0/879354 [00:00<?, ? examples/s]

In [14]:
train_data = train_data.add_column("input_ids", train_padded_data)
train_data = train_data.add_column("teacher_ids", train_teacher_data[0])
train_data = train_data.add_column("teacher_attention", train_teacher_data[1])

eval_data = eval_data.add_column("input_ids", eval_padded_data)
eval_data = eval_data.add_column("teacher_ids", eval_teacher_data[0])
eval_data = eval_data.add_column("teacher_attention", eval_teacher_data[1])

test_data = test_data.add_column("input_ids", test_padded_data)
test_data = test_data.add_column("teacher_ids", test_teacher_data[0])
test_data = test_data.add_column("teacher_attention", test_teacher_data[1])

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)
all_train_data = all_train_data.add_column("teacher_ids", all_train_teacher_data[0])
all_train_data = all_train_data.add_column("teacher_attention", all_train_teacher_data[1])

In [15]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)

In [16]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [17]:
base.reset_seed()

In [None]:
train_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [18]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2151,0.145761,0.980804,0.980867,0.980804,0.980806
2,0.166,0.122987,0.983009,0.983013,0.983009,0.982992
3,0.144,0.112328,0.984286,0.984293,0.984286,0.98427
4,0.1294,0.102112,0.985143,0.985151,0.985143,0.985144
5,0.1205,0.098072,0.985482,0.985468,0.985482,0.98547


TrainOutput(global_step=17500, training_loss=0.15500399518694197, metrics={'train_runtime': 304.3155, 'train_samples_per_second': 7360.782, 'train_steps_per_second': 57.506, 'total_flos': 0.0, 'train_loss': 0.15500399518694197, 'epoch': 5.0})

In [20]:
base.reset_seed()

In [21]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)
teacher_model = BertForSequenceClassification.from_pretrained("fabriceyhc/bert-base-uncased-dbpedia_14", num_labels=14)
teacher_model.to(device)
teacher_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [23]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [24]:
base.reset_seed()

In [None]:
train_data.reset_format()
eval_data.reset_format()   

In [27]:
trainer = base.DistilTrainerInferText(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0,0.208814,0.974768,0.97486,0.974768,0.974726
2,0.2079,0.140011,0.981768,0.981741,0.981768,0.981743
3,0.1658,0.128577,0.982536,0.982567,0.982536,0.982532
4,0.1443,0.112397,0.98425,0.984231,0.98425,0.984234
5,0.1325,0.107218,0.984705,0.984697,0.984705,0.984697


TrainOutput(global_step=17500, training_loss=0.3300956996372768, metrics={'train_runtime': 476.6838, 'train_samples_per_second': 4699.132, 'train_steps_per_second': 36.712, 'total_flos': 0.0, 'train_loss': 0.3300956996372768, 'epoch': 5.0})

In [None]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)

In [None]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [None]:
base.reset_seed()

In [None]:
all_train_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [None]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset= all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8523,0.162907,0.9795,0.979482,0.9795,0.979473
2,0.2423,0.122056,0.983402,0.983365,0.983402,0.983372
3,0.1967,0.108043,0.98475,0.984772,0.98475,0.984751
4,0.1738,0.098927,0.985777,0.985766,0.985777,0.985759
5,0.1599,0.095644,0.986125,0.986107,0.986125,0.986113


TrainOutput(global_step=34350, training_loss=0.32499586702260735, metrics={'train_runtime': 492.0092, 'train_samples_per_second': 8936.357, 'train_steps_per_second': 69.816, 'total_flos': 0.0, 'train_loss': 0.32499586702260735, 'epoch': 5.0})

In [None]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)

In [None]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [None]:
base.reset_seed()

In [None]:
all_train_data.reset_format()
eval_data.reset_format()   

In [None]:
trainer = base.DistilTrainerInferText(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [None]:
trainer.train()

TrainOutput(global_step=34350, training_loss=0.30965063369950874, metrics={'train_runtime': 816.649, 'train_samples_per_second': 5383.917, 'train_steps_per_second': 42.062, 'total_flos': 0.0, 'train_loss': 0.30965063369950874, 'epoch': 5.0})

In [None]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [None]:
base.reset_seed()

In [None]:
train_data = train_data.remove_columns(["input_ids"])
train_data = train_data.rename_column("teacher_attention", "attention_mask")
train_data = train_data.rename_column("teacher_ids", "input_ids")

eval_data = eval_data.remove_columns(["input_ids"])
eval_data = eval_data.rename_column("teacher_attention", "attention_mask")
eval_data = eval_data.rename_column("teacher_ids", "input_ids")

train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")

In [None]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.381,0.122621,0.98425,0.984286,0.98425,0.98425
2,0.1211,0.083241,0.985929,0.985924,0.985929,0.985919
3,0.0973,0.074958,0.98675,0.986754,0.98675,0.986744
4,0.0878,0.070525,0.98717,0.987171,0.98717,0.987165
5,0.0824,0.070192,0.987134,0.987131,0.987134,0.987129


TrainOutput(global_step=17500, training_loss=0.3539050258091518, metrics={'train_runtime': 331.4675, 'train_samples_per_second': 6757.826, 'train_steps_per_second': 52.796, 'total_flos': 334751155200000.0, 'train_loss': 0.3539050258091518, 'epoch': 5.0})

In [None]:
base.reset_seed()

In [None]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [None]:
trainer = base.DistilTrainerInfer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3603,0.120334,0.984259,0.984272,0.984259,0.984252
2,0.121,0.08213,0.986089,0.986088,0.986089,0.986085
3,0.0977,0.074357,0.986804,0.986815,0.986804,0.986805
4,0.0881,0.071145,0.987116,0.987122,0.987116,0.987114
5,0.0828,0.070289,0.987232,0.987236,0.987232,0.987231


TrainOutput(global_step=17500, training_loss=0.3499939069475446, metrics={'train_runtime': 510.6873, 'train_samples_per_second': 4386.245, 'train_steps_per_second': 34.268, 'total_flos': 334751155200000.0, 'train_loss': 0.3499939069475446, 'epoch': 5.0})

In [None]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [None]:
base.reset_seed()

In [None]:
all_train_data = all_train_data.remove_columns(["input_ids"])
all_train_data = all_train_data.rename_column("teacher_attention", "attention_mask")
all_train_data = all_train_data.rename_column("teacher_ids", "input_ids")

all_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")

In [None]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8293,0.080949,0.98583,0.985824,0.98583,0.985821
2,0.132,0.070329,0.986902,0.986902,0.986902,0.986891
3,0.1099,0.06539,0.987563,0.987555,0.987562,0.987553
4,0.0987,0.06313,0.987696,0.987695,0.987696,0.98769
5,0.093,0.062406,0.987902,0.987898,0.987902,0.987896


TrainOutput(global_step=34350, training_loss=0.2525936831900473, metrics={'train_runtime': 575.7125, 'train_samples_per_second': 7637.093, 'train_steps_per_second': 59.665, 'total_flos': 657064212789600.0, 'train_loss': 0.2525936831900473, 'epoch': 5.0})

In [None]:
base.reset_seed()

In [None]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [None]:
trainer = base.DistilTrainerInfer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8123,0.08245,0.985705,0.985699,0.985705,0.985696
2,0.1314,0.071979,0.986554,0.98656,0.986554,0.986544
3,0.1099,0.066306,0.987437,0.987444,0.987438,0.987433
4,0.0985,0.063854,0.987652,0.987654,0.987652,0.987647
5,0.0932,0.063007,0.987812,0.987813,0.987812,0.987809


TrainOutput(global_step=34350, training_loss=0.24902768064274017, metrics={'train_runtime': 842.7655, 'train_samples_per_second': 5217.074, 'train_steps_per_second': 40.759, 'total_flos': 657064212789600.0, 'train_loss': 0.24902768064274017, 'epoch': 5.0})