In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, BertForSequenceClassification, BertTokenizer
import kagglehub
import torch
import base

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "dbpedia"

In [83]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented")

all_data = concatenate_datasets([load_from_disk(file) for file in [f"~/data/{DATASET}/eval-logits", f"~/data/{DATASET}/test-logits", f"~/data/{DATASET}/train-logits-augmented"]])
tokenizer = BasicTokenizer(do_lower_case=True)
teacher_tokenizer = BertTokenizer.from_pretrained("fabriceyhc/bert-base-uncased-dbpedia_14")

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), train_data))
eval_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), eval_data))
test_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), test_data))

all_train_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_train_data))

all_data_tokens = list(map(lambda e: tokenizer.tokenize(e["sentence"]), all_data))

In [7]:
vocab = base.get_vocab(all_data_tokens)

In [8]:
word_index = dict(zip(vocab, range(len(vocab))))

In [9]:
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [10]:
print(len(vocab))
num_tokens = len(vocab) + 2
embedding_dim = 300

691158


In [11]:
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 212978 words (478180) misses


In [12]:
train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),train_data_tokens))
eval_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),eval_data_tokens))
test_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),test_data_tokens))

all_train_data_index = list(map(lambda x: list(map(lambda y: word_index[y], x)),all_train_data_tokens))

In [13]:
train_padded_data = list(map(lambda x: base.padd(x,60), train_data_index))
eval_padded_data = list(map(lambda x: base.padd(x,60), eval_data_index))
test_padded_data = list(map(lambda x: base.padd(x,60), test_data_index))

all_train_padded_data = list(map(lambda x: base.padd(x,60), all_train_data_index))

In [14]:
train_teacher_data = base.prepare_dataset_teacher(train_data, teacher_tokenizer)
eval_teacher_data = base.prepare_dataset_teacher(eval_data, teacher_tokenizer)
test_teacher_data = base.prepare_dataset_teacher(test_data, teacher_tokenizer)

all_train_teacher_data = base.prepare_dataset_teacher(all_train_data, teacher_tokenizer)

Tokenizing the provided dataset:   0%|          | 0/448000 [00:00<?, ? examples/s]

Tokenizing the provided dataset:   0%|          | 0/112000 [00:00<?, ? examples/s]

Tokenizing the provided dataset:   0%|          | 0/70000 [00:00<?, ? examples/s]

Tokenizing the provided dataset:   0%|          | 0/879371 [00:00<?, ? examples/s]

In [84]:
train_data = train_data.add_column("input_ids", train_padded_data)
train_data = train_data.add_column("teacher_ids", train_teacher_data[0])
train_data = train_data.add_column("teacher_attention", train_teacher_data[1])

eval_data = eval_data.add_column("input_ids", eval_padded_data)
eval_data = eval_data.add_column("teacher_ids", eval_teacher_data[0])
eval_data = eval_data.add_column("teacher_attention", eval_teacher_data[1])

test_data = test_data.add_column("input_ids", test_padded_data)
test_data = test_data.add_column("teacher_ids", test_teacher_data[0])
test_data = test_data.add_column("teacher_attention", test_teacher_data[1])

all_train_data = all_train_data.add_column("input_ids", all_train_padded_data)
all_train_data = all_train_data.add_column("teacher_ids", all_train_teacher_data[0])
all_train_data = all_train_data.add_column("teacher_attention", all_train_teacher_data[1])

In [85]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)

In [86]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [87]:
base.reset_seed()

In [88]:
train_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [89]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [90]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1695,0.21404,0.974527,0.974705,0.974527,0.974542
2,0.2172,0.150281,0.980589,0.980584,0.980589,0.980559
3,0.1733,0.130726,0.982509,0.982523,0.982509,0.982496
4,0.1512,0.116111,0.983812,0.983797,0.983812,0.9838
5,0.1389,0.111401,0.984241,0.984229,0.984241,0.984229


TrainOutput(global_step=17500, training_loss=0.3700160487583705, metrics={'train_runtime': 293.9027, 'train_samples_per_second': 7621.57, 'train_steps_per_second': 59.544, 'total_flos': 0.0, 'train_loss': 0.3700160487583705, 'epoch': 5.0})

In [91]:
base.reset_seed()

In [92]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)
teacher_model = BertForSequenceClassification.from_pretrained("fabriceyhc/bert-base-uncased-dbpedia_14", num_labels=14)
teacher_model.to(device)
teacher_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [93]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [94]:
base.reset_seed()

In [95]:
train_data.reset_format()
eval_data.reset_format()   

In [96]:
trainer = base.DistilTrainerInferText(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [97]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1076,0.236866,0.973,0.973454,0.973,0.973019
2,0.217,0.155023,0.979732,0.979809,0.979732,0.979715
3,0.1721,0.133987,0.981812,0.981825,0.981813,0.98179
4,0.1508,0.116564,0.983429,0.983411,0.983429,0.983401
5,0.1384,0.11118,0.984071,0.984061,0.984071,0.984059


TrainOutput(global_step=17500, training_loss=0.35717176339285717, metrics={'train_runtime': 1559.0717, 'train_samples_per_second': 1436.752, 'train_steps_per_second': 11.225, 'total_flos': 0.0, 'train_loss': 0.35717176339285717, 'epoch': 5.0})

In [98]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)

In [30]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [31]:
base.reset_seed()

In [32]:
all_train_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "logits", "labels"], device="cpu")

In [33]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset= all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9213,0.177431,0.977866,0.977863,0.977866,0.977817
2,0.2513,0.12712,0.983143,0.983132,0.983143,0.98313
3,0.2027,0.111748,0.984464,0.984477,0.984464,0.984461
4,0.1787,0.104118,0.985384,0.985384,0.985384,0.985381
5,0.1646,0.100004,0.98558,0.985561,0.98558,0.985566


TrainOutput(global_step=34355, training_loss=0.3437155339242832, metrics={'train_runtime': 458.1501, 'train_samples_per_second': 9596.976, 'train_steps_per_second': 74.986, 'total_flos': 0.0, 'train_loss': 0.3437155339242832, 'epoch': 5.0})

In [35]:
student_model = base.BiLSTMClassifier(embedding_matrix=embedding_matrix, embedding_dim=embedding_dim, fc_dim=400, hidden_dim=300, output_dim=14)

In [36]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [37]:
base.reset_seed()

In [38]:
all_train_data.reset_format()
eval_data.reset_format()   

In [39]:
trainer = base.DistilTrainerInferText(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9213,0.177431,0.977866,0.977863,0.977866,0.977817
2,0.2515,0.126002,0.983134,0.983116,0.983134,0.983116
3,0.2024,0.111495,0.984509,0.984514,0.984509,0.984501
4,0.1786,0.104205,0.985295,0.985294,0.985295,0.985291
5,0.1646,0.099913,0.98558,0.985563,0.98558,0.985567


TrainOutput(global_step=34355, training_loss=0.3436608607792261, metrics={'train_runtime': 2700.8682, 'train_samples_per_second': 1627.941, 'train_steps_per_second': 12.72, 'total_flos': 0.0, 'train_loss': 0.3436608607792261, 'epoch': 5.0})

In [99]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [100]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [101]:
base.reset_seed()

In [102]:
train_data = train_data.remove_columns(["input_ids"])
train_data = train_data.rename_column("teacher_attention", "attention_mask")
train_data = train_data.rename_column("teacher_ids", "input_ids")

eval_data = eval_data.remove_columns(["input_ids"])
eval_data = eval_data.rename_column("teacher_attention", "attention_mask")
eval_data = eval_data.rename_column("teacher_ids", "input_ids")

train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")
eval_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")

In [45]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [46]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3826,0.121406,0.984563,0.984585,0.984563,0.984566
2,0.122,0.083456,0.985964,0.985951,0.985964,0.98595
3,0.0979,0.074269,0.986875,0.986876,0.986875,0.986872
4,0.0879,0.070338,0.987125,0.987126,0.987125,0.987122
5,0.0822,0.069853,0.987384,0.987385,0.987384,0.987379


TrainOutput(global_step=17500, training_loss=0.35451332135881697, metrics={'train_runtime': 299.2541, 'train_samples_per_second': 7485.278, 'train_steps_per_second': 58.479, 'total_flos': 334751155200000.0, 'train_loss': 0.35451332135881697, 'epoch': 5.0})

In [47]:
base.reset_seed()

In [48]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [104]:
trainer = base.DistilTrainerInfer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [105]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3598,0.119925,0.984321,0.984337,0.984321,0.984316
2,0.1215,0.081011,0.986214,0.986202,0.986214,0.986205
3,0.098,0.074563,0.986563,0.986559,0.986563,0.986556
4,0.0876,0.070689,0.986938,0.986931,0.986937,0.986933
5,0.0827,0.069735,0.987054,0.987052,0.987054,0.98705


TrainOutput(global_step=17500, training_loss=0.3499100428989955, metrics={'train_runtime': 1528.1119, 'train_samples_per_second': 1465.861, 'train_steps_per_second': 11.452, 'total_flos': 334751155200000.0, 'train_loss': 0.3499100428989955, 'epoch': 5.0})

In [52]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine")

In [54]:
base.reset_seed()

In [55]:
all_train_data = all_train_data.remove_columns(["input_ids"])
all_train_data = all_train_data.rename_column("teacher_attention", "attention_mask")
all_train_data = all_train_data.rename_column("teacher_ids", "input_ids")

all_train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "logits", "labels"], device="cpu")

In [56]:
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [57]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8301,0.081244,0.986161,0.986155,0.986161,0.986148
2,0.132,0.069424,0.986955,0.986953,0.986955,0.986946
3,0.11,0.06517,0.987554,0.987547,0.987554,0.987546
4,0.0987,0.062519,0.987946,0.987944,0.987946,0.987941
5,0.0929,0.062191,0.98783,0.98783,0.98783,0.987825


TrainOutput(global_step=34355, training_loss=0.25274641503067696, metrics={'train_runtime': 491.0509, 'train_samples_per_second': 8953.97, 'train_steps_per_second': 69.962, 'total_flos': 657076915400400.0, 'train_loss': 0.25274641503067696, 'epoch': 5.0})

In [58]:
base.reset_seed()

In [59]:
student_model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=14)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bilstm-distill_fine_infer", remove_unused_columns=False, logging_dir=f"~/logs/{DATASET}/bilstm-distill_fine_infer")

In [61]:
trainer = base.DistilTrainerInfer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    compute_metrics=base.compute_metrics
)

In [62]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8136,0.081995,0.986125,0.986117,0.986125,0.986115
2,0.1315,0.069635,0.987268,0.987268,0.987268,0.987261
3,0.1094,0.066111,0.987616,0.987614,0.987616,0.987609
4,0.0988,0.063145,0.987902,0.987906,0.987902,0.9879
5,0.0932,0.06266,0.987839,0.987839,0.987839,0.987835


TrainOutput(global_step=34355, training_loss=0.2493221464505485, metrics={'train_runtime': 2655.1507, 'train_samples_per_second': 1655.972, 'train_steps_per_second': 12.939, 'total_flos': 657076915400400.0, 'train_loss': 0.2493221464505485, 'epoch': 5.0})