In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader
import kagglehub
import torch
import base
import os
import copy

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Download the GloVe dataset through kagglehub and print the local directory
# path that the call returns.
my_glove = kagglehub.dataset_download(
    "thanakomsn/glove6b300dtxt",
)
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [None]:
# Name of the dataset folder used in every data/results/logs/models path below.
DATASET = "dbpedia"

# Path of the 300-d GloVe text file inside the downloaded kagglehub directory.
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"

In [4]:
# Splits saved on disk for this dataset (the "-logits" suffix presumably means
# each example carries teacher logits for distillation -- confirm against the
# export script).
train_data = load_from_disk(f"~/data/{DATASET}/train-logits")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits")

# Augmented training split; only used to build the vocabulary corpus below.
all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented")

# Corpus used for vocabulary construction: eval + test + augmented train, in
# that order. The already-loaded datasets are reused instead of reading the
# same three directories from disk a second time.
all_data = concatenate_datasets([eval_data, test_data, all_train_data])

# Lower-casing tokenizer applied to the "sentence" field of every split.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick CUDA when a GPU is visible, otherwise fall back to the CPU, and report
# which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# Tokenize the "sentence" field of every example, one token list per example.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

# Same tokenization over the combined corpus that feeds the vocabulary.
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

In [7]:
# Build the vocabulary from the tokenized combined corpus (eval + test +
# augmented train). The exact return type of base.get_vocab is not visible here.
vocab = base.get_vocab(all_data_tokens)

In [8]:
# Map each vocabulary entry to its position in `vocab` (indices start at 0).
word_index = {token: position for position, token in enumerate(vocab)}

In [9]:
# Load the pre-trained GloVe vectors from GLOVE_FILE (the helper reports how
# many word vectors it found).
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [None]:
# Width of each embedding vector; matches the 300-d GloVe file.
embedding_dim = 300

# Embedding rows: the vocabulary plus two extra rows
# (presumably reserved for padding / unknown tokens -- TODO confirm in base).
num_tokens = len(vocab) + 2

print(len(vocab))

691158


In [11]:
# Assemble the (num_tokens x embedding_dim) lookup table from the GloVe vectors;
# the helper prints how many vocabulary words were found and how many missed.
embedding_matrix = base.get_embedding_matrix(
    num_tokens,
    embedding_dim,
    word_index,
    embeddings_index,
)

Converted 212978 words (478180) misses


In [12]:
# Replace every token by its vocabulary index. A token missing from
# `word_index` raises KeyError, exactly as a plain dict lookup would.
train_data_index = [[word_index[token] for token in sentence] for sentence in train_data_tokens]
eval_data_index = [[word_index[token] for token in sentence] for sentence in eval_data_tokens]
test_data_index = [[word_index[token] for token in sentence] for sentence in test_data_tokens]

In [13]:
# Bring every index sequence to the fixed model input length of 300 via
# base.padd (whether longer sequences are truncated is not visible here).
train_padded_data = [base.padd(indices, 300) for indices in train_data_index]
eval_padded_data = [base.padd(indices, 300) for indices in eval_data_index]
test_padded_data = [base.padd(indices, 300) for indices in test_data_index]

In [14]:
# Attach the padded index sequences to each split as the "input_ids" column.
# Each name is rebound to the dataset returned by add_column.
# NOTE(review): re-running this cell on the same kernel presumably fails
# because "input_ids" already exists -- confirm.
train_data = train_data.add_column("input_ids", train_padded_data)
eval_data = eval_data.add_column("input_ids", eval_padded_data)
test_data = test_data.add_column("input_ids", test_padded_data)

In [15]:
# Torch-formatted views of the training set exposing only "input_ids", one per
# device, wrapped in single-example, unshuffled loaders for the inference
# benchmarks at the end of the notebook. `with_format` returns a formatted
# copy, so `train_data` itself keeps its current format.
train_data_gpu = train_data.with_format(type="torch", columns=["input_ids"], device="cuda")
gpu_data_loader = DataLoader(train_data_gpu, shuffle=False, batch_size=1)

train_data_cpu = train_data.with_format(type="torch", columns=["input_ids"], device="cpu")
cpu_data_loader = DataLoader(train_data_cpu, shuffle=False, batch_size=1)

In [16]:
# Baseline BiLSTM classifier trained on the full training split:
# GloVe-initialised embeddings left trainable, 14 output classes.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
    freeze_embed=False,
)

In [17]:
# Print the module tree to check the layer shapes before training.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)


In [18]:
# Training configuration for the full-data baseline run.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base",
    logging_dir=f"~/logs/{DATASET}/bilstm-base",
    lr=.005,
    epochs=5,
    batch_size=128,
)

In [19]:
# Reset the random seed(s) through the project helper before training.
# NOTE(review): the model was instantiated in an earlier cell, so its initial
# weights are not covered by this call unless base seeds earlier -- confirm.
base.reset_seed()

In [20]:
# Hugging Face Trainer for the full-data baseline; early stopping uses a
# patience of 2.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=base.compute_metrics,
    train_dataset=train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [21]:
# Train the full-data baseline; the returned TrainOutput is displayed as the
# cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0823,0.054285,0.98583,0.985865,0.98583,0.985823
2,0.0284,0.060822,0.986054,0.986103,0.986054,0.986063
3,0.013,0.073548,0.986045,0.986103,0.986045,0.986065
4,0.0049,0.094535,0.98642,0.986422,0.98642,0.986419
5,0.0012,0.1087,0.986473,0.986459,0.986473,0.986464


TrainOutput(global_step=17500, training_loss=0.025947300883701868, metrics={'train_runtime': 1527.5484, 'train_samples_per_second': 1466.402, 'train_steps_per_second': 11.456, 'total_flos': 0.0, 'train_loss': 0.025947300883701868, 'epoch': 5.0})

In [22]:
# Switch the baseline to evaluation mode; as the last expression of the cell
# the module repr is displayed.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [23]:
# Score the full-data baseline on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 0.1051858589053154,
 'eval_accuracy': 0.9866,
 'eval_precision': 0.9865885454736419,
 'eval_recall': 0.9865999999999999,
 'eval_f1': 0.9865898399470471,
 'eval_runtime': 12.8947,
 'eval_samples_per_second': 5428.604,
 'eval_steps_per_second': 42.421,
 'epoch': 5.0}

In [24]:
# Persist the full-data baseline weights under ~/models/<DATASET>/.
torch.save(
    model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base.pth",
)

In [25]:
# Fresh BiLSTM with the same architecture as the baseline, to be trained with
# distillation on the full training split.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
    freeze_embed=False,
)

In [26]:
# Training configuration for the full-data distillation run. Unused dataset
# columns are kept (remove_unused_columns=False); lambda_param and temp are
# the distillation settings consumed by the project trainer.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill",
    remove_unused_columns=False,
    lr=.005,
    epochs=5,
    batch_size=128,
    lambda_param=.6,
    temp=2.5,
)

In [27]:
# Reset the random seed(s) through the project helper before training.
# NOTE(review): the student was instantiated in an earlier cell, so its initial
# weights are not covered by this call unless base seeds earlier -- confirm.
base.reset_seed()

In [28]:
# Project distillation trainer for the full-data student; early stopping uses
# a patience of 2.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    compute_metrics=base.compute_metrics,
    train_dataset=train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [29]:
# Train the full-data distilled student; the returned TrainOutput is displayed
# as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2103,0.100915,0.987089,0.987089,0.987089,0.987071
2,0.0857,0.084681,0.988295,0.988308,0.988295,0.988293
3,0.0631,0.074932,0.988812,0.988816,0.988813,0.988809
4,0.0502,0.068375,0.989277,0.989284,0.989277,0.989277
5,0.0407,0.064906,0.989232,0.98924,0.989232,0.989234


TrainOutput(global_step=17500, training_loss=0.09000687081473215, metrics={'train_runtime': 1531.5056, 'train_samples_per_second': 1462.613, 'train_steps_per_second': 11.427, 'total_flos': 0.0, 'train_loss': 0.09000687081473215, 'epoch': 5.0})

In [30]:
# Switch the distilled student to evaluation mode; as the last expression of
# the cell the module repr is displayed.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [31]:
# Score the full-data distilled student on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 0.06774262338876724,
 'eval_accuracy': 0.9895714285714285,
 'eval_precision': 0.9895783297205203,
 'eval_recall': 0.9895714285714288,
 'eval_f1': 0.9895709137053126,
 'eval_runtime': 12.8318,
 'eval_samples_per_second': 5455.18,
 'eval_steps_per_second': 42.628,
 'epoch': 5.0}

In [32]:
# Persist the full-data distilled student weights under ~/models/<DATASET>/.
torch.save(
    student_model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill.pth",
)

In [33]:
# Low-data experiments: draw a label-stratified split and keep only its 10%
# "test" part as the new training set.
# NOTE: `train_data` is rebound here, so every later cell trains on the
# subset, and re-running this cell splits the already reduced set again.
data = train_data.train_test_split(
    test_size=0.1,
    stratify_by_column="labels",
    seed=42,
)
train_data = data["test"]

In [34]:
# Fresh baseline BiLSTM (same architecture as before) for the small-data run;
# this rebinds `model`, replacing the full-data baseline object.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
    freeze_embed=False,
)

In [35]:
# Training configuration for the small-data baseline run.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-small",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-small",
    lr=.005,
    epochs=5,
    batch_size=128,
)

In [36]:
# Reset the random seed(s) through the project helper before training.
# NOTE(review): the model was instantiated in an earlier cell, so its initial
# weights are not covered by this call unless base seeds earlier -- confirm.
base.reset_seed()

In [37]:
# Hugging Face Trainer for the small-data baseline (train_data is now the 10%
# subset); early stopping uses a patience of 2.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=base.compute_metrics,
    train_dataset=train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [38]:
# Train the small-data baseline; the returned TrainOutput is displayed as the
# cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2763,0.071437,0.981964,0.981997,0.981964,0.981942
2,0.0288,0.076152,0.981527,0.981617,0.981527,0.981526
3,0.0066,0.080642,0.982536,0.982497,0.982536,0.982496
4,0.0014,0.086385,0.983116,0.983083,0.983116,0.983091
5,0.0005,0.089168,0.983116,0.983086,0.983116,0.98309


TrainOutput(global_step=1750, training_loss=0.06271203763144356, metrics={'train_runtime': 292.6884, 'train_samples_per_second': 765.319, 'train_steps_per_second': 5.979, 'total_flos': 0.0, 'train_loss': 0.06271203763144356, 'epoch': 5.0})

In [39]:
# Switch the small-data baseline to evaluation mode; as the last expression of
# the cell the module repr is displayed.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [40]:
# Score the small-data baseline on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 0.08340625464916229,
 'eval_accuracy': 0.9837285714285714,
 'eval_precision': 0.9837043219206861,
 'eval_recall': 0.9837285714285714,
 'eval_f1': 0.9837015515030005,
 'eval_runtime': 13.0164,
 'eval_samples_per_second': 5377.844,
 'eval_steps_per_second': 42.024,
 'epoch': 5.0}

In [41]:
# Persist the small-data baseline weights under ~/models/<DATASET>/.
torch.save(
    model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-small.pth",
)

In [42]:
# Fresh student BiLSTM (same architecture) for small-data distillation; this
# rebinds `student_model`, replacing the full-data student object.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
    freeze_embed=False,
)

In [43]:
# Training configuration for the small-data distillation run. Compared with
# the full-data distillation run this uses lr=.004 (not .005) and
# lambda_param=.8 (not .6); temp stays at 2.5.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-small",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-small",
    remove_unused_columns=False,
    lr=.004,
    epochs=5,
    batch_size=128,
    lambda_param=.8,
    temp=2.5,
)

In [44]:
# Reset the random seed(s) through the project helper before training.
# NOTE(review): the student was instantiated in an earlier cell, so its initial
# weights are not covered by this call unless base seeds earlier -- confirm.
base.reset_seed()

In [45]:
# Project distillation trainer for the small-data student (train_data is the
# 10% subset); early stopping uses a patience of 2.
trainer = base.DistilTrainer(
    args=training_args,
    student_model=student_model,
    compute_metrics=base.compute_metrics,
    train_dataset=train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [46]:
# Train the small-data distilled student; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9897,0.193692,0.981795,0.981813,0.981795,0.981781
2,0.1247,0.141526,0.984652,0.984712,0.984652,0.984659
3,0.0741,0.122206,0.985866,0.985858,0.985866,0.985855
4,0.0524,0.11687,0.985732,0.985725,0.985732,0.985716
5,0.0434,0.11161,0.986286,0.986278,0.986286,0.986276


TrainOutput(global_step=1750, training_loss=0.2568680147443499, metrics={'train_runtime': 285.1512, 'train_samples_per_second': 785.548, 'train_steps_per_second': 6.137, 'total_flos': 0.0, 'train_loss': 0.2568680147443499, 'epoch': 5.0})

In [47]:
# Switch the small-data student to evaluation mode; as the last expression of
# the cell the module repr is displayed.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [48]:
# Score the small-data distilled student on the held-out test split.
trainer.evaluate(test_data)

{'eval_loss': 0.10849296301603317,
 'eval_accuracy': 0.9863571428571428,
 'eval_precision': 0.9863476188956245,
 'eval_recall': 0.9863571428571428,
 'eval_f1': 0.9863481664701558,
 'eval_runtime': 12.31,
 'eval_samples_per_second': 5686.414,
 'eval_steps_per_second': 44.435,
 'epoch': 5.0}

In [49]:
# Persist the small-data distilled student. The weights must come from
# `student_model` (the network DistilTrainer just trained and evaluated), not
# from `model`, which still holds the non-distilled small-data baseline and
# would otherwise be written under the distilled checkpoint's name.
torch.save(
    student_model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-small.pth",
)

In [50]:
# Report the student's size: the helper prints the model size and trainable
# parameter total and returns a per-module parameter table.
base.count_parameters(student_model)

model size: 797.420MB.
Total Trainable Params: 209038814.


Unnamed: 0,Modules,Parameters
0,embedding.weight,207348000
1,lstm.weight_ih_l0,360000
2,lstm.weight_hh_l0,360000
3,lstm.bias_ih_l0,1200
4,lstm.bias_hh_l0,1200
5,lstm.weight_ih_l0_reverse,360000
6,lstm.weight_hh_l0_reverse,360000
7,lstm.bias_ih_l0_reverse,1200
8,lstm.bias_hh_l0_reverse,1200
9,fc1.weight,240000


In [53]:
# Time single-example inference of the student on the CPU, using the loader
# built from the full training split before it was subsampled. The trailing
# 1000 is presumably the number of timed runs -- confirm in base.
cpu_benchmark = base.BenchMarkRunner(
    student_model,
    cpu_data_loader,
    "cpu",
    1000,
)
print(cpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x739f6bf23a00>
self.infer_speed_comp()
  19.75 ms
  1 measurement, 1000 runs , 4 threads


In [54]:
# Time single-example inference of the student on the GPU, using the loader
# built from the full training split before it was subsampled. The trailing
# 1000 is presumably the number of timed runs -- confirm in base.
gpu_benchmark = base.BenchMarkRunner(
    student_model,
    gpu_data_loader,
    "cuda",
    1000,
)
print(gpu_benchmark.run_benchmark())

<torch.utils.benchmark.utils.common.Measurement object at 0x73a0f6166080>
self.infer_speed_comp()
  6.04 ms
  1 measurement, 1000 runs , 4 threads
