In [1]:
from datasets import concatenate_datasets, load_from_disk
from transformers import BasicTokenizer, EarlyStoppingCallback, Trainer
import kagglehub
import torch
import base
import os

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [2]:
# Download the GloVe vectors through kagglehub and show the local cache path.
# The commented-out line is the alternative 840B-token archive; the active one
# is the 6B-token, 300-dimensional archive.
#my_glove = kagglehub.dataset_download("takuok/glove840b300dtxt")
my_glove = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
print(my_glove)

/home/jovyan/.cache/kagglehub/datasets/thanakomsn/glove6b300dtxt/versions/1


In [3]:
# Notebook-wide configuration.
# GLOVE_FILE: embedding text file inside the downloaded archive (must match the
# archive chosen when my_glove was downloaded).
# DATASET: sub-directory name used for every data/, results/, logs/ and models/ path.
#GLOVE_FILE = f"{my_glove}/glove.840B.300d.txt"
GLOVE_FILE = f"{my_glove}/glove.6B.300d.txt"
DATASET = "dbpedia"

In [4]:
# --- "small" training subset ------------------------------------------------
# Only the 10% side ("test") of a stratified split is kept as training data,
# which is what makes these the "-small" experiments.
data = load_from_disk(f"~/data/{DATASET}/train-logits").train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column="labels",
)
train_data = data["test"]

# Evaluation and test splits are used in full.
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits")

# --- "small" augmented training subset ---------------------------------------
# Same recipe applied to the augmented training set.
data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented").train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column="labels",
)
all_train_data = data["test"]

# Everything the vocabulary is built from: eval + test + the full augmented
# training set.
# NOTE(review): the plain train-logits set is not included here; this presumably
# relies on the augmented set containing all of its sentences - confirm.
all_data = concatenate_datasets(
    [
        load_from_disk(file)
        for file in (
            f"~/data/{DATASET}/eval-logits",
            f"~/data/{DATASET}/test-logits",
            f"~/data/{DATASET}/train-logits-augmented",
        )
    ]
)

# Lower-casing word-level tokenizer used for every split.
tokenizer = BasicTokenizer(do_lower_case=True)

In [5]:
# Pick the compute device once, then report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 2g.20gb


In [6]:
# Tokenize the "sentence" field of every example, one token list per example.
train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in train_data]
eval_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in eval_data]
test_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in test_data]

# Augmented training subset, plus the full corpus used only to build the vocabulary.
all_train_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_train_data]
all_data_tokens = [tokenizer.tokenize(example["sentence"]) for example in all_data]

KeyboardInterrupt: 

In [None]:
# Build the vocabulary from the tokens of the combined corpus (all_data_tokens).
vocab = base.get_vocab(all_data_tokens)

In [None]:
# Map each vocabulary entry to its position in `vocab` (0-based).
word_index = {word: position for position, word in enumerate(vocab)}

In [None]:
# Load the pre-trained GloVe vectors from GLOVE_FILE; the helper reports how
# many word vectors it found.
embeddings_index = base.get_embeddings_indeces(GLOVE_FILE)

Found 400000 word vectors.


In [None]:
# Report the vocabulary size and fix the embedding table dimensions.
print(len(vocab))
# Two extra rows beyond the vocabulary - presumably reserved for padding and
# unknown tokens; confirm against base.get_embedding_matrix / base.padd.
num_tokens = len(vocab) + 2
# Matches the 300-dimensional GloVe file selected in GLOVE_FILE.
embedding_dim = 300

691158


In [None]:
# Assemble the (num_tokens x embedding_dim) embedding table from the GloVe
# vectors; the helper prints how many vocabulary words were found vs. missed.
embedding_matrix = base.get_embedding_matrix(num_tokens, embedding_dim, word_index, embeddings_index)

Converted 212978 words (478180) misses


In [None]:
# Replace every token with its vocabulary index. A token missing from
# word_index raises KeyError, so every split must be covered by the vocabulary.
train_data_index = [[word_index[token] for token in tokens] for tokens in train_data_tokens]
eval_data_index = [[word_index[token] for token in tokens] for tokens in eval_data_tokens]
test_data_index = [[word_index[token] for token in tokens] for tokens in test_data_tokens]

all_train_data_index = [[word_index[token] for token in tokens] for tokens in all_train_data_tokens]

In [None]:
# Run every index sequence through base.padd with a target length of 300
# (presumably pads/truncates to exactly 300 ids - confirm in base.padd).
train_padded_data = [base.padd(ids, 300) for ids in train_data_index]
eval_padded_data = [base.padd(ids, 300) for ids in eval_data_index]
test_padded_data = [base.padd(ids, 300) for ids in test_data_index]

all_train_padded_data = [base.padd(ids, 300) for ids in all_train_data_index]

In [None]:
# Attach the padded index sequences to each dataset as the "input_ids" column.
# Not re-runnable as-is: the datasets are rebound to versions that already
# carry the column, so rerun the loading cell first if this must be repeated.
train_data = train_data.add_column(name="input_ids", column=train_padded_data)
eval_data = eval_data.add_column(name="input_ids", column=eval_padded_data)
test_data = test_data.add_column(name="input_ids", column=test_padded_data)

all_train_data = all_train_data.add_column(name="input_ids", column=all_train_padded_data)

Flattening the indices:   0%|          | 0/44800 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/87929 [00:00<?, ? examples/s]

In [None]:
# Baseline BiLSTM classifier initialised from the GloVe embedding table.
# output_dim=14 is the number of target classes for this dataset.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
)

In [None]:
# Show the layer layout of the baseline model as a sanity check.
print(model)

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)


In [None]:
# Training configuration for the baseline run on the small subset.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-small",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-small",
    lr=0.0001,
    epochs=3,
    batch_size=128,
)

In [None]:
# Reset the random seed(s) via the project helper right before building the
# trainer, so the run starts from a known RNG state.
base.reset_seed()

In [None]:
# Standard Hugging Face Trainer for the baseline model on the small subset.
# NOTE(review): with epochs=3 and what looks like one evaluation per epoch, an
# early-stopping patience of 3 presumably never triggers - confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [None]:
# Train the baseline on the small subset; the returned TrainOutput is shown as
# the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7511,0.209696,0.939705,0.941527,0.939705,0.939795
2,0.1465,0.11208,0.968179,0.96828,0.968179,0.968173
3,0.1004,0.098086,0.972473,0.97245,0.972473,0.972445


TrainOutput(global_step=2100, training_loss=0.33268421718052454, metrics={'train_runtime': 133.3024, 'train_samples_per_second': 2016.468, 'train_steps_per_second': 15.754, 'total_flos': 0.0, 'train_loss': 0.33268421718052454, 'epoch': 3.0})

In [None]:
# Switch the baseline model to evaluation mode before scoring on the test set;
# the call returns the model, whose repr is echoed as the cell output.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [None]:
# Score the baseline (small subset) on the held-out test split; the metrics
# dict is shown as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.0990210622549057,
 'eval_accuracy': 0.9717142857142858,
 'eval_precision': 0.9716920080249002,
 'eval_recall': 0.9717142857142858,
 'eval_f1': 0.9716787672081507,
 'eval_runtime': 11.9081,
 'eval_samples_per_second': 5878.362,
 'eval_steps_per_second': 45.935,
 'epoch': 3.0}

In [None]:
# Persist the baseline (small subset) weights under ~/models/<DATASET>/.
# The target directory is expected to exist already.
torch.save(
    model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-small.pth",
)

In [None]:
# Fresh student network for distillation on the small subset; same
# architecture and hyper-parameters as the baseline classifier.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
)

In [None]:
# Training configuration for the distillation run on the small subset.
# remove_unused_columns=False presumably keeps the stored teacher logits in
# each batch for DistilTrainer; lambda_param / temp are presumably the
# distillation loss weight and softmax temperature - confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-small",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-small",
    remove_unused_columns=False,
    lr=0.0001,
    epochs=3,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [None]:
# Reset the random seed(s) so the distillation run starts from the same RNG
# state as the baseline run.
base.reset_seed()

In [None]:
# Project-specific distillation trainer for the student on the small subset.
# NOTE(review): with epochs=3 and what looks like one evaluation per epoch, an
# early-stopping patience of 3 presumably never triggers - confirm.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [None]:
# Train the student with distillation on the small subset; the returned
# TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,5.4154,3.341083,0.786312,0.802289,0.786313,0.776738
2,2.613,1.986502,0.877768,0.880041,0.877768,0.875281
3,1.765,1.592329,0.895339,0.898132,0.895339,0.893522


TrainOutput(global_step=1050, training_loss=3.2644549851190474, metrics={'train_runtime': 109.1124, 'train_samples_per_second': 1231.757, 'train_steps_per_second': 9.623, 'total_flos': 0.0, 'train_loss': 3.2644549851190474, 'epoch': 3.0})

In [None]:
# Switch the student to evaluation mode before scoring on the test set; the
# call returns the model, whose repr is echoed as the cell output.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [None]:
# Score the distilled student (small subset) on the held-out test split.
# NOTE(review): eval_loss here presumably is the distillation loss, so it is
# not directly comparable to the baseline's eval_loss - confirm in DistilTrainer.
trainer.evaluate(test_data)

{'eval_loss': 1.5967628955841064,
 'eval_accuracy': 0.8956428571428572,
 'eval_precision': 0.8982861081068234,
 'eval_recall': 0.8956428571428571,
 'eval_f1': 0.8939248319125565,
 'eval_runtime': 12.5712,
 'eval_samples_per_second': 5568.281,
 'eval_steps_per_second': 43.512,
 'epoch': 3.0}

In [None]:
# Persist the distilled student (small subset) weights under ~/models/<DATASET>/.
# The target directory is expected to exist already.
torch.save(
    student_model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-small.pth",
)

In [None]:
# Fresh baseline classifier for the augmented-data run. This rebinds `model`,
# so the previously trained baseline is only available from its saved checkpoint.
model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
)

In [None]:
# Training configuration for the baseline run on the augmented small subset.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-base-aug-small",
    logging_dir=f"~/logs/{DATASET}/bilstm-base-aug-small",
    lr=0.0001,
    epochs=3,
    batch_size=128,
)

In [None]:
# Reset the random seed(s) before the augmented baseline run.
base.reset_seed()

In [None]:
# Standard Hugging Face Trainer for the baseline on the augmented small subset.
# NOTE(review): with epochs=3 and what looks like one evaluation per epoch, an
# early-stopping patience of 3 presumably never triggers - confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [None]:
# Train the baseline on the augmented small subset; the returned TrainOutput
# is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.208,0.06012,0.983232,0.983234,0.983232,0.983224
2,0.0761,0.05049,0.985821,0.985832,0.985821,0.985816
3,0.0621,0.047384,0.986777,0.986779,0.986777,0.986777


TrainOutput(global_step=20610, training_loss=0.11540887454133308, metrics={'train_runtime': 1423.2056, 'train_samples_per_second': 1853.458, 'train_steps_per_second': 14.481, 'total_flos': 0.0, 'train_loss': 0.11540887454133308, 'epoch': 3.0})

In [None]:
# Switch the augmented baseline to evaluation mode before scoring on the test
# set; the call returns the model, whose repr is echoed as the cell output.
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [None]:
# Score the augmented baseline on the held-out test split; the metrics dict is
# shown as the cell result.
trainer.evaluate(test_data)

{'eval_loss': 0.046175818890333176,
 'eval_accuracy': 0.9867428571428571,
 'eval_precision': 0.9867385036592475,
 'eval_recall': 0.9867428571428573,
 'eval_f1': 0.9867395869986717,
 'eval_runtime': 21.7972,
 'eval_samples_per_second': 3211.424,
 'eval_steps_per_second': 25.095,
 'epoch': 3.0}

In [None]:
# Persist the augmented baseline weights under ~/models/<DATASET>/.
# The target directory is expected to exist already.
torch.save(
    model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-base-aug-small.pth",
)

In [None]:
# Fresh student network for distillation on the augmented small subset. This
# rebinds `student_model`; the earlier student survives only as its checkpoint.
student_model = base.BiLSTMClassifier(
    embedding_matrix=embedding_matrix,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    fc_dim=400,
    output_dim=14,
)

In [None]:
# Training configuration for the distillation run on the augmented small subset.
# remove_unused_columns=False presumably keeps the stored teacher logits in
# each batch for DistilTrainer; lambda_param / temp are presumably the
# distillation loss weight and softmax temperature - confirm in base.
training_args = base.get_training_args(
    output_dir=f"~/results/{DATASET}/bilstm-distill-aug-small",
    logging_dir=f"~/logs/{DATASET}/bilstm-distill-aug-small",
    remove_unused_columns=False,
    lr=0.0001,
    epochs=3,
    batch_size=128,
    lambda_param=0.75,
    temp=5,
)

In [None]:
# Reset the random seed(s) before the augmented distillation run.
base.reset_seed()

In [None]:
# Project-specific distillation trainer for the student on the augmented
# small subset.
# NOTE(review): with epochs=3 and what looks like one evaluation per epoch, an
# early-stopping patience of 3 presumably never triggers - confirm.
trainer = base.DistilTrainer(
    student_model=student_model,
    args=training_args,
    train_dataset=all_train_data,
    eval_dataset=eval_data,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=base.compute_metrics,
)

In [None]:
# Train the student with distillation on the augmented small subset; the
# returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8717,0.168142,0.980518,0.980552,0.980518,0.98048
2,0.2417,0.123618,0.983929,0.98395,0.983929,0.983908
3,0.1984,0.107652,0.985446,0.985431,0.985446,0.985435


TrainOutput(global_step=20610, training_loss=0.4372475081309134, metrics={'train_runtime': 1513.0336, 'train_samples_per_second': 1743.419, 'train_steps_per_second': 13.622, 'total_flos': 0.0, 'train_loss': 0.4372475081309134, 'epoch': 3.0})

In [None]:
# Switch the augmented student to evaluation mode before scoring on the test
# set; the call returns the model, whose repr is echoed as the cell output.
student_model.eval()

BiLSTMClassifier(
  (embedding): Embedding(691160, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=400, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=400, out_features=14, bias=True)
)

In [None]:
# Score the distilled student (augmented subset) on the held-out test split.
# NOTE(review): eval_loss here presumably is the distillation loss, so it is
# not directly comparable to the baseline's eval_loss - confirm in DistilTrainer.
trainer.evaluate(test_data)

{'eval_loss': 0.10681172460317612,
 'eval_accuracy': 0.9854285714285714,
 'eval_precision': 0.9854093760634609,
 'eval_recall': 0.9854285714285714,
 'eval_f1': 0.9854123976057806,
 'eval_runtime': 23.2612,
 'eval_samples_per_second': 3009.306,
 'eval_steps_per_second': 23.516,
 'epoch': 3.0}

In [None]:
# Persist the distilled student trained on the augmented subset.
# This must save `student_model`, not `model`: at this point `model` still
# holds the augmented *baseline*, so saving it here would store the wrong
# weights under the "distill" file name.
torch.save(
    student_model.state_dict(),
    f"{os.path.expanduser('~')}/models/{DATASET}/bilstm-distill-aug-small.pth",
)