In [1]:
import sys
import os
from logging import getLogger, ERROR

notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
# Fix module imports
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Disable Hugging Face warnings
getLogger("transformers.modeling_utils").setLevel(ERROR)

In [2]:
from model.qgpt2_models import QGPT2ClassificationModel

from pandas import read_csv, DataFrame
from datasets import Dataset, load_metric
from sklearn.metrics import f1_score

from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, Trainer, TrainingArguments

gpt2_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def tokenize_function(examples, padding="max_length"):
    return tokenizer(examples["text"], padding=padding, truncation=True, max_length=128)

tokenizer.pad_token = tokenizer.eos_token
gpt2_model.config.pad_token_id = gpt2_model.config.eos_token_id

In [4]:
df = read_csv("../data/Tweets.csv")
df['airline_sentiment'] = df['airline_sentiment'].replace(["negative", "neutral", "positive"], [0, 1, 2])

dataset = Dataset.from_pandas(df)
dataset = dataset.select_columns(["text", "airline_sentiment"])
dataset = dataset.rename_column("airline_sentiment", "label")

  df['airline_sentiment'] = df['airline_sentiment'].replace(["negative", "neutral", "positive"], [0, 1, 2])


In [5]:
train_and_eval = dataset.train_test_split(test_size=0.1, seed = 42)

train_ds = train_and_eval["train"]
eval_ds = train_and_eval["test"]

gpt2_train_ds = train_ds.map(tokenize_function, batched=True)
gpt2_eval_ds = eval_ds.map(tokenize_function, batched=True)

gpt2_train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
gpt2_eval_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map: 100%|██████████| 13176/13176 [00:02<00:00, 6293.22 examples/s]
Map: 100%|██████████| 1464/1464 [00:00<00:00, 5287.92 examples/s]


In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = load_metric('f1').compute(predictions=preds, references=labels, average='weighted')
    return {
        'f1': f1,
    }

In [6]:
training_args = TrainingArguments(
    report_to="none",
    output_dir="/data/bz620/model_outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=gpt2_train_ds,
    eval_dataset=gpt2_eval_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

In [21]:
# Generate labels on an unseen test dataset

predictions = trainer.predict(eval_ds)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# 4. Create a result dataset
teacher_results = DataFrame({
    'text': eval_ds['text'],
    'true_label': labels,
    'predicted_label': preds
})

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [22]:
f1_score(teacher_results["true_label"], teacher_results["predicted_label"], average="macro")

0.7871517902680556

In [11]:
save_directory = './saved_model'
gpt2_model.save_pretrained(save_directory)

In [12]:
# Load pre-trained weights on our custom GPT-2 
fhe_model = QGPT2ClassificationModel.from_pretrained("./saved_model", n_bits=8,use_cache=False, num_labels=3)
fhe_model.config.pad_token_id = fhe_model.config.eos_token_id


In [20]:
training_args.use_cpu=True

trainer = Trainer(
    model=fhe_model,
    args=training_args,
    train_dataset=gpt2_train_ds,
    eval_dataset=gpt2_eval_ds,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

  f1 = load_metric('f1').compute(predictions=preds, references=labels, average='weighted')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.45116475224494934,
 'eval_f1': {'f1': 0.836558976317662},
 'eval_runtime': 111.3141,
 'eval_samples_per_second': 13.152,
 'eval_steps_per_second': 1.644}

### Training the student model with teacher labels

In [39]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the student model and tokenizer
student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# tokenizer.pad_token = tokenizer.eos_token
student_model.config.pad_token_id = student_model.config.eos_token_id

# REMOVE THIS!
# teacher_results["label"] = teacher_results["predicted_label"].astype("int")

# Tokenize the inputs
student_train = Dataset.from_pandas(teacher_results)
student_train = student_train.map(tokenize_function, batched=True)
student_train.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])


eval_ds = train_and_eval["test"]
eval_ds = eval_ds.map(tokenize_function, batched=True)
eval_ds.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])



training_args = TrainingArguments(
   report_to="none",
   output_dir="/data/bz620/model_outputs",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch"
)
 
trainer = Trainer(
   model=student_model,
   args=training_args,
   train_dataset=student_train,
   eval_dataset=eval_ds,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics,
)


trainer.train()


Map: 100%|██████████| 1464/1464 [00:00<00:00, 3018.98 examples/s]
Map: 100%|██████████| 1464/1464 [00:00<00:00, 3030.13 examples/s]


Step,Training Loss


TrainOutput(global_step=184, training_loss=0.5968053237251614, metrics={'train_runtime': 14.5742, 'train_samples_per_second': 200.903, 'train_steps_per_second': 12.625, 'total_flos': 96967865069568.0, 'train_loss': 0.5968053237251614, 'epoch': 2.0})

In [40]:
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.4768294095993042,
 'eval_f1': {'f1': 0.8169720267083462},
 'eval_runtime': 2.6299,
 'eval_samples_per_second': 556.668,
 'eval_steps_per_second': 34.982,
 'epoch': 2.0}

In [None]:
student_test_ds = test_ds.map(tokenize_function, batched=True)
student_test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

predictions = trainer.predict(student_test_ds)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# 4. Create a result dataset
student_results = DataFrame({
    'text': test_ds['text'],
    'true_label': labels,
    'predicted_label': preds
})

f1_score(student_results["true_label"], teacher_results["predicted_label"], average="macro")

### Training the student model with true labels

In [36]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the student model and tokenizer
student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# tokenizer.pad_token = tokenizer.eos_token
student_model.config.pad_token_id = student_model.config.eos_token_id

student_train = train_ds
student_train = student_train.map(tokenize_function, batched=True)
student_train.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])


eval_ds = train_and_eval["test"]
eval_ds = eval_ds.map(tokenize_function, batched=True)
eval_ds.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])


training_args = TrainingArguments(
   report_to="none",
   output_dir="/data/bz620/model_outputs",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch"
)

trainer = Trainer(
   model=student_model,
   args=training_args,
   train_dataset=student_train,
   eval_dataset=eval_ds,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics,
)

trainer.train()

Map: 100%|██████████| 13176/13176 [00:04<00:00, 2997.81 examples/s]
Map: 100%|██████████| 1464/1464 [00:00<00:00, 2952.34 examples/s]


Step,Training Loss
500,0.5556
1000,0.4135
1500,0.3271


TrainOutput(global_step=1648, training_loss=0.42354471243700936, metrics={'train_runtime': 119.6314, 'train_samples_per_second': 220.277, 'train_steps_per_second': 13.776, 'total_flos': 872710785626112.0, 'train_loss': 0.42354471243700936, 'epoch': 2.0})

In [38]:
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.39487704634666443,
 'eval_f1': {'f1': 0.8457976964809895},
 'eval_runtime': 2.6028,
 'eval_samples_per_second': 562.479,
 'eval_steps_per_second': 35.347,
 'epoch': 2.0}