In [1]:
import sys
import os
from logging import getLogger, ERROR

notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
# Fix module imports
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Disable Hugging Face warnings
getLogger("transformers.modeling_utils").setLevel(ERROR)

In [2]:
from model.qgpt2_models import MultiHeadsQGPT2Model

from pandas import read_csv, DataFrame
from datasets import Dataset, load_metric
from sklearn.metrics import f1_score

from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, Trainer, TrainingArguments

# model = MultiHeadsQGPT2Model.from_pretrained("gpt2", n_bits=8,use_cache=False).to("cuda")

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=128)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [20]:
df = read_csv("Tweets.csv")
df['airline_sentiment'] = df['airline_sentiment'].replace(["negative", "neutral", "positive"], [0, 1, 2])

dataset = Dataset.from_pandas(df)
dataset = dataset.select_columns(["text", "airline_sentiment"])
dataset = dataset.rename_column("airline_sentiment", "label")

  df['airline_sentiment'] = df['airline_sentiment'].replace(["negative", "neutral", "positive"], [0, 1, 2])


In [22]:
train_eval_test = dataset.train_test_split(test_size=0.1, seed = 42)
train_and_eval = train_eval_test["train"].train_test_split(test_size=0.1, seed = 42)

train_ds = train_and_eval["train"]
eval_ds = train_and_eval["test"]
test_ds = train_eval_test["test"]

train_ds = train_ds.map(tokenize_function, batched=True)
eval_ds = eval_ds.map(tokenize_function, batched=True)

train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map: 100%|██████████| 11858/11858 [00:02<00:00, 5300.65 examples/s]
Map: 100%|██████████| 1318/1318 [00:00<00:00, 4962.59 examples/s]


In [39]:
training_args = TrainingArguments(
    report_to="none",
    output_dir="/data/bz620/model_outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)

trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [25]:
# 1. Load the F1 metric
metric = load_metric('f1')

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = metric.compute(predictions=preds, references=labels, average='weighted')
    return {
        'f1': f1,
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 6.50kB [00:00, 8.98MB/s]                   


In [40]:
# Generate labels on an unseen test dataset

gpt2_test_ds = test_ds.map(tokenize_function, batched=True)
gpt2_test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

predictions = trainer.predict(gpt2_test_ds)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# 4. Create a result dataset
teacher_results = DataFrame({
    'text': test_ds['text'],
    'label': labels,
    'predicted_label': preds
})


Map:   0%|          | 0/1464 [00:00<?, ? examples/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [27]:
f1_score(teacher_results["true_label"], teacher_results["predicted_label"], average="macro")

0.7933606939846157

In [None]:
save_directory = './saved_model'
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json')

### Training the student model with teacher labels

In [36]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the student model and tokenizer
student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', problem_type="multi_label_classification", num_labels=3)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# tokenizer.pad_token = tokenizer.eos_token
student_model.config.pad_token_id = student_model.config.eos_token_id

# REMOVE THIS!
teacher_results["label"] = teacher_results["true_label"]

# Tokenize the inputs
teacher_results_ds = Dataset.from_pandas(teacher_results)
teacher_results_ds = teacher_results_ds.map(tokenize_function, batched=True)
student_ds = teacher_results_ds.train_test_split(test_size=0.1, seed = 42)
student_ds["train"].set_format("torch", columns=['input_ids', 'attention_mask', 'label'])
student_ds["test"].set_format("torch", columns=['input_ids', 'attention_mask', 'label'])


training_args = TrainingArguments(
   report_to="none",
    output_dir="/data/bz620/model_outputs",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch"
)
 
trainer = Trainer(
   model=student_model,
   args=training_args,
   train_dataset=student_ds["train"],
   eval_dataset=student_ds["test"],
   tokenizer=tokenizer,
   compute_metrics=compute_metrics,
)


trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1464/1464 [00:00<00:00, 2964.00 examples/s]


ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 3]))

In [None]:
student_test_ds = test_ds.map(tokenize_function, batched=True)
student_test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

predictions = trainer.predict(student_test_ds)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# 4. Create a result dataset
student_results = DataFrame({
    'text': test_ds['text'],
    'true_label': labels,
    'predicted_label': preds
})

f1_score(student_results["true_label"], teacher_results["predicted_label"], average="macro")

### Training the student model with true labels