## Installation

In [1]:
#%pip install "pytorch==1.10.1"
%pip install transformers datasets tensorboard --upgrade

!sudo apt-get install git-lfs

[0mNote: you may need to restart the kernel to use updated packages.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 93 not upgraded.


In [2]:
teacher_ft_debert_emotion = "ArafatBHossain/debert_base_fine_tuned_sent140"
teacher_ft_roberta_emotion = "ArafatBHossain/robbert_base_fine_tuned_sent140"
teacher_ft_bert_base_emotion = "ArafatBHossain/bert_base_uncased_fine_tuned_sent140"

# name for our repository on the hub
repo_name = "bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined"


In [3]:
from transformers import AutoModel, AutoModelForSequenceClassification
model_bert_base_uncased = AutoModelForSequenceClassification.from_pretrained(teacher_ft_bert_base_emotion)
model_debert = AutoModelForSequenceClassification.from_pretrained(teacher_ft_debert_emotion)
model_robert = AutoModelForSequenceClassification.from_pretrained(teacher_ft_roberta_emotion)



In [4]:
student_id = "ArafatBHossain/distilbert-base-uncased_fine_tuned_sent140"

In [5]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_bert_base_uncased.to(device)
model_debert.to(device)
model_robert.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

Below are some checks to make sure the `Teacher` & `Student` are creating the same output.

In [6]:
from transformers import AutoTokenizer

# init tokenizer
teacher_tokenizer_debert = AutoTokenizer.from_pretrained("microsoft/deberta-base")
teacher_tokenizer_bert_base = AutoTokenizer.from_pretrained("bert-base-uncased")
teacher_tokenizer_robert = AutoTokenizer.from_pretrained("roberta-base")
student_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [7]:
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("tweet_eval","emotion")

  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})

In [9]:
####MIND####
# temp_train = pd.DataFrame(dataset['train'])
# temp_test = pd.DataFrame(dataset['test'])

# t_train = temp_train[temp_train.label.isin([0,2,9,10,14])]
# t_test = temp_test[temp_test.label.isin([0,2,9,10,14])]

# t_train['label'] = t_train['label'].replace(2,1)
# t_train['label'] = t_train['label'].replace(9,2)
# t_train['label'] = t_train['label'].replace(10,3)
# t_train['label'] = t_train['label'].replace(14,4)

# t_test['label'] = t_test['label'].replace(2,1)
# t_test['label'] = t_test['label'].replace(9,2)
# t_test['label'] = t_test['label'].replace(10,3)
# t_test['label'] = t_test['label'].replace(14,4)

# ###EMOTION###
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])
df_valid = pd.DataFrame(dataset['validation'])

In [10]:
df_train.head()

Unnamed: 0,text,label
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3


In [11]:
import nltk
# Uncomment to download "stopwords"
nltk.download("stopwords")
from nltk.corpus import stopwords
import re

def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "'t" to "not"
    - Remove "@name"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

df_train['text'] = df_train['text'].apply(lambda x: text_preprocessing(x))
df_test['text'] = df_test['text'].apply(lambda x: text_preprocessing(x))
df_valid['text'] = df_valid['text'].apply(lambda x: text_preprocessing(x))


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
df_train.head()

Unnamed: 0,text,label
0,worry payment problem may never joyce meyer mo...,2
1,roommate okay can not spell autocorrect terrib...,0
2,cute atsu probably shy photos cherry helped uwu,1
3,rooneys fucking untouchable not ? fucking drea...,0
4,pretty depressing u hit pan ur favourite highl...,3


In [13]:
from datasets import Dataset
data_train = Dataset.from_pandas(df_train)
data_test = Dataset.from_pandas(df_test)
data_valid = Dataset.from_pandas(df_valid)

In [14]:
data_test

Dataset({
    features: ['text', 'label'],
    num_rows: 1421
})

Additionally we add the `truncation=True` and `max_length=512` to align the length and truncate texts that are bigger than the maximum size allowed by the model. 

In [15]:
# def process(examples):
#     tokenized_inputs = student_tokenizer(
#         examples["sentence"], truncation=True, max_length=512
#     )
#     return tokenized_inputs

# tokenized_datasets = dataset.map(process, batched=True)
# tokenized_datasets = tokenized_datasets.rename_column("label","labels")

# tokenized_datasets["test"].features




def process(examples):
    tokenized_inputs = student_tokenizer(
        examples["text"], truncation=True, max_length=512
    )
    return tokenized_inputs

tokenized_datasets_train = data_train.map(process, batched=True).shuffle(seed = 42)
tokenized_datasets_train = tokenized_datasets_train.rename_column("label","labels")

tokenized_datasets_test = data_valid.map(process, batched=True).shuffle(seed = 42)
tokenized_datasets_test = tokenized_datasets_test.rename_column("label","labels")



# tokenized_datasets_valid = data_valid.map(process, batched=True)
# tokenized_datasets_valid = tokenized_datasets_valid.rename_column("label","labels")




  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
data_test

Dataset({
    features: ['text', 'label'],
    num_rows: 1421
})

## Distilling the model using `PyTorch` and `DistillationTrainer`


Now that our `dataset` is processed, we can distill it. Normally, when fine-tuning a transformer model using PyTorch you should go with the `Trainer-API`. The [Trainer](https://huggingface.co/docs/transformers/v4.16.1/en/main_classes/trainer#transformers.Trainer) class provides an API for feature-complete training in PyTorch for most standard use cases. 

In our example we cannot use the `Trainer` out-of-the-box, since we need to pass in two models, the `Teacher` and the `Student` and compute the loss for both. But we can subclass the `Trainer` to create a `DistillationTrainer` which will take care of it and only overwrite the [compute_loss](https://github.com/huggingface/transformers/blob/c4ad38e5ac69e6d96116f39df789a2369dd33c21/src/transformers/trainer.py#L1962) method as well as the `init` method. In addition to this we also need to subclass the `TrainingArguments` to include the our distillation hyperparameters. 


In [17]:
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F

def get_prediction_from_logits(logits):
    list_logit = logits.tolist()[0]
    
    return list_logit.index(max(list_logit))

class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.2, temperature=1.0, **kwargs):
        super().__init__(*args, **kwargs)
        
        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
#         self.teacher = teacher_model
        # place teacher on same device as student
#         self._move_model_to_device(self.teacher,self.model.device)
#         self.teacher.eval()
        
# teacher_ft_debert_emotion = "ArafatBHossain/deberta_base_fine_tuned_emotion_dataset"
# teacher_ft_roberta_emotion = "ArafatBHossain/robert_base_fine_tuned_emotion_dataset"
# teacher_ft_bert_base_emotion = "ArafatBHossain/bert_uncased_fine_tuned_emotion_dataset"

    def compute_loss(self, model, inputs, return_outputs=False):
        
        model_debert.eval()
        model_robert.eval()
        model_bert_base_uncased.eval()
        # compute student output
        outputs_student = model(**inputs)
        student_loss=outputs_student.loss
        # compute teacher output
        with torch.no_grad():
          debert_logits = model_debert(**inputs).logits 
          teacher_out_debert = get_prediction_from_logits(debert_logits)
          
          robert_logits = model_robert(**inputs).logits
          teacher_out_robert = get_prediction_from_logits(robert_logits)
          
          bert_base_logits = model_bert_base_uncased(**inputs).logits
          teacher_out_bert_base = get_prediction_from_logits(bert_base_logits)
          
          #0,2bb, 1-d , 3-robert
          if ((teacher_out_bert_base == 0 or teacher_out_bert_base == 2) and (teacher_out_debert == 1) and (teacher_out_robert == 3)) or \
              ((teacher_out_bert_base == 0 or teacher_out_bert_base == 2) and (teacher_out_debert == 1)) or \
              ((teacher_out_debert == 1) and (teacher_out_robert == 3)) or \
              ((teacher_out_bert_base == 0 or teacher_out_bert_base == 2) and (teacher_out_robert == 3)):
            outputs_teacher = bert_base_logits
          else:
            if (teacher_out_bert_base == 0 or teacher_out_bert_base == 2):
                outputs_teacher = bert_base_logits
            elif(teacher_out_debert == 1):
                outputs_teacher = debert_logits
            elif(teacher_out_robert == 3):
                outputs_teacher = robert_logits
            else:
                outputs_teacher = bert_base_logits
            
          
#           outputs_teacher = (debert_logits + robert_logits + bert_base_logits) / 3.0
            
          #pick teacher 0,2bb  1,4d 3,5r for EMOTION
#           if ((teacher_out_bert_base == 0 or teacher_out_bert_base == 2) and (teacher_out_debert == 1 or teacher_out_debert == 4) and (teacher_out_robert == 3 or teacher_out_robert == 5))or \
#             ((teacher_out_bert_base == 0 or teacher_out_bert_base == 2) and (teacher_out_debert == 1 or teacher_out_debert == 4))or \
#             ((teacher_out_debert == 1 or teacher_out_debert == 4) and (teacher_out_robert == 3 or teacher_out_robert == 5)) or \
#             ((teacher_out_bert_base == 0 or teacher_out_bert_base == 2) and (teacher_out_robert == 3 or teacher_out_robert == 5)): 
#             outputs_teacher = bert_base_logits  # chosing general best model
#           else:
#             if (teacher_out_bert_base == 0 or teacher_out_bert_base == 2):
#                 outputs_teacher = bert_base_logits
#             elif (teacher_out_debert == 1 or teacher_out_debert == 4):
#                 outputs_teacher = debert_logits
#             elif (teacher_out_robert == 3 or teacher_out_robert == 5):
#                 outputs_teacher = robert_logits
#             else:
#                 outputs_teacher = bert_base_logits
            
          
       
#         outputs_teacher = bert_base_logits
        # Soften probabilities and compute distillation loss
        loss_function = nn.KLDivLoss(reduction="batchmean")
        loss_logits = (loss_function(
            F.log_softmax(outputs_student.logits / self.args.temperature, dim=-1),
            F.softmax(outputs_teacher / self.args.temperature, dim=-1)) * (self.args.temperature ** 2))
        # Return weighted student loss
#         loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        loss = 0.2 * student_loss + (0.8) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

### Hyperparameter Definition, Model Loading

In [18]:
a = 1
b = 2
c = 6

# if (a == 1 or a == 2) & (b == 3 or b == 4) & (c == 5 or c == 6):
#     print("All teacher predicted their best prediction, so we take general best model")
# elif (b == 3 or b == 4) & (c == 5 or c == 6):
#     print("B and C predicted their best")
# elif (a == 1 or a == 2) & (b == 3 or b == 4):
#     print("A and B predicted their best")
# elif (a == 1 or a == 2) & (c == 5 or c == 6):
#     print("A and C predicted their best")
# else:
#     if (a == 1 or a == 2):
#         print("Only A predicted the best")
#     elif (b == 3 or b == 4):
#         print("Only B predicted the best")
#     elif (c == 5 or c == 6):
#         print("Only C predicted the best")

if ((a == 1 or a == 2) & (b == 3 or b == 4) & (c == 5 or c == 6)):
    print("Chosing the general best model")
elif (a == 1 or a == 2):
    print("Only A predicted the best")
elif (b == 3 or b == 4):
    print("Only B predicted the best")
elif (c == 5 or c == 6):
    print("Only C predicted the best")
else:
    print("Chosing general best model")

Only A predicted the best


In [20]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from huggingface_hub import HfFolder

# create label2id, id2label dicts for nice outputs for the model
# labels = tokenized_datasets_train.features["labels"].names
labels = [0,1,2,3]
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# define training args
training_args = DistillationTrainingArguments(
    output_dir=repo_name,
    fp16=True,
    num_train_epochs=7,
    # logging & evaluation strategies
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch", # to get more information to TB
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub = "True",
    alpha=0.5,
    temperature=4.0
    )

# define data_collator
data_collator = DataCollatorWithPadding(tokenizer=student_tokenizer)

# # define model
# teacher_model = AutoModelForSequenceClassification.from_pretrained(
#     teacher_id,
#     num_labels=num_labels, 
#     id2label=id2label,
#     label2id=label2id,
# )

# define student model
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_id,
    num_labels=num_labels, 
    id2label=id2label,
    label2id=label2id,
)

student_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

### Evaluation metric

we can create a `compute_metrics` function to evaluate our model on the test set. This function will be used during the training process to compute the `accuracy` & `f1` of our model.

In [22]:
from datasets import load_metric
import numpy as np

# define metrics and metrics function
accuracy_metric = load_metric( "accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    return {
        "accuracy": acc["accuracy"],
    }

  """


## Training

Start training with calling `trainer.train`

In [23]:
from huggingface_hub import notebook_login

notebook_login()
#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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
trainer = DistillationTrainer(
    student_model,
    training_args,
    teacher_model=None,
    train_dataset=tokenized_datasets_train, 
    eval_dataset=tokenized_datasets_test,
    data_collator=data_collator,
    tokenizer=student_tokenizer,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/ArafatBHossain/bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Using cuda_amp half precision backend


In [25]:
torch.cuda.empty_cache()

start training using the `DistillationTrainer`.

In [26]:
#bb276722adc6ae292006e86940c94a80b80019a1
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3257
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2856
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6098,0.929101,0.724599
2,0.6541,0.849538,0.737968
3,0.598,0.762097,0.740642
4,0.5817,0.728244,0.754011
5,0.5494,0.810173,0.745989
6,0.5401,0.827702,0.745989
7,0.5652,0.809954,0.745989


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 8
Saving model checkpoint to bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-408
Configuration saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-408/config.json
Model weights saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-408/pytorch_model.bin
tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-408/tokenizer_config.json
Special tokens file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-408/s

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 8
Saving model checkpoint to bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-816
Configuration saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-816/config.json
Model weights saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-816/pytorch_model.bin
tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epo

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/tokenizer_config.json
Special tokens file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 8
Saving model checkpoint to bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1224
Configuration saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1224/config.json
Model weights saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1224/pytorch_model.bin
tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/tokenizer_config.json
Special tokens file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Deleting older checkpoint [bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-408] due to args.save_total_limit


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 8
Saving model checkpoint to bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1632
Configuration saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1632/config.json
Model weights saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1632/pytorch_model.bin
tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/tokenizer_config.json
Special tokens file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Deleting older checkpoint [bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-816] due to args.save_total_limit


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 8
Saving model checkpoint to bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2040
Configuration saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2040/config.json
Model weights saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2040/pytorch_model.bin
tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/tokenizer_config.json
Special tokens file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Deleting older checkpoint [bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1224] due to args.save_total_limit


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 8
Saving model checkpoint to bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2448
Configuration saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2448/config.json
Model weights saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2448/pytorch_model.bin
tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/tokenizer_config.json
Special tokens file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Deleting older checkpoint [bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2040] due to args.save_total_limit


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 8
Saving model checkpoint to bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2856
Configuration saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2856/config.json
Model weights saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2856/pytorch_model.bin
tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer config file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/tokenizer_config.json
Special tokens file saved in bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Deleting older checkpoint [bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-2448] due to args.save_total_limit


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1632 (score: 0.7540106951871658).


TrainOutput(global_step=2856, training_loss=0.5854709796210965, metrics={'train_runtime': 506.4014, 'train_samples_per_second': 45.022, 'train_steps_per_second': 5.64, 'total_flos': 128720988929232.0, 'train_loss': 0.5854709796210965, 'epoch': 7.0})

In [None]:
# trainer.push_to_hub()

In [None]:
# def process_label_for_gptmix_sst2(w):
#     w['label'] = w['label'].replace(0,3)
#     w['label'] = w['label'].replace(1,0)
#     w['label'] = w['label'].replace(3,1)
#     return w

# data = load_dataset("gpt3mix/sst2")
# test_data = process_label_for_gptmix_sst2(pd.DataFrame(data['test']))

In [None]:
# clf = pipeline("text-classification", model = m, tokenizer = student_tokenizer)
# clf("I am very sad")

In [28]:
###for MIND###
test_data = df_test

In [29]:
len(test_data)

1421

In [None]:
!pip install pytorch-transformers

In [30]:
from transformers import AutoModel
m = AutoModelForSequenceClassification.from_pretrained("./bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1632")
from transformers import pipeline
clf = pipeline("text-classification", model = m, tokenizer = student_tokenizer)
import pandas as pd
from collections import defaultdict
for label_match in [0, 1, 2, 3]:
    count = 0
    shrinked_t_test = test_data
    data_test = shrinked_t_test[shrinked_t_test['label']==label_match]
    for i, r in data_test.iterrows():
        text = r['text']
        label = "LABEL_"+ str(r['label'])
        pred = clf(text)[0].get('label')
#         if pred == 'negative':
#             pred = "LABEL_"+ str(0)
# #         elif pred == 'neutral':
# #             pred = "LABEL_"+ str(1)
#         elif pred == 'positive':
#             pred = "LABEL_"+ str(1)
        pred = "LABEL_" + str(pred)
        if pred == label:
            count += 1

    accuracy = count/len(data_test)
    print("Accuracy for Label "+ str(label_match)+" : "+ str(accuracy))
    

loading configuration file ./bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1632/config.json
Model config DistilBertConfig {
  "_name_or_path": "./bert-distilled-multi_teacher_model_flip_twitter_emotion_epoch7_alpha0.8_refined/checkpoint-1632",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": 0,
    "1": 1,
    "2": 2,
    "3": 3
  },
  "initializer_range": 0.02,
  "label2id": {
    "0": "0",
    "1": "1",
    "2": "2",
    "3": "3"
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}


Accuracy for Label 0 : 0.9086021505376344
Accuracy for Label 1 : 0.7793296089385475
Accuracy for Label 2 : 0.4715447154471545
Accuracy for Label 3 : 0.6335078534031413


In [None]:
from transformers import pipeline
# clf = pipeline("text-classification", model = m, tokenizer = student_tokenizer)
