In [None]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoTokenizer,AutoModelForSequenceClassification, AutoConfig
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os 
import time
import re
import seaborn as sbn
from string import ascii_lowercase
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import re

# Compiled teencode rules, keyed by dictionary path. Parsing the dictionary and
# compiling its regexes once (instead of once per sentence) is what makes
# `Series.apply(clean)` cheap. Edits to the file are not picked up until the
# kernel is restarted or the cache entry is removed.
_TEENCODE_RULES = {}


def _load_teencode_rules(path):
    """Parse a tab-separated teencode file into ordered (pattern, replacement) pairs."""
    rules = _TEENCODE_RULES.get(path)
    if rules is None:
        rules = []
        with open(path, 'r', encoding='utf-8') as handle:
            for line in handle.read().split('\n'):
                elements = line.split('\t')
                # Skip blank/malformed lines (e.g. the empty string left by a
                # trailing newline) instead of failing on elements[1].
                if len(elements) < 2 or not elements[0]:
                    continue
                # The key is escaped so it is matched literally; the trailing
                # `+` lets the key's last character repeat ("ko" ~ "kooo").
                pattern = re.compile(r'\b{}+\b'.format(re.escape(elements[0])))
                rules.append((pattern, elements[1]))
        _TEENCODE_RULES[path] = rules
    return rules


def clean(data, teencode_path='../input/datacomments/teencode.txt'):
    """Normalise one raw comment before tokenisation.

    Steps, in order: lowercase the text, replace whole-word teencode entries
    with their expansion, collapse runs of a repeated Latin letter to a single
    letter, and collapse every whitespace run to one space.

    Args:
        data: the raw sentence (``str``).
        teencode_path: tab-separated file with one ``key<TAB>replacement``
            pair per line; rules are applied in file order.

    Returns:
        The cleaned sentence.
    """
    data = data.lower()
#     data = re.sub('j','i',data)
    for pattern, replacement in _load_teencode_rules(teencode_path):
        # A callable replacement inserts the text verbatim (no backslash or
        # group-reference processing of dictionary content).
        data = pattern.sub(lambda _match, _text=replacement: _text, data)

    # NOTE(review): 'k' is absent from this alphabet, so runs such as "kkk" are
    # left untouched. Kept exactly as in the original; confirm it is intended.
    alphabet = 'abcdefghijlmnopqrstuvwxyz'
    # One pass: any listed letter followed by repeats of itself -> one letter.
    data = re.sub(r'([{}])\1+'.format(alphabet), r'\1', data)

    data = re.sub(r'\s+', ' ', data)
    return data



In [None]:

# Load the three pre-split spreadsheets (train / test / validation).
df_train = pd.read_excel('../input/datacomments/train.xlsx')
df_test =  pd.read_excel('../input/datacomments/test.xlsx')
df_valid = pd.read_excel('../input/datacomments/valid.xlsx')

# Apply the same text normalisation to every split.
df_train['Sentence'] = df_train['Sentence'].apply(clean)
df_test['Sentence'] = df_test['Sentence'].apply(clean)
df_valid['Sentence'] = df_valid['Sentence'].apply(clean)

# Plain Python lists of sentences, as expected by the tokenizer call.
test_texts = list(df_test['Sentence'])
train_texts = list(df_train['Sentence'])
valid_texts = list(df_valid['Sentence'])

y = LabelEncoder()

# Fit the encoder on the training labels only, then reuse that exact mapping
# for the other splits. Re-fitting per split would silently assign different
# ids whenever a split is missing a class; `transform` instead raises on a
# label that was never seen in training.
train_labels = y.fit_transform(df_train['Emotion'])
valid_labels = y.transform(df_valid['Emotion'])
test_labels = y.transform(df_test['Emotion'])

# Class names indexed by encoded id (position i is the name of class i).
target_names = y.classes_.tolist()


In [None]:
# Checkpoint to fine-tune: PhoBERT base (loaded below as a RoBERTa-architecture model).
# Other text-classification checkpoints: https://huggingface.co/models?filter=text-classification

# model_name = "xlm-roberta-base" #66 65 66 256 66 65 66 512
#
model_name = "vinai/phobert-base" #62 59 52 256 62 59 62 512



# Maximum number of tokens kept per sample. PhoBERT accepts at most 256 tokens,
# so truncating at a larger value would not protect against over-long inputs.
# NOTE(review): raise this only when switching to a checkpoint with a larger
# limit (e.g. the commented-out xlm-roberta-base alternative).
max_length = 256
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)


Downloading (…)lve/main/config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Tokenize each split with identical settings: sequences longer than
# `max_length` tokens are truncated, and `padding=True` pads the shorter ones
# up to the longest sequence of the list being tokenized.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=max_length)
    for split_texts in (train_texts, valid_texts, test_texts)
)


# See https://huggingface.co/transformers/v3.4.0/custom_datasets.html
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: object exposing ``.items()`` that yields
            ``(feature_name, per_sample_values)`` pairs, where
            ``per_sample_values[idx]`` is the feature of sample ``idx``.
        labels: indexable collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``"labels"``."""
        sample = {}
        for feature_name, values in self.encodings.items():
            sample[feature_name] = torch.tensor(values[idx])
        # The label is wrapped in a one-element list, so the tensor has shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap each split's encodings and encoded labels in a torch Dataset.
train_dataset = NewsGroupsDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = NewsGroupsDataset(encodings=valid_encodings, labels=valid_labels)
test_dataset = NewsGroupsDataset(encodings=test_encodings, labels=test_labels)



In [None]:

# Emotion name -> class id, stored in the model config.
# NOTE(review): these ids must agree with the ids produced by the LabelEncoder
# used for the datasets — confirm the two mappings match.
label2id = {"Anger": 0, "Disgust": 1, "Enjoyment": 2, "Fear": 3, "Other": 4, "Sadness": 5, "Surprise": 6}
# Inverse mapping, derived so the two dictionaries cannot drift apart.
id2label = {class_id: emotion for emotion, class_id in label2id.items()}


# Load the pretrained encoder with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

In [None]:
def compute_metrics(pred):
    """Compute evaluation metrics for the Trainer.

    Args:
        pred: object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the class is chosen by argmax
            over the last axis).

    Returns:
        ``{'accuracy': <accuracy of the argmax predictions>}``.
    """
    reference_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    # Accuracy is delegated to sklearn's implementation.
    return {'accuracy': accuracy_score(reference_ids, predicted_ids)}


# Single source of truth for the training batch size (used for the argument
# itself and for deriving the logging interval).
TRAIN_BATCH_SIZE = 16
# Optimizer steps in one epoch. Ceiling division, because the final partial
# batch is still a step; `round()` could undercount by one and make the
# per-epoch training-loss log drift away from the epoch boundary.
# Assumes a single device and gradient_accumulation_steps=1 (as set below).
STEPS_PER_EPOCH = -(-len(train_dataset) // TRAIN_BATCH_SIZE)

# NOTE(review): the training log reports that the WANDB_DISABLED environment
# variable is deprecated in favour of `report_to`; left unchanged here because
# `report_to="none"` would also turn off every other logging integration.
training_args = TrainingArguments(
    output_dir='my_model',           # checkpoints are written here
    evaluation_strategy="epoch",     # evaluate on the validation set every epoch
    save_strategy='epoch',           # checkpoint every epoch (must match the evaluation strategy)
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,     # restore the best-accuracy checkpoint when training ends
    greater_is_better=True,
    optim="adamw_hf",
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=TRAIN_BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,                # strength of weight decay
    learning_rate=5e-5,
    gradient_accumulation_steps=1,
    lr_scheduler_type="linear",
    logging_steps=STEPS_PER_EPOCH,   # log the training loss once per epoch
    save_total_limit=2,              # cap on the number of checkpoints kept on disk
)
# Collator that assembles samples into batches, padding them with the tokenizer.
data_collator_ = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire model, hyper-parameters, data and metric callback into the Trainer.
trainer = Trainer(
    model=model,                      # model to fine-tune
    args=training_args,               # hyper-parameters defined above
    data_collator=data_collator_,     # batch assembly / padding
    train_dataset=train_dataset,      # data used for optimisation
    eval_dataset=valid_dataset,       # data used for the per-epoch evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)
# Fine-tune the model; only the training call is timed (wall-clock seconds).
time_start = time.time()
trainer.train()
time_end = time.time()
total_time = time_end - time_start

# Evaluate the model now held by the trainer on the validation set. The
# returned metrics dict is the last expression, so it is shown as cell output.
trainer.evaluate()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 5548
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1735
  Number of trainable parameters = 135003655


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6341,1.232731,0.552478
2,1.0988,1.063544,0.597668
3,0.7291,1.061392,0.638484
4,0.4357,1.172383,0.641399
5,0.2354,1.348892,0.625364


***** Running Evaluation *****
  Num examples = 686
  Batch size = 16
Saving model checkpoint to my_model/checkpoint-347
Configuration saved in my_model/checkpoint-347/config.json
Model weights saved in my_model/checkpoint-347/pytorch_model.bin
tokenizer config file saved in my_model/checkpoint-347/tokenizer_config.json
Special tokens file saved in my_model/checkpoint-347/special_tokens_map.json
added tokens file saved in my_model/checkpoint-347/added_tokens.json
***** Running Evaluation *****
  Num examples = 686
  Batch size = 16
Saving model checkpoint to my_model/checkpoint-694
Configuration saved in my_model/checkpoint-694/config.json
Model weights saved in my_model/checkpoint-694/pytorch_model.bin
tokenizer config file saved in my_model/checkpoint-694/tokenizer_config.json
Special tokens file saved in my_model/checkpoint-694/special_tokens_map.json
added tokens file saved in my_model/checkpoint-694/added_tokens.json
***** Running Evaluation *****
  Num examples = 686
  Batch size

{'eval_loss': 1.172383189201355,
 'eval_accuracy': 0.641399416909621,
 'eval_runtime': 1.9142,
 'eval_samples_per_second': 358.375,
 'eval_steps_per_second': 22.464,
 'epoch': 5.0}

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Run the fine-tuned model on the held-out test split.
pred = trainer.predict(test_dataset)

# Predicted class id = index of the highest score for each sample.
y_pred = np.argmax(pred.predictions, axis=1)

# Label the report rows with emotion names instead of bare ids. Passing
# `labels` pins the row order to the ids in `id2label`, so each name is
# guaranteed to sit on the row of its own class.
class_ids = sorted(id2label)
print(classification_report(
    test_labels,
    y_pred,
    labels=class_ids,
    target_names=[id2label[class_id] for class_id in class_ids],
    digits=3,
))

***** Running Prediction *****
  Num examples = 693
  Batch size = 16


              precision    recall  f1-score   support

           0      0.442     0.575     0.500        40
           1      0.612     0.538     0.573       132
           2      0.761     0.710     0.735       193
           3      0.717     0.826     0.768        46
           4      0.549     0.612     0.579       129
           5      0.706     0.724     0.715       116
           6      0.793     0.622     0.697        37

    accuracy                          0.657       693
   macro avg      0.654     0.658     0.652       693
weighted avg      0.664     0.657     0.658       693



In [None]:
# Persist the fine-tuned model to the 'transformers-phobert' directory.
# The Trainer was constructed with the tokenizer, and the save logs list
# tokenizer files written next to the weights, so no separate
# tokenizer.save_pretrained(...) call is made here.
trainer.save_model('transformers-phobert')

Saving model checkpoint to trainer-phobert
Configuration saved in trainer-phobert/config.json
Model weights saved in trainer-phobert/pytorch_model.bin
tokenizer config file saved in trainer-phobert/tokenizer_config.json
Special tokens file saved in trainer-phobert/special_tokens_map.json
added tokens file saved in trainer-phobert/added_tokens.json
