In [1]:
import pandas as pd
import numpy as np
import torch
import demoji
import re
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report

from torch.utils.data import Dataset
import torch.nn.functional as F

from transformers import pipeline 
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import BertTokenizer,BertForSequenceClassification
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from transformers import BertweetTokenizer, AutoModel
from transformers import XLNetTokenizer, XLNetModel
from transformers import Trainer, TrainingArguments

#### 1. Prepare dataset

In [2]:
task='irony'
model_name = f"cardiffnlp/twitter-roberta-base-2021-124m-{task}"

# load data set
df = pd.read_csv('./train.En.csv').sample(frac = 1)
df_test = pd.read_csv('./test.En.csv').sample(frac = 1)
df = df.drop(df.columns[range(4,10)], axis=1)

In [3]:
import string

#preparing punctuations list
more_punct = '''¯ツ'''
english_punctuations = string.punctuation
punc_to_remove = ''.join(set(more_punct + english_punctuations))
punc_to_keep = '''@#!?+.()}{,:/$_-"'''
punc_to_escape = '''[]-^'''
for p in punc_to_keep: punc_to_remove = punc_to_remove.replace(p, '')
for p in punc_to_escape: punc_to_remove = punc_to_remove.replace(p, '\\{}'.format(p))
print(punc_to_remove)
def clean_tags_links(text):
    new_text = []
    for t in text.split(" "):
        new_text.append(t)
    return " ".join(new_text)
def pre_process(text):
    text = text.replace('\\n', ' ').replace('\n', ' ')
    text = demoji.replace_with_desc(text) 
    for p in punc_to_remove: text = text.replace(p, '')  # remove punctuations
    
    link_regex = re.compile(r'https?://[^\s]+')
    text = link_regex.sub(':http:', text)
    text = re.sub(r'\s+', r' ', text)
    return text
def clean_text_data(text_data):
    for i, text in enumerate(text_data):
        try:
            text_data[i] = pre_process(text)
        except:
            print(text_data[i])
    return text_data

\`¯*;=\^%'\[<>~&\]|ツ


In [5]:
text = df["tweet"].astype(str).values.tolist()
text = clean_text_data(text)
pd.DataFrame(text).to_csv('text_cleaned')
labels = df["sarcastic"].to_list()
train_text, val_text, train_labels, val_labels = train_test_split(text, labels, test_size=.2, random_state=32) 

test_text = df_test['text'].to_list()
test_text = clean_text_data(test_text)
test_labels = df_test['sarcastic'].to_list()

In [7]:
print(train_text[0:2])

['Im gonna aspire to be my mirroverse self :relieved face: If thats a good thing or not idk', 'everyone gets all hyped about ancient debris, but wheres my modern debris?']


#### 2. Load Pretrained Tokenizer and encode the dataset 

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
val_encodings = tokenizer(val_text, truncation=True, padding=True)
train_encodings = tokenizer(train_text, truncation=True, padding=True)
test_encodings = tokenizer(test_text, truncation=True, padding=True)

#### 3. Build Pytorch dataset 

In [23]:
class MYDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [24]:
train_dataset = MYDataset(train_encodings, train_labels)
val_dataset = MYDataset(val_encodings, val_labels)
test_dataset = MYDataset(test_encodings, test_labels)

#### 4. load pretrained model

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)

#### 5. Fine Tuning 

In [26]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [28]:
training_res = trainer.train()

***** Running training *****
  Num examples = 2774
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 348
  Number of trainable parameters = 124647170
  3%|▎         | 10/348 [00:04<01:31,  3.68it/s]

{'loss': 2.1757, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.06}


  6%|▌         | 20/348 [00:06<01:22,  3.99it/s]

{'loss': 2.4026, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.11}


  9%|▊         | 30/348 [00:09<01:19,  4.02it/s]

{'loss': 1.6186, 'learning_rate': 3e-06, 'epoch': 0.17}


 11%|█▏        | 40/348 [00:11<01:17,  3.98it/s]

{'loss': 1.6526, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.23}


 14%|█▍        | 50/348 [00:14<01:15,  3.94it/s]

{'loss': 1.6911, 'learning_rate': 5e-06, 'epoch': 0.29}


 17%|█▋        | 60/348 [00:16<01:12,  3.95it/s]

{'loss': 1.2954, 'learning_rate': 6e-06, 'epoch': 0.34}


 20%|██        | 70/348 [00:19<01:10,  3.93it/s]

{'loss': 0.9961, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.4}


 23%|██▎       | 80/348 [00:21<01:07,  3.98it/s]

{'loss': 0.8129, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.46}


 26%|██▌       | 90/348 [00:24<01:05,  3.95it/s]

{'loss': 0.8718, 'learning_rate': 9e-06, 'epoch': 0.52}


 29%|██▊       | 100/348 [00:26<01:02,  3.99it/s]

{'loss': 0.6233, 'learning_rate': 1e-05, 'epoch': 0.57}


 32%|███▏      | 110/348 [00:29<00:59,  3.97it/s]

{'loss': 0.5425, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.63}


 34%|███▍      | 120/348 [00:31<00:57,  3.94it/s]

{'loss': 0.4605, 'learning_rate': 1.2e-05, 'epoch': 0.69}


 37%|███▋      | 130/348 [00:34<00:54,  3.97it/s]

{'loss': 0.6533, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.75}


 40%|████      | 140/348 [00:36<00:52,  3.99it/s]

{'loss': 0.511, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.8}


 43%|████▎     | 150/348 [00:39<00:49,  4.02it/s]

{'loss': 0.501, 'learning_rate': 1.5e-05, 'epoch': 0.86}


 46%|████▌     | 160/348 [00:41<00:47,  3.99it/s]

{'loss': 0.5901, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.92}


 49%|████▉     | 170/348 [00:44<00:44,  3.98it/s]

{'loss': 0.5609, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.98}


 52%|█████▏    | 180/348 [00:46<00:41,  4.05it/s]

{'loss': 0.4145, 'learning_rate': 1.8e-05, 'epoch': 1.03}


 55%|█████▍    | 190/348 [00:49<00:39,  3.98it/s]

{'loss': 0.4595, 'learning_rate': 1.9e-05, 'epoch': 1.09}


 57%|█████▋    | 200/348 [00:51<00:37,  3.98it/s]

{'loss': 0.4746, 'learning_rate': 2e-05, 'epoch': 1.15}


 60%|██████    | 210/348 [00:54<00:34,  3.97it/s]

{'loss': 0.5221, 'learning_rate': 2.1e-05, 'epoch': 1.21}


 63%|██████▎   | 220/348 [00:56<00:32,  4.00it/s]

{'loss': 0.4581, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.26}


 66%|██████▌   | 230/348 [00:59<00:29,  3.96it/s]

{'loss': 0.4787, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.32}


 69%|██████▉   | 240/348 [01:01<00:27,  3.94it/s]

{'loss': 0.4351, 'learning_rate': 2.4e-05, 'epoch': 1.38}


 72%|███████▏  | 250/348 [01:04<00:24,  3.97it/s]

{'loss': 0.5303, 'learning_rate': 2.5e-05, 'epoch': 1.44}


 75%|███████▍  | 260/348 [01:07<00:22,  3.93it/s]

{'loss': 0.4213, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.49}


 78%|███████▊  | 270/348 [01:09<00:19,  3.98it/s]

{'loss': 0.5347, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.55}


 80%|████████  | 280/348 [01:12<00:17,  3.99it/s]

{'loss': 0.4617, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.61}


 83%|████████▎ | 290/348 [01:14<00:14,  4.00it/s]

{'loss': 0.4647, 'learning_rate': 2.9e-05, 'epoch': 1.67}


 86%|████████▌ | 300/348 [01:17<00:12,  3.97it/s]

{'loss': 0.5078, 'learning_rate': 3e-05, 'epoch': 1.72}


 89%|████████▉ | 310/348 [01:19<00:09,  3.96it/s]

{'loss': 0.4942, 'learning_rate': 3.1e-05, 'epoch': 1.78}


 92%|█████████▏| 320/348 [01:22<00:07,  3.98it/s]

{'loss': 0.5156, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.84}


 95%|█████████▍| 330/348 [01:24<00:04,  3.93it/s]

{'loss': 0.4948, 'learning_rate': 3.3e-05, 'epoch': 1.9}


 98%|█████████▊| 340/348 [01:27<00:02,  3.95it/s]

{'loss': 0.4631, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.95}


100%|██████████| 348/348 [01:29<00:00,  4.53it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 348/348 [01:29<00:00,  3.90it/s]

{'train_runtime': 89.2091, 'train_samples_per_second': 62.191, 'train_steps_per_second': 3.901, 'train_loss': 0.7603383372569906, 'epoch': 2.0}





In [29]:
metrics = trainer.evaluate()
metrics

***** Running Evaluation *****
  Num examples = 694
  Batch size = 64
100%|██████████| 11/11 [00:02<00:00,  4.64it/s]


{'eval_loss': 0.5686506032943726,
 'eval_runtime': 2.3856,
 'eval_samples_per_second': 290.914,
 'eval_steps_per_second': 4.611,
 'epoch': 2.0}

In [30]:
save_dir = './saved_models_sarcasm'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

Saving model checkpoint to ./saved_models_sarcasm
Configuration saved in ./saved_models_sarcasm\config.json
Model weights saved in ./saved_models_sarcasm\pytorch_model.bin
tokenizer config file saved in ./saved_models_sarcasm\tokenizer_config.json
Special tokens file saved in ./saved_models_sarcasm\special_tokens_map.json


('./saved_models_sarcasm\\tokenizer_config.json',
 './saved_models_sarcasm\\special_tokens_map.json',
 './saved_models_sarcasm\\vocab.json',
 './saved_models_sarcasm\\merges.txt',
 './saved_models_sarcasm\\added_tokens.json',
 './saved_models_sarcasm\\tokenizer.json')

In [31]:
predictions = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1400
  Batch size = 64
100%|██████████| 22/22 [00:06<00:00,  3.54it/s]


In [33]:
preds = []
for p in predictions.predictions:
    preds.append(np.argmax(p))

In [34]:
print('f1_score:: ',f1_score(test_dataset.labels, preds))
print(classification_report(test_dataset.labels, preds))

f1_score::  0.5287356321839081
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1200
           1       0.62      0.46      0.53       200

    accuracy                           0.88      1400
   macro avg       0.77      0.71      0.73      1400
weighted avg       0.87      0.88      0.88      1400

