In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

### Import required transformer libraries

In [34]:
!pip install transformers



In [35]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
from transformers import Trainer,TrainingArguments
from transformers import DistilBertTokenizerFast, BertForMaskedLM

### Some needed liberaries

In [36]:
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
# from dataset import load_metric

## Read Data

In [5]:
# path = 'drive/My Drive/DataLab/sarcasm/train.En.csv'
# drive.mount('/content/drive')
df = pd.read_csv('train.En.csv')
df = df.dropna(subset=['tweet'])

## Split Test and Train

In [37]:
train, test = train_test_split(df, test_size=0.65)

In [38]:
train.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
2002,2002,A nice thing about not sharing my office with ...,0,,,,,,,
943,943,I haven’t seen my best friend in over a year I...,0,,,,,,,
822,822,@DetroitPistons @PistonsGT Cade Cunningham is ...,1,"""I'm not a Piston's fan but it would be really...",1.0,0.0,1.0,0.0,0.0,0.0
909,909,@BioMarkDarrah @Bio_Warner @GambleMike @BenIrv...,0,,,,,,,
181,181,hey quick question is it fall or spring???,1,Why is it so frickin warm in November someone lmk,0.0,1.0,0.0,0.0,0.0,1.0


In [39]:
test.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
1602,1602,Less than 5 minutes after getting my phone bac...,0,,,,,,,
805,805,Love seeing 15 year olds being vaccinated in t...,1,I don't like the fact that younger people in a...,1.0,0.0,0.0,0.0,0.0,0.0
864,864,I get a lot of boy who cried wolf vibes from t...,1,The red cross is always needy.,0.0,1.0,0.0,0.0,0.0,0.0
321,321,im about to just walk into a place and start w...,1,I wish someone would hire me so I don't have t...,1.0,0.0,0.0,0.0,0.0,0.0
1299,1299,"Well, my vaccinated therapist tested positive ...",0,,,,,,,


## Extract Features and Labels

In [40]:
train_tweets = train['tweet'].values.tolist()
train_labels = train['sarcastic'].values.tolist()
test_tweets = test['tweet'].values.tolist()

## Split the training sample into train and validation set

In [58]:
train_tweets, val_tweets, train_labels, val_labels = train_test_split(train_tweets, train_labels, 
                                                                    test_size=0.2,random_state=42,stratify=train_labels)

## Steps for Fine Tuning model

<ul>
<li>Prepare dataset</li>
<li>Load pretrained tokenizer,call it with dataset</li>
<li>Build Pytorch datasets with encodings</li>
<li>Load pretrained Model</li>
<li> Load Trainer and train it </li>
    Instead of Trainer we could've use native Pytorch training pipline.
</ul>

### Set Model Name

In [59]:
model_name = 'detecting-sarcasim'

## Tokenization

In [60]:
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-cased',num_labels=2)

# number of labels here is 2

loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
loading file https://huggingface.co/bert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
lo

<blockquote> The difference between a “fast” and a “non-fast” tokenizer is computation speed but there is no functional difference between them.
<blockqoute> FastTokenizers are implemented in Rust and are factors faster than the Python based tokenizers. Apart from that their encoding methods should behave the same. However, they are not functionally identical.</blockqoute>

In [61]:
train_encodings = tokenizer(train_tweets, truncation=True, padding=True,return_tensors = 'pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [62]:
val_encodings = tokenizer(val_tweets, truncation=True, padding=True,return_tensors = 'pt')

In [63]:
test_encodings = tokenizer(test_tweets, truncation=True, padding=True,return_tensors = 'pt')

<ul>
    <li>setting truncation = True will eliminate tokens that exceed the max_length(512) in case of BERT.</li>
    <li>setting padding =True will pad documents that have length less than max_length with empty tokens i.e. 0, ensuring that all of our sequences are padded to the same length.</li>
    <li>setting return_tensors = ‘pt’ will return the encodings as pytorch tensors.</li>
    <li>This will allow us to feed batches of sequences into the model at the same time.</li>
</ul>

## Turn labels and encodings into a Dataset object

<ul>
    <li>Wrap the tokenized data into a torch dataset.</li>
    <li>In PyTorch, this is done by subclassing a torch.utils.data.Dataset object and implementing len and getitem.</li>
<ul>   

In [64]:
class SarcasimDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
## Test Dataset
class SarcasimTestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item
    def __len__(self):
        return len(self.encodings)

## Genearte DataLoaders

In [65]:
train_dataset = SarcasimDataset(train_encodings, train_labels)

In [66]:
val_dataset = SarcasimDataset(val_encodings, val_labels)

In [67]:
test_dataset = SarcasimTestDataset(test_encodings)

## Define a Simple Metrics Function

In [68]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    #recall = recall_score(y_true=labels, y_pred=pred)
    #precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(labels, pred, average='weighted')

    return {"accuracy": accuracy,"f1_score":f1}

In [69]:
training_args = TrainingArguments(
    output_dir='./res', evaluation_strategy="steps", num_train_epochs=5, per_device_train_batch_size=32,
    per_device_eval_batch_size=64, warmup_steps=500, weight_decay=0.01,logging_dir='./logs4',
    #logging_steps=10,
    load_best_model_at_end=True,
)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


<ul>
    <li> output_dir = output directory</li>
    <li> num_train_epochs = total number of training epochs</li>
    <li> per_device_train_batch_size = batch size per device during training</li>
    <li> per_device_eval_batch_size = batch size for evaluation</li>
    <li> warmup_steps = number of warmup steps for learning rate scheduler</li>
    <li> weight_decay = strength of weight decay</li>
    <li> logging_dir = directory for storing logs</li>
<ul>

## Fine Tuning with Trainer

In [70]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=2)
# distilbert-base-uncased
# bert-base-cased

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.15.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9

In [71]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.1, 0.3]))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [72]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

<ul>
    <li> model = the instantiated hugging-face Transformers model to be trained </li>
    <li> args=training_args = training arguments, defined above</li>
    <li> train_dataset = training dataset</li>
    <li> eval_dataset = evaluation dataset</li>
<ul>

In [73]:
trainer.train()

***** Running training *****
  Num examples = 872
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 140


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=140, training_loss=0.5858716147286551, metrics={'train_runtime': 23.823, 'train_samples_per_second': 183.016, 'train_steps_per_second': 5.877, 'total_flos': 118444482626400.0, 'train_loss': 0.5858716147286551, 'epoch': 5.0})

In [74]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 219
  Batch size = 64


{'epoch': 5.0,
 'eval_accuracy': 0.7534246575342466,
 'eval_f1_score': 0.6474743150684933,
 'eval_loss': 0.5559355020523071,
 'eval_runtime': 0.3894,
 'eval_samples_per_second': 562.392,
 'eval_steps_per_second': 10.272}

In [75]:
# test['sarcastic'] = 0
# test_tweets = test['tweet'].values.tolist() 
# test_labels = test['sarcastic'].values.tolist() 
# test_encodings = tokenizer(test_tweets,
#                            truncation=True, 
#                            padding=True,
#                            return_tensors = 'pt').to("cuda") 
# test_dataset = SentimentDataset(test_encodings, test_labels)

## Test

In [76]:
pin_memory=False
preds = trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 2
  Batch size = 64


In [77]:
probs = torch.from_numpy(preds[0]).softmax(1)

# convert tensors to numpy array
predictions = probs.numpy()

In [78]:
newdf = pd.DataFrame(predictions,columns=['Neutral_1','Positive_2'])

In [79]:
newdf.head()

Unnamed: 0,Neutral_1,Positive_2
0,0.752623,0.247377
1,0.756322,0.243678


In [80]:
def labels(x):
  if x == 0:
    return 'Negative_1'
  else:
    return 'Positive_1'

results = np.argmax(predictions,axis=1)
# test['sarcastic'] = results
test['sarcastic'] = test['sarcastic'].map(labels)
test.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
1602,1602,Less than 5 minutes after getting my phone bac...,Negative_1,,,,,,,
805,805,Love seeing 15 year olds being vaccinated in t...,Positive_1,I don't like the fact that younger people in a...,1.0,0.0,0.0,0.0,0.0,0.0
864,864,I get a lot of boy who cried wolf vibes from t...,Positive_1,The red cross is always needy.,0.0,1.0,0.0,0.0,0.0,0.0
321,321,im about to just walk into a place and start w...,Positive_1,I wish someone would hire me so I don't have t...,1.0,0.0,0.0,0.0,0.0,0.0
1299,1299,"Well, my vaccinated therapist tested positive ...",Negative_1,,,,,,,
