In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

### Import required transformer libraries

In [None]:
# Install Hugging Face transformers into the runtime. No version is pinned
# here; the recorded output of this cell shows 4.15.0 was resolved when it
# was last run.
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 66.6 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
from transformers import Trainer,TrainingArguments
from transformers import DistilBertTokenizerFast, BertForMaskedLM
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

### Other required libraries

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
from torch import nn
from transformers import Trainer

# from dataset import load_metric

## Read Data

In [None]:
# Mount Google Drive first so the CSV below is reachable from the runtime.
drive.mount('/content/drive')

# Location of the English training file (relative path, no leading slash).
path = 'drive/My Drive/DataLab/sarcasm/train.En.csv'

# Load the data and discard rows that have no tweet text.
df = pd.read_csv(path).dropna(subset=['tweet'])

Mounted at /content/drive


## Split Test and Train

In [None]:
# Hold out 10% of the rows as a test set.
# A fixed seed makes the split reproducible across kernel restarts, and
# stratifying on the label keeps the sarcastic/non-sarcastic ratio the same
# in both parts (same settings as the train/validation split further down).
train, test = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['sarcastic'],
)

In [None]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
1875,1875,True bliss is laying in an ice cold bath durin...,0,,,,,,,
1036,1036,"this narrative of ""protection"" that women requ...",0,,,,,,,
2928,2928,The fact that I went to high school with someo...,0,,,,,,,
844,844,Old people don’t deserve rights,1,old people suck,1.0,0.0,0.0,0.0,0.0,0.0
2592,2592,The Wanted Making a comeback has officially ma...,0,,,,,,,


In [None]:
# Preview the first rows of the held-out test split.
test.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
3068,3068,Omg flashbacks to emma blackery girl balls mer...,0,,,,,,,
1364,1364,What I’d do to be able to get the sims on my S...,0,,,,,,,
2946,2946,I work in DC. I live 15 minutes from the Capit...,0,,,,,,,
3213,3213,feeling so nostalgic today,0,,,,,,,
124,124,did you actually fly on a plane if you didn’t ...,1,You don’t need to post a picture from airplane...,1.0,0.0,0.0,0.0,0.0,1.0


## Extract Features and Labels

In [None]:
# Pull the raw tweet text (features) and the binary `sarcastic` flag (labels)
# out of each split as plain Python lists.
train_tweets = train['tweet'].values.tolist()
train_labels = train['sarcastic'].values.tolist()
test_tweets = test['tweet'].values.tolist()
# Converted to a list like the other three, instead of being left as a
# pandas Series that still carries the shuffled DataFrame index.
test_labels = test['sarcastic'].values.tolist()

## Split the training sample into train and validation set

In [None]:
# Carve a stratified 10% validation set out of the training data.
# The training lists are rebound to the remaining 90%.
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    train_tweets,
    train_labels,
    test_size=0.1,
    random_state=42,
    stratify=train_labels,
)

## Steps for Fine Tuning model

<ul>
<li>Prepare dataset</li>
<li>Load pretrained tokenizer,call it with dataset</li>
<li>Build Pytorch datasets with encodings</li>
<li>Load pretrained Model</li>
<li> Load Trainer and train it </li>
    Instead of Trainer, we could have used a native PyTorch training pipeline.
</ul>

### Set Model Name

In [None]:
# Short identifier for the model being fine-tuned in this notebook.
model_name = 'detecting-sarcasim'

## Tokenization

In [None]:
# Hub id of the checkpoint to start from, built from the task name.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# The tokenizer only converts text into token ids. The `num_labels` and
# `loss_function_params` arguments that used to be passed here were dropped:
# a tokenizer has neither a classification head nor a loss, so they had no
# effect on tokenization or on training.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

<blockquote>The main difference between a “fast” and a “non-fast” tokenizer is computation speed.</blockquote>
<blockquote>Fast tokenizers are implemented in Rust and are several times faster than the Python-based tokenizers. Apart from that, their encoding methods should behave the same, although the two implementations are not guaranteed to be functionally identical.</blockquote>

In [None]:
# Tokenize the training tweets into padded PyTorch tensors.
# `max_length` is given explicitly: without it the tokenizer warns that no
# maximum length is known for this checkpoint and silently skips truncation.
train_encodings = tokenizer(
    train_tweets,
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors='pt',
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Tokenize the validation tweets into padded PyTorch tensors.
# `max_length` is given explicitly: without it the tokenizer warns that no
# maximum length is known for this checkpoint and silently skips truncation.
val_encodings = tokenizer(
    val_tweets,
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Tokenize the test tweets into padded PyTorch tensors.
# `max_length` is given explicitly: without it the tokenizer warns that no
# maximum length is known for this checkpoint and silently skips truncation.
test_encodings = tokenizer(
    test_tweets,
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors='pt',
)

<ul>
    <li>Setting truncation=True truncates sequences that exceed the maximum length (512 tokens for BERT-style models).</li>
    <li>Setting padding=True pads shorter sequences with the tokenizer's padding token up to the length of the longest sequence in the batch, ensuring that all of our sequences have the same length.</li>
    <li>setting return_tensors = ‘pt’ will return the encodings as pytorch tensors.</li>
    <li>This will allow us to feed batches of sequences into the model at the same time.</li>
</ul>

## Turn labels and encodings into a Dataset object

<ul>
    <li>Wrap the tokenized data into a torch dataset.</li>
    <li>In PyTorch, this is done by subclassing a torch.utils.data.Dataset object and implementing len and getitem.</li>
</ul>

In [None]:
class SarcasimDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name (for example ``input_ids``) to an
            indexable collection holding one entry per example.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the fields of example ``idx`` plus its label under ``'labels'``."""
        # as_tensor() accepts both lists and tensors; detach().clone() then
        # yields an independent copy. Calling torch.tensor() on something that
        # is already a tensor emits a copy-construct UserWarning instead.
        item = {
            key: torch.as_tensor(val[idx]).detach().clone()
            for key, val in self.encodings.items()
        }
        item['labels'] = torch.as_tensor(self.labels[idx]).detach().clone()
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)
    
## Test Dataset
class SarcasimTestDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings only (no labels).

    Args:
        encodings: Mapping from field name (for example ``input_ids``) to an
            indexable collection holding one entry per example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return every encoded field of example ``idx`` as a tensor."""
        # as_tensor() accepts both lists and tensors; detach().clone() then
        # yields an independent copy. Calling torch.tensor() on something that
        # is already a tensor emits a copy-construct UserWarning instead.
        item = {
            key: torch.as_tensor(val[idx]).detach().clone()
            for key, val in self.encodings.items()
        }
        return item

    def __len__(self):
        """Number of examples (rows), not the number of encoding fields."""
        # len(self.encodings) counts the mapping's keys (input_ids,
        # attention_mask, ...), which made prediction run on only that many
        # rows. Measure the length of one field instead.
        first_field = next(iter(self.encodings.values()), ())
        return len(first_field)

## Generate Datasets

In [None]:
# Wrap the training encodings and their labels in a torch Dataset.
train_dataset = SarcasimDataset(train_encodings, train_labels)

In [None]:
# Wrap the validation encodings and their labels in a torch Dataset.
val_dataset = SarcasimDataset(val_encodings, val_labels)

In [None]:
# Wrap the test encodings in the label-free Dataset used for prediction.
test_dataset = SarcasimTestDataset(test_encodings)

## Define a Simple Metrics Function

In [None]:
def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: Object that unpacks into ``(scores, labels)``. ``scores`` is
            reduced with an argmax over axis 1 to obtain the predicted class.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1_score"``.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "f1_score": f1_score(gold, predicted, average='weighted'),
    }

In [None]:
training_args = TrainingArguments(
    output_dir='./res',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluate, checkpoint and log once per epoch. With the step-based
    # defaults (every 500 steps) nothing was evaluated or saved during a run
    # that only lasts a few hundred optimization steps, so the metrics table
    # stayed empty and `load_best_model_at_end` had no checkpoint to restore.
    # The save strategy must match the evaluation strategy for
    # `load_best_model_at_end` to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Keep the warmup well below the total number of optimization steps;
    # a 500-step warmup is longer than the whole run, so the learning rate
    # would still be ramping up when training stops.
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs4',
    load_best_model_at_end=True,
)

<ul>
    <li> output_dir = output directory</li>
    <li> num_train_epochs = total number of training epochs</li>
    <li> per_device_train_batch_size = batch size per device during training</li>
    <li> per_device_eval_batch_size = batch size for evaluation</li>
    <li> warmup_steps = number of warmup steps for learning rate scheduler</li>
    <li> weight_decay = strength of weight decay</li>
    <li> logging_dir = directory for storing logs</li>
</ul>

## Fine Tuning with Trainer

In [None]:
# Load the pretrained encoder with a fresh 2-class head (not sarcastic /
# sarcastic). The checkpoint ships a 3-class sentiment head, so the head
# weights cannot be reused; `ignore_mismatched_sizes=True` lets the loader
# re-initialise them instead of raising a size-mismatch error.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=2,
    ignore_mismatched_sizes=True,
)


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

### Saving model 

this part wasn't used in bert session

In [None]:
# Save under the notebook's own model name rather than under MODEL.
# Saving to MODEL creates a local directory named exactly like the Hub id;
# on the next run `from_pretrained(MODEL)` would resolve to that directory
# (which holds no tokenizer files) instead of the Hub checkpoint.
model.save_pretrained(model_name)

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy."""

    # Per-class loss weights, indexed by label id.
    class_weights = (0.1, 0.3)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping passed to the model; must contain ``labels``.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra keyword arguments are accepted and ignored, so the
                override keeps working if the caller passes additional
                options to ``compute_loss``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weights on the same device as the logits; a CPU weight
        # tensor fails with a device mismatch when the model runs on GPU.
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Build the trainer from the model, the training arguments, the train and
# validation datasets and the metrics callback.
# NOTE(review): this instantiates the stock `Trainer`, not the `CustomTrainer`
# class defined in the previous cell, so the class-weighted loss is not
# applied - confirm which one is intended.
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


<ul>
    <li> model = the instantiated hugging-face Transformers model to be trained </li>
    <li> args=training_args = training arguments, defined above</li>
    <li> train_dataset = training dataset</li>
    <li> eval_dataset = evaluation dataset</li>
</ul>

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 2808
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 440


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=440, training_loss=0.4919604561545632, metrics={'train_runtime': 194.137, 'train_samples_per_second': 72.32, 'train_steps_per_second': 2.266, 'total_flos': 1031754045001680.0, 'train_loss': 0.4919604561545632, 'epoch': 5.0})

In [None]:
# Evaluate on the validation dataset passed to the trainer as `eval_dataset`.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 312
  Batch size = 64


{'epoch': 5.0,
 'eval_accuracy': 0.7756410256410257,
 'eval_f1_score': 0.7442989967765965,
 'eval_loss': 0.8274389505386353,
 'eval_runtime': 1.0748,
 'eval_samples_per_second': 290.295,
 'eval_steps_per_second': 4.652}

In [None]:
# test['sarcastic'] = 0
# test_tweets = test['tweet'].values.tolist() 
# test_labels = test['sarcastic'].values.tolist() 
# test_encodings = tokenizer(test_tweets,
#                            truncation=True, 
#                            padding=True,
#                            return_tensors = 'pt').to("cuda") 
# test_dataset = SentimentDataset(test_encodings, test_labels)

## Test

In [None]:
# Plain module-level variable; it is not passed to the trainer or to
# predict() below.
# NOTE(review): presumably meant to disable DataLoader memory pinning - confirm.
pin_memory=False
# Run inference over the label-free test dataset.
preds = trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 2
  Batch size = 64


In [None]:
# Turn the raw model outputs (first element of the prediction result) into
# per-class probabilities with a softmax over dimension 1.
probs = torch.softmax(torch.from_numpy(preds[0]), dim=1)

# NumPy version of the probabilities for the pandas / NumPy cells below.
predictions = probs.numpy()

In [None]:
# Tabulate the predicted probabilities, one column per class.
newdf = pd.DataFrame(predictions,columns=['Negative_1','Positive_2'])

In [None]:
# Preview the first rows of the probability table.
newdf.head()

Unnamed: 0,Negative_1,Positive_2
0,0.953176,0.046824
1,0.969698,0.030302


In [None]:
def labels(x):
  """Collapse a class id to a binary flag: 0 stays 0, anything else becomes 1."""
  return 0 if x == 0 else 1

# Predicted class per test tweet: index of the highest probability.
results = np.argmax(predictions, axis=1)

# Fail loudly if the model did not produce one prediction per test row,
# rather than letting pandas misalign or broadcast the values.
if len(results) != len(test):
    raise ValueError(
        f"Got {len(results)} predictions for {len(test)} test rows; "
        "check that the test dataset's __len__ reports the number of examples."
    )

# Store the model's predictions. The column used to be built from the
# ground-truth `sarcastic` column, which made every downstream score a
# comparison of the labels with themselves.
test = test.assign(sarcastic_result=[labels(r) for r in results])
test.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question,sarcastic_result
2700,2700,i still can't believe maisie and i are seeing ...,0,,,,,,,,0
1756,1756,this Brazilian dude always likes my pictures o...,0,,,,,,,,0
1186,1186,https://t.co/jpgi5N4U9C,0,,,,,,,,0
3352,3352,@NotMikeRNG @DynastyDegener1 @myost73 @nolan_s...,0,,,,,,,,0
469,469,Hands up who remembers when EVERY blogger and ...,1,Who remembers when some bloggers were brand am...,1.0,0.0,0.0,0.0,1.0,0.0,1


In [None]:
# Ad-hoc ratio built from the predicted probabilities of the first two test
# rows only.
# NOTE(review): the expression has the shape of F1 = TP / (TP + 0.5 * (FP + FN)),
# but its operands are softmax probabilities, not confusion-matrix counts -
# confirm what this is meant to measure.
predictions[0][0]/(predictions[0][0]+(0.5*(predictions[0][1] + predictions[1][0])))

0.6522184449004566

In [None]:
from sklearn.metrics import f1_score
# Score the `sarcastic_result` column against the held-out test labels.
# No `average` argument is passed here, whereas compute_metrics passes
# average='weighted', so the two F1 figures are not directly comparable.
f1_score(test_labels, test['sarcastic_result'])

1.0

In [None]:
# Hugging Face PyTorch models have no `.predict()` method (the original call
# raised AttributeError). Tokenize the tweets and run a forward pass instead.
direct_inputs = tokenizer(
    test['tweet'].tolist(),
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors='pt',
).to(model.device)  # inputs must live on the same device as the model

model.eval()  # disable dropout for inference
with torch.no_grad():  # no gradients needed, saves memory
    direct_probs = model(**direct_inputs).logits.softmax(dim=1)

# Show the class probabilities of the first few tweets only.
direct_probs[:5].cpu().numpy()

AttributeError: ignored