In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

### Import required transformer libraries

In [None]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.15.0-py3-none-any.whl (3.4 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
Collecting pyyaml>=5.1
  Using cached PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.4.0 pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.15.0


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
from transformers import Trainer,TrainingArguments
from transformers import DistilBertTokenizerFast, BertForMaskedLM
from transformers import AutoConfig
from transformers import AutoModel

### Some needed liberaries

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
from torch import nn
from transformers import Trainer

# from dataset import load_metric

## Read Data

In [None]:
# path = 'drive/My Drive/DataLab/sarcasm/train.En.csv'
path = 'drive/My Drive/DataLab/sarcasm/train.En.csv'
path_test = 'drive/My Drive/DataLab/sarcasm/taskA.En.input.csv'
drive.mount('/content/drive')
df = pd.read_csv(path)
test = pd.read_csv(path_test)
df = df.dropna(subset=['tweet'])

Mounted at /content/drive


## Split Test and Train

In [None]:
# train, test = train_test_split(df, test_size=0.1)
train = df

In [None]:
train.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don‚Äôt write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not ‚Äúforced‚Äù to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn‚Äôt...,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
test.head()

Unnamed: 0,text
0,"Size on the the Toulouse team, That pack is mo..."
1,Pinball!
2,So the Scottish Government want people to get ...
3,villainous pro tip : change the device name on...
4,I would date any of these men ü•∫


## Extract Features and Labels

In [None]:
train_tweets = train['tweet'].values.tolist()
train_labels = train['sarcastic'].values.tolist()
test_tweets = test['text'].values.tolist()
# test_labels = test['sarcastic']

## Split the training sample into train and validation set

In [None]:
train_tweets, val_tweets, train_labels, val_labels = train_test_split(train_tweets, train_labels, 
                                                                    test_size=0.1,random_state=42,stratify=train_labels)

## Steps for Fine Tuning model

<ul>
<li>Prepare dataset</li>
<li>Load pretrained tokenizer,call it with dataset</li>
<li>Build Pytorch datasets with encodings</li>
<li>Load pretrained Model</li>
<li> Load Trainer and train it </li>
    Instead of Trainer we could've use native Pytorch training pipline.
</ul>

### Set Model Name

In [None]:
model_name = 'detecting-sarcasim'

## Tokenization

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-cased',
                                                    num_labels=2,
                                                    loss_function_params={"weight": [0.75, 0.25]}
                                                    )
# 'bert-base-cased'
# 'bert-base-uncased'
# number of labels here is 2

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


<blockquote> The difference between a ‚Äúfast‚Äù and a ‚Äúnon-fast‚Äù tokenizer is computation speed but there is no functional difference between them.
<blockqoute> FastTokenizers are implemented in Rust and are factors faster than the Python based tokenizers. Apart from that their encoding methods should behave the same. However, they are not functionally identical.</blockqoute>

In [None]:
train_encodings = tokenizer(train_tweets, truncation=True, padding=True,return_tensors = 'pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
val_encodings = tokenizer(val_tweets, truncation=True, padding=True,return_tensors = 'pt')

In [None]:
test_encodings = tokenizer(test_tweets, truncation=True, padding=True,return_tensors = 'pt')

<ul>
    <li>setting truncation = True will eliminate tokens that exceed the max_length(512) in case of BERT.</li>
    <li>setting padding =True will pad documents that have length less than max_length with empty tokens i.e. 0, ensuring that all of our sequences are padded to the same length.</li>
    <li>setting return_tensors = ‚Äòpt‚Äô will return the encodings as pytorch tensors.</li>
    <li>This will allow us to feed batches of sequences into the model at the same time.</li>
</ul>

## Turn labels and encodings into a Dataset object

<ul>
    <li>Wrap the tokenized data into a torch dataset.</li>
    <li>In PyTorch, this is done by subclassing a torch.utils.data.Dataset object and implementing len and getitem.</li>
<ul>   

In [None]:
class SarcasimDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
## Test Dataset
class SarcasimTestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item
    def __len__(self):
        return len(self.encodings)

## Genearte DataLoaders

In [None]:
train_dataset = SarcasimDataset(train_encodings, train_labels)

In [None]:
val_dataset = SarcasimDataset(val_encodings, val_labels)

In [None]:
test_dataset = SarcasimTestDataset(test_encodings)

## Define a Simple Metrics Function

In [None]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    #recall = recall_score(y_true=labels, y_pred=pred)
    #precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(labels, pred, average='weighted')

    return {"accuracy": accuracy,"f1_score":f1}

In [None]:
training_args = TrainingArguments(
    output_dir='./res', evaluation_strategy="steps", num_train_epochs=5, per_device_train_batch_size=32,
    per_device_eval_batch_size=64, warmup_steps=500, weight_decay=0.01,logging_dir='./logs4',
    #logging_steps=10,
    load_best_model_at_end=True,
)

<ul>
    <li> output_dir = output directory</li>
    <li> num_train_epochs = total number of training epochs</li>
    <li> per_device_train_batch_size = batch size per device during training</li>
    <li> per_device_eval_batch_size = batch size for evaluation</li>
    <li> warmup_steps = number of warmup steps for learning rate scheduler</li>
    <li> weight_decay = strength of weight decay</li>
    <li> logging_dir = directory for storing logs</li>
<ul>

## Fine Tuning with Trainer

In [None]:
# config = AutoConfig.from_pretrained('bert-base-uncased')
# model =  AutoModel.from_config(config)

model = DistilBertForSequenceClassification.from_pretrained("bert-base-cased",num_labels=2)
# distilbert-base-uncased
# bert-base-cased

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing DistilBertForSequenceClassification: ['bert.encoder.layer.4.intermediate.dense.bias', 'bert.encoder.layer.7.attention.output.dense.weight', 'bert.encoder.layer.6.output.dense.bias', 'bert.encoder.layer.4.attention.output.dense.bias', 'bert.encoder.layer.3.output.dense.bias', 'bert.encoder.layer.4.attention.self.value.bias', 'bert.encoder.layer.4.output.LayerNorm.bias', 'bert.encoder.layer.5.attention.self.key.weight', 'bert.encoder.layer.5.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.dense.bias', 'bert.encoder.layer.5.output.LayerNorm.bias', 'bert.encoder.layer.7.output.dense.weight', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.output.dense.weight', 'bert.encoder.layer.3.attention.self.query.weight', 'bert.encoder.layer.6.attention.self.value.weight', 'bert.encoder.layer.4.attention.self.key.weight', 'bert.encoder.layer.1.intermediate.d

In [None]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.1, 0.3]))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


<ul>
    <li> model = the instantiated hugging-face Transformers model to be trained </li>
    <li> args=training_args = training arguments, defined above</li>
    <li> train_dataset = training dataset</li>
    <li> eval_dataset = evaluation dataset</li>
<ul>

In [None]:
trainer.train()

***** Running training *****
  Num examples = 3120
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 490


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=490, training_loss=0.5101543504364636, metrics={'train_runtime': 189.6043, 'train_samples_per_second': 82.277, 'train_steps_per_second': 2.584, 'total_flos': 1018116450936000.0, 'train_loss': 0.5101543504364636, 'epoch': 5.0})

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 347
  Batch size = 64


{'epoch': 5.0,
 'eval_accuracy': 0.6801152737752162,
 'eval_f1_score': 0.6711017062736543,
 'eval_loss': 0.6923450231552124,
 'eval_runtime': 1.2987,
 'eval_samples_per_second': 267.193,
 'eval_steps_per_second': 4.62}

In [None]:
# test['sarcastic'] = 0
# test_tweets = test['tweet'].values.tolist() 
# test_labels = test['sarcastic'].values.tolist() 
# test_encodings = tokenizer(test_tweets,
#                            truncation=True, 
#                            padding=True,
#                            return_tensors = 'pt').to("cuda") 
# test_dataset = SentimentDataset(test_encodings, test_labels)

## Test

In [None]:
test_dataset

<__main__.SarcasimTestDataset at 0x7f9e6d87c390>

In [None]:
# pin_memory=False
preds = trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 2
  Batch size = 64


In [None]:
probs = torch.from_numpy(preds[0]).softmax(1)

# convert tensors to numpy array
predictions = probs.numpy()

In [None]:
newdf = pd.DataFrame(predictions,columns=['Negative_1','Positive_2'])

In [None]:
newdf.head()

Unnamed: 0,Negative_1,Positive_2
0,0.606067,0.393933
1,0.584924,0.415076


In [None]:
def labels(x):
  if x == 0:
    return 0
  else:
    return 1

results = np.argmax(predictions,axis=1)
# test['sarcastic'] = results
# test['sarcastic'] = test['sarcastic'].map(labels)
# test['sarcastic_result'] =  test['sarcastic'].map(labels)
test.head()

Unnamed: 0,text
0,"Size on the the Toulouse team, That pack is mo..."
1,Pinball!
2,So the Scottish Government want people to get ...
3,villainous pro tip : change the device name on...
4,I would date any of these men ü•∫


In [None]:
predictions[0][0]/(predictions[0][0]+(0.5*(predictions[0][1] + predictions[1][0])))

0.6522184449004566

In [None]:
from sklearn.metrics import f1_score
f1_score(test_labels, test['sarcastic_result'])

1.0

In [None]:
model.predict(test_dataset) 

AttributeError: ignored