In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

### Import required transformer libraries

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 22.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 22.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 338 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [3]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
from transformers import Trainer,TrainingArguments
from transformers import DistilBertTokenizerFast, BertForMaskedLM

### Some needed liberaries

In [6]:
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive
# from dataset import load_metric

## Read Data

In [17]:
path = 'drive/My Drive/DataLab/sarcasm/train.En.csv'
drive.mount('/content/drive')
df = pd.read_csv(path)
df = df.dropna(subset=['tweet'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Split Test and Train

In [18]:
train, test = train_test_split(df, test_size=0.1)

In [19]:
train.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
1494,1494,I reckon zoella will get engaged in New York t...,0,,,,,,,
1877,1877,"""You think you're better than us because you'r...",0,,,,,,,
2897,2897,Only food can turn this day around,0,,,,,,,
231,231,is it illegal to discriminate against someone ...,1,This guy has a joker tattoo and that's off-put...,1.0,0.0,0.0,0.0,0.0,0.0
3117,3117,THREE of my friends are doing their vivas this...,0,,,,,,,


In [20]:
test.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
2812,2812,eddie kendricks discography is crazy,0,,,,,,,
492,492,Every time you complain about neopronouns or x...,1,Stop complaining about peoples individual gend...,1.0,0.0,0.0,0.0,0.0,0.0
1361,1361,Got razor burn on my mf armpits so that’s it t...,0,,,,,,,
541,541,its 3 30 am and I have decided the only redeem...,1,I love putting my freezing cold toes on men to...,1.0,0.0,0.0,0.0,1.0,0.0
1201,1201,☯️&amp;🤍 https://t.co/Uz2XTyp9Dg,0,,,,,,,


## Extract Features and Labels

In [21]:
train_tweets = train['tweet'].values.tolist()
train_labels = train['sarcastic'].values.tolist()
test_tweets = test['tweet'].values.tolist()

## Split the training sample into train and validation set

In [22]:
train_tweets, val_tweets, train_labels, val_labels = train_test_split(train_tweets, train_labels, 
                                                                    test_size=0.1,random_state=42,stratify=train_labels)

## Steps for Fine Tuning model

<ul>
<li>Prepare dataset</li>
<li>Load pretrained tokenizer,call it with dataset</li>
<li>Build Pytorch datasets with encodings</li>
<li>Load pretrained Model</li>
<li> Load Trainer and train it </li>
    Instead of Trainer we could've use native Pytorch training pipline.
</ul>

### Set Model Name

In [23]:
model_name = 'detecting-sarcasim'

## Tokenization

In [24]:
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-cased',num_labels=2)

# number of labels here is 2

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


<blockquote> The difference between a “fast” and a “non-fast” tokenizer is computation speed but there is no functional difference between them.
<blockqoute> FastTokenizers are implemented in Rust and are factors faster than the Python based tokenizers. Apart from that their encoding methods should behave the same. However, they are not functionally identical.</blockqoute>

In [25]:
train_encodings = tokenizer(train_tweets, truncation=True, padding=True,return_tensors = 'pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [26]:
val_encodings = tokenizer(val_tweets, truncation=True, padding=True,return_tensors = 'pt')

In [27]:
test_encodings = tokenizer(test_tweets, truncation=True, padding=True,return_tensors = 'pt')

<ul>
    <li>setting truncation = True will eliminate tokens that exceed the max_length(512) in case of BERT.</li>
    <li>setting padding =True will pad documents that have length less than max_length with empty tokens i.e. 0, ensuring that all of our sequences are padded to the same length.</li>
    <li>setting return_tensors = ‘pt’ will return the encodings as pytorch tensors.</li>
    <li>This will allow us to feed batches of sequences into the model at the same time.</li>
</ul>

## Turn lavles and encodings into a Dataset object

<ul>
    <li>Wrap the tokenized data into a torch dataset.</li>
    <li>In PyTorch, this is done by subclassing a torch.utils.data.Dataset object and implementing len and getitem.</li>
<ul>   

In [28]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
## Test Dataset
class SentimentTestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item
    def __len__(self):
        return len(self.encodings)

## Genearte DataLoaders

In [29]:
train_dataset = SentimentDataset(train_encodings, train_labels)

In [30]:
val_dataset = SentimentDataset(val_encodings, val_labels)

In [31]:
test_dataset = SentimentTestDataset(test_encodings)

## Define a Simple Metrics Function

In [32]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    #recall = recall_score(y_true=labels, y_pred=pred)
    #precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(labels, pred, average='weighted')

    return {"accuracy": accuracy,"f1_score":f1}

In [33]:
training_args = TrainingArguments(
    output_dir='./res', evaluation_strategy="steps", num_train_epochs=5, per_device_train_batch_size=32,
    per_device_eval_batch_size=64, warmup_steps=500, weight_decay=0.01,logging_dir='./logs4',
    #logging_steps=10,
    load_best_model_at_end=True,
)

<ul>
    <li> output_dir = output directory</li>
    <li> num_train_epochs = total number of training epochs</li>
    <li> per_device_train_batch_size = batch size per device during training</li>
    <li> per_device_eval_batch_size = batch size for evaluation</li>
    <li> warmup_steps = number of warmup steps for learning rate scheduler</li>
    <li> weight_decay = strength of weight decay</li>
    <li> logging_dir = directory for storing logs</li>
<ul>

## Fine Tuning with Trainer

In [34]:
model = DistilBertForSequenceClassification.from_pretrained("bert-base-cased",num_labels=2)
# distilbert-base-uncased

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing DistilBertForSequenceClassification: ['bert.encoder.layer.8.output.dense.bias', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.2.intermediate.dense.weight', 'bert.encoder.layer.6.attention.self.query.weight', 'bert.encoder.layer.9.attention.self.value.bias', 'bert.pooler.dense.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.intermediate.dense.bias', 'bert.encoder.layer.8.output.dense.weight', 'bert.encoder.layer.8.attention.output.dense.bias', 'bert.encoder.layer.11.output.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'bert.encoder.layer.7.attention.output.dense.bias', 'bert.encoder.layer.8.intermediate.dense.weight', 'bert.encoder.layer.0.output.dense.weight', 'cls.seq_relationship.bias', 'bert.encoder.layer.2.attention.self.key.weight', 'bert.encoder.layer.3.intermediate.dense.bias', 'bert.encoder

In [35]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

<ul>
    <li> model = the instantiated hugging-face Transformers model to be trained </li>
    <li> args=training_args = training arguments, defined above</li>
    <li> train_dataset = training dataset</li>
    <li> eval_dataset = evaluation dataset</li>
<ul>

In [36]:
trainer.train()

***** Running training *****
  Num examples = 2808
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 440


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=440, training_loss=0.5262665835293856, metrics={'train_runtime': 582.8211, 'train_samples_per_second': 24.09, 'train_steps_per_second': 0.755, 'total_flos': 916304805842400.0, 'train_loss': 0.5262665835293856, 'epoch': 5.0})

In [37]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 312
  Batch size = 64


{'epoch': 5.0,
 'eval_accuracy': 0.6858974358974359,
 'eval_f1_score': 0.668867202449292,
 'eval_loss': 0.6081863045692444,
 'eval_runtime': 3.4412,
 'eval_samples_per_second': 90.666,
 'eval_steps_per_second': 1.453}

In [None]:
# test['sarcastic'] = 0
# test_tweets = test['tweet'].values.tolist() 
# test_labels = test['sarcastic'].values.tolist() 
# test_encodings = tokenizer(test_tweets,
#                            truncation=True, 
#                            padding=True,
#                            return_tensors = 'pt').to("cuda") 
# test_dataset = SentimentDataset(test_encodings, test_labels)

## Test

In [38]:
pin_memory=False
preds = trainer.predict(test_dataset=test_dataset)

***** Running Prediction *****
  Num examples = 2
  Batch size = 64


In [40]:
probs = torch.from_numpy(preds[0]).softmax(1)

# convert tensors to numpy array
predictions = probs.numpy()

In [43]:
newdf = pd.DataFrame(predictions,columns=['Neutral_1','Positive_2'])

In [44]:
newdf.head()

Unnamed: 0,Neutral_1,Positive_2
0,0.939547,0.060453
1,0.718229,0.281771


In [48]:
def labels(x):
  if x == 0:
    return 'Negative_1'
  else:
    return 'Positive_1'

results = np.argmax(predictions,axis=1)
# test['sarcastic'] = results
test['sarcastic'] = test['sarcastic'].map(labels)
test.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
2812,2812,eddie kendricks discography is crazy,Positive_1,,,,,,,
492,492,Every time you complain about neopronouns or x...,Positive_1,Stop complaining about peoples individual gend...,1.0,0.0,0.0,0.0,0.0,0.0
1361,1361,Got razor burn on my mf armpits so that’s it t...,Positive_1,,,,,,,
541,541,its 3 30 am and I have decided the only redeem...,Positive_1,I love putting my freezing cold toes on men to...,1.0,0.0,0.0,0.0,1.0,0.0
1201,1201,☯️&amp;🤍 https://t.co/Uz2XTyp9Dg,Positive_1,,,,,,,
