## libraries

In [2]:
#!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [3]:
import torch.nn.functional as F 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
from tensorflow.keras.optimizers import Adam
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from tqdm import trange
import random

## Load Pretrained model

In [4]:
# Tokenizer and sequence-classification model are loaded from the same Hub checkpoint,
# so the name is written once and shared by both calls.
checkpoint = "assemblyai/bert-large-uncased-sst2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)


Downloading (…)okenizer_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/621 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [None]:
# Sanity check: score a single sentence with the checkpoint as loaded.
tokenized_segments = tokenizer(
    ["It's important to remember that healing from the pain of being hurt is a process"],
    return_tensors="pt",
    padding=True,
    truncation=True,
)
tokenized_segments_input_ids = tokenized_segments.input_ids
tokenized_segments_attention_mask = tokenized_segments.attention_mask

# Inference only: disable autograd so no computation graph is built for this forward pass
# (matches what the `test` helper further down does).
with torch.no_grad():
    logits = model(
        input_ids=tokenized_segments_input_ids,
        attention_mask=tokenized_segments_attention_mask,
    )['logits']

# Softmax over the class dimension turns the logits into per-class probabilities.
model_predictions = F.softmax(logits, dim=1)


In [None]:
# Column 1 is reported as the positive class, column 0 as the negative class.
positive_pct = model_predictions[0][1].item() * 100
negative_pct = model_predictions[0][0].item() * 100
print(f"Positive probability: {positive_pct}%")
print(f"Negative probability: {negative_pct}%")

Positive probability: 97.50133156776428%
Negative probability: 2.4986738339066505%


## Load Data

In [None]:
# Read the labelled sentences; the `sentiment` column supplies the targets.
csv_path = 'Book1.csv'
data = pd.read_csv(csv_path)
labels = data["sentiment"].values
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,text,sentiment
0,"It's hard to forgive someone who hurt you, but...",0
1,The pain of being hurt by someone you love is ...,0
2,Seeing you hurt breaks my heart into pieces.,1
3,Whoever hurt you deserves to be punished.,1
4,You don't deserve to be treated poorly by anyone.,1


## Retrain the model

In [None]:
# input ids: a sequence of integers identifying each input token by its index in the tokenizer vocabulary
# attention mask: a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens
#
# All sentences are tokenized in one batched call. `padding="max_length"` and
# `truncation=True` replace the deprecated `pad_to_max_length=True` and the implicit
# truncation the tokenizer used to warn about; every row is padded/truncated to 32 tokens.
tokenized_segments = tokenizer(
    data["text"].tolist(),  # positional values, so this does not depend on the DataFrame index
    return_tensors="pt",
    max_length=32,
    padding="max_length",
    truncation=True,
)

# Keep the original structure (one (1, 32) tensor per sentence) so the following
# `torch.cat(..., dim=0)` cell works unchanged.
tokenized_segments_input_idss = list(tokenized_segments.input_ids.split(1))
tokenized_segments_attention_masks = list(tokenized_segments.attention_mask.split(1))
 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Stack the per-sentence (1, seq_len) tensors into (n_sentences, seq_len) tensors.
token_id = torch.cat(tokenized_segments_input_idss, dim = 0)
attention_masks = torch.cat(tokenized_segments_attention_masks, dim = 0)

# Rebuild the label tensor from the DataFrame rather than from `labels` itself, so that
# re-running this cell does not feed an existing tensor back into `torch.tensor`.
labels = torch.tensor(data["sentiment"].values)

# Every sentence must have exactly one label and one attention mask.
assert token_id.shape[0] == attention_masks.shape[0] == labels.shape[0], "inputs and labels are misaligned"

In [None]:
# --- Data split ---
lenth_data = len(labels)   # total number of labelled examples
test_ratio = 0.2           # fraction of examples held out for validation

# --- Optimisation ---
epochs = 3                 # full passes over the training set
bt = 16                    # batch size used by both DataLoaders
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-08,
)


In [None]:
# Fixed seed so the train/validation partition is the same on every run;
# without it the reported validation accuracy is measured on a different subset each time.
SPLIT_SEED = 42

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(lenth_data),
    test_size = test_ratio,
    shuffle = True,
    stratify = labels,
    random_state = SPLIT_SEED)

# The two index sets must partition the data set.
assert len(train_idx) + len(val_idx) == lenth_data

# Train and validation sets: (input ids, attention mask, label) triples
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader: training batches are drawn in random order,
# validation batches in a fixed sequential order.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = bt
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = bt
        )

In [None]:
def b_metrics(preds, labels):
  """Return the accuracy of one batch of predictions.

  Args:
    preds: array-like of per-class scores with one row per example; the
      predicted class of a row is the index of its largest score.
    labels: array-like of integer class ids, one per example (flattened
      before comparison).

  Returns:
    The fraction of examples whose predicted class equals its label, as a
    float in [0, 1]. An empty batch yields 0.0 instead of dividing by zero.
  """
  predicted = np.argmax(preds, axis = 1).flatten()
  expected = np.asarray(labels).flatten()
  if expected.size == 0:
    return 0.0
  # Vectorised comparison: the mean of the boolean matches is the accuracy.
  return float(np.mean(predicted == expected))

In [None]:
for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Put the model in training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for batch in train_dataloader:

        b_input_ids, b_input_mask, b_labels = batch
        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()
        # Forward pass; passing `labels` makes the model return the loss as well
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Put the model in evaluation mode
    model.eval()

    val_accuracy = []    # per-batch accuracies, kept for inspection
    val_correct = 0.0    # number of correctly classified validation examples
    val_examples = 0     # number of validation examples seen

    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids,
                              token_type_ids = None,
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Weight each batch by its size: the last batch may be smaller than `bt`,
        # so a plain mean of per-batch accuracies would over-weight its examples.
        val_correct += b_accuracy * len(label_ids)
        val_examples += len(label_ids)

    # `max(..., 1)` keeps the report from dividing by zero if a loader is empty.
    print(' - Train loss: {:.2f}'.format(tr_loss / max(nb_tr_steps, 1)))
    print(' - Validation Accuracy: {:.2f}'.format(val_correct / max(val_examples, 1)))

Epoch:  33%|███▎      | 1/3 [01:13<02:26, 73.13s/it]

 - Train loss: 0.61
 - Validation Accuracy: 0.89


Epoch:  67%|██████▋   | 2/3 [02:04<01:00, 60.09s/it]

 - Train loss: 0.12
 - Validation Accuracy: 1.00


Epoch: 100%|██████████| 3/3 [02:44<00:00, 55.00s/it]

 - Train loss: 0.10
 - Validation Accuracy: 1.00





In [5]:
# Inference helper used by the test cells below
def test(text, max_length=32):
    """Run the current `model` on a single sentence and return its raw output.

    Args:
        text: the sentence to classify.
        max_length: number of tokens the sentence is padded or truncated to
            (defaults to 32, the length used when tokenizing the training data).

    Returns:
        The model output object; the class scores are available under
        ``output['logits']`` with one row for the sentence.
    """
    # `padding="max_length"` and `truncation=True` replace the deprecated
    # `pad_to_max_length=True` and the implicit truncation the tokenizer warned about.
    tokenized_segment = tokenizer(
        [text],
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    # Evaluate in eval mode regardless of the mode the model is currently in,
    # then restore that mode so calling this helper has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(
                tokenized_segment.input_ids,
                token_type_ids=None,
                attention_mask=tokenized_segment.attention_mask,
            )
    finally:
        model.train(was_training)
    return output

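## Test sentences

Note: judging by the execution counts and by the first result matching the pre-training check above, the outputs below appear to come from the checkpoint as loaded rather than from the retrained model; re-run the notebook from top to bottom to confirm.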

In [6]:
# Score one sentence and report both class probabilities as percentages.
sentence = "It's important to remember that healing from the pain of being hurt is a process"
output = test(sentence)
model_predictions = F.softmax(output['logits'], dim=1)
positive_pct = model_predictions[0][1].item() * 100
negative_pct = model_predictions[0][0].item() * 100
print(sentence)
print(f"Positive probability: {positive_pct}%")
print(f"Negative probability: {negative_pct}%")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


It's important to remember that healing from the pain of being hurt is a process
Positive probability: 97.50133156776428%
Negative probability: 2.498672716319561%


In [9]:
# Score one sentence and report both class probabilities as percentages.
sentence = "I hate the selfishness in you"
output = test(sentence)
model_predictions = F.softmax(output['logits'], dim=1)
positive_pct = model_predictions[0][1].item() * 100
negative_pct = model_predictions[0][0].item() * 100
print(sentence)
print(f"Positive probability: {positive_pct}%")
print(f"Negative probability: {negative_pct}%")

I hate the selfishness in you
Positive probability: 32.879284024238586%
Negative probability: 67.12071299552917%


In [8]:
# Score one sentence and report both class probabilities as percentages.
sentence = "I hate any one who can hurt you"
output = test(sentence)
model_predictions = F.softmax(output['logits'], dim=1)
positive_pct = model_predictions[0][1].item() * 100
negative_pct = model_predictions[0][0].item() * 100
print(sentence)
print(f"Positive probability: {positive_pct}%")
print(f"Negative probability: {negative_pct}%")

I hate any one who can hurt you
Positive probability: 99.60909485816956%
Negative probability: 0.39090043865144253%
