In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install pytorch-lightning==1.8.1


Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl

In [2]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel, pipeline
from transformers import logging as hflogging


# HF name of the selected pre-trained language model (PLM):
plm_name = 'bert-base-uncased'

# If you want to know the exact python class of the model, use the following 2 lines:
# obj = pipeline(model=plm_name)
# type(obj.model)

# But to download and instantiate the model, we will use the generic Auto* classes, so 
# that we don't have to change the code when we select another model with a different 
# transformer class.

# Load the config, the tokenizer and the model itself:
lmconfig = AutoConfig.from_pretrained(plm_name)
lmtokenizer = AutoTokenizer.from_pretrained(plm_name)
lm = AutoModel.from_pretrained(plm_name, output_attentions=False)


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



#### Downloading the dataset

This `imdb` dataset, like many others, is available on the [HuggingFace (HF) dataset hub](https://huggingface.co/datasets). You can find its card [here](https://huggingface.co/datasets/imdb).

In order to download the dataset from the HF hub, we use the HF `datasets` python module.

Downloading the `train` and `test` splits of the dataset:

In [3]:
import datasets
import pandas as pd

#df_train = datasets.load_dataset("imdb", split='train')
#ds_test = datasets.load_dataset("imdb", split='test')

ds_train = pd.read_csv('traindata.csv', sep='\t', header = None)
ds_test = pd.read_csv('devdata.csv', sep='\t', header = None)

# df_train.reset_index(drop=True, inplace=True)

Let's have a look at some random examples from the train split:

In [4]:
import pandas as pd
pd.options.display.max_colwidth=120

#df_train = ds_train.shuffle().to_pandas()
print(ds_train.head())
print(ds_test.head())

          0                   1          2       3  \
0  positive    AMBIENCE#GENERAL    seating   18:25   
1  positive    AMBIENCE#GENERAL  trattoria   25:34   
2  positive        FOOD#QUALITY       food  98:102   
3  negative     SERVICE#GENERAL      STAFF    5:10   
4  positive  FOOD#STYLE_OPTIONS       menu     4:8   

                                                                                                                         4  
0                                                       short and sweet – seating is great:it's romantic,cozy and private.  
1                                        This quaint and romantic trattoria is at the top of my Manhattan restaurant list.  
2  The have over 100 different beers to offer thier guest so that made my husband very happy and the food was delicious...  
3                                                                                              THIS STAFF SHOULD BE FIRED.  
4                             The menu looked grea

#### Tokenization

We tokenize the texts and encode them with the tokenizer of the pre-trained language model. Remember: the tokenizer produces a dict with `input_ids` and `attention_mask` tensors (see the first part of this notebook). There are examples in the datasets that are longer than the model max input length (which is 512 in the BERT model we are using), so we set argument `truncation` to True:

Let's tokenize and encode the train and test texts. The encodings are added as new columns to the HF dataset objects. We also remove column 'text' (not needed anymore) and change the column name 'label' to 'labels'. Note that the texts are truncated to the max input length accepted by the model, but they are not padded because we will be using dynamic (batch-wise) padding:

## Create the input sentence

In [5]:
def add_labels(df):
  res = df.copy()
  list_labels = []
  for i in range(len(df[0])):
    if df[0].iloc[i] == "positive":
      list_labels.append(2)
    elif df[0].iloc[i] == "neutral":
      list_labels.append(1)
    else:
      list_labels.append(0)
  
  res["labels"] = list_labels
  return res

In [74]:
len(ds_train[0])

1503

In [6]:
df_train = add_labels(ds_train)
df_train

Unnamed: 0,0,1,2,3,4,labels
0,positive,AMBIENCE#GENERAL,seating,18:25,"short and sweet – seating is great:it's romantic,cozy and private.",2
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the top of my Manhattan restaurant list.,2
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thier guest so that made my husband very happy and the food was delicious...,2
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.,0
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very nice, but when the food came, it was average.",2
...,...,...,...,...,...,...
1498,positive,DRINKS#QUALITY,expresso,29:37,One of us actually liked the expresso - that's it.,2
1499,negative,SERVICE#GENERAL,waitress,20:28,The hostess and the waitress were incredibly rude and did everything they could to rush us out.,0
1500,positive,RESTAURANT#PRICES,place,12:17,this little place has a cute interior decor and affordable city prices.,2
1501,positive,RESTAURANT#GENERAL,restaurant,30:40,Nice Family owned traditional restaurant.,2


In [7]:
def add_processed_col(df):
  res = df.copy()
  list_input = []
  for i in range(len(df[0])):
      string = ""
      string += df[4].iloc[i]
      string += " [SEP] "
      string += df[2].iloc[i]
      string += " [SEP] "
      string += df[1].iloc[i]
      list_input.append(string)
  res["processed_input"] = list_input
  return res

In [8]:
df_train_processed = add_processed_col(df_train)

In [9]:
df_test = add_labels(ds_test)
df_test_processed = add_processed_col(df_test)

In [10]:
df_train_processed.head()

Unnamed: 0,0,1,2,3,4,labels,processed_input
0,positive,AMBIENCE#GENERAL,seating,18:25,"short and sweet – seating is great:it's romantic,cozy and private.",2,"short and sweet – seating is great:it's romantic,cozy and private. [SEP] seating [SEP] AMBIENCE#GENERAL"
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the top of my Manhattan restaurant list.,2,This quaint and romantic trattoria is at the top of my Manhattan restaurant list. [SEP] trattoria [SEP] AMBIENCE#GEN...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thier guest so that made my husband very happy and the food was delicious...,2,The have over 100 different beers to offer thier guest so that made my husband very happy and the food was delicious...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.,0,THIS STAFF SHOULD BE FIRED. [SEP] STAFF [SEP] SERVICE#GENERAL
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very nice, but when the food came, it was average.",2,"The menu looked great, and the waiter was very nice, but when the food came, it was average. [SEP] menu [SEP] FOOD#S..."


In [11]:
df_test_processed.head()

Unnamed: 0,0,1,2,3,4,labels,processed_input
0,positive,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in a great neighborhood...",2,"great food, great wine list, great service in a great neighborhood... [SEP] neighborhood [SEP] LOCATION#GENERAL"
1,negative,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.,0,I thought this place was totally overrated. [SEP] place [SEP] RESTAURANT#GENERAL
2,positive,FOOD#QUALITY,Fish,0:4,Fish is so very fresh.,2,Fish is so very fresh. [SEP] Fish [SEP] FOOD#QUALITY
3,negative,SERVICE#GENERAL,manager,19:26,"I showed it to the manager, and he smilingly apologized and brought us two free desserts (but did not ask us what we...",0,"I showed it to the manager, and he smilingly apologized and brought us two free desserts (but did not ask us what we..."
4,neutral,DRINKS#QUALITY,margaritas,63:73,"The food we ordered was excellent, although I wouldn't say the margaritas were anything to write home about.",1,"The food we ordered was excellent, although I wouldn't say the margaritas were anything to write home about. [SEP] m..."


## The model

In [48]:
class TransformerBinaryClassifier(torch.nn.Module):

    def __init__(self, plm_name: str):
        super(TransformerBinaryClassifier, self).__init__()
        self.lmconfig = AutoConfig.from_pretrained(plm_name)
        self.lmtokenizer = AutoTokenizer.from_pretrained(plm_name,add_special_tokens=True)
        self.lm = AutoModel.from_pretrained(plm_name, output_attentions=False)
        self.emb_dim = self.lmconfig.hidden_size
        self.output_size = 3
        self.classifier = torch.nn.Sequential(
            #torch.nn.Dropout(0.3),
            torch.nn.Linear(self.emb_dim, self.output_size),
            torch.nn.Softmax()
        )
        self.loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')


    def forward(self, x):
        x : torch.Tensor = self.lm(x['input_ids'], x['attention_mask']).last_hidden_state
        cls_vects = x[:,0,:] # extract the [CLS] token of each sequence
        x = self.classifier(cls_vects) 
        return x.squeeze(-1)
    
    def compute_loss(self, predictions, target):
        return self.loss_fn(predictions, target)


model = TransformerBinaryClassifier(plm_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [49]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

def tokenize_function(examples):
    return model.lmtokenizer(examples["processed_input"], truncation=True, add_special_tokens=True)

# ds_train = ds_train.rename_column("label", "labels")
# ds_test = ds_test.rename_column("label", "labels")

from datasets import Dataset
ds_train = Dataset.from_pandas(df_train_processed[["labels", "processed_input"]])
ds_test = Dataset.from_pandas(df_test_processed[["labels", "processed_input"]])

# tokenize datasets
tok_ds_train = ds_train.map(tokenize_function, batched=True)
tok_ds_test = ds_test.map(tokenize_function, batched=True)

tok_ds_train = tok_ds_train.remove_columns(["processed_input"])
tok_ds_test = tok_ds_test.remove_columns(["processed_input"])

#tok_ds_train = tok_ds_train.rename_column("label", "labels")
#tok_ds_test = tok_ds_test.rename_column("label", "labels")

data_collator = DataCollatorWithPadding(tokenizer=model.lmtokenizer, padding=True, return_tensors='pt')

train_dataloader = DataLoader(tok_ds_train, shuffle=True, batch_size=32, collate_fn=data_collator)
eval_dataloader = DataLoader(tok_ds_test, batch_size=32, collate_fn=data_collator)

# just for testing
for b in train_dataloader:
    print(b['labels'])
    break

Map:   0%|          | 0/1503 [00:00<?, ? examples/s]

Map:   0%|          | 0/376 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([2, 2, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 0, 2, 1, 2, 0, 2, 2, 0, 2, 2, 2,
        2, 2, 0, 0, 2, 0, 2, 0])


In [50]:
from torch.optim import Adam
from transformers import get_scheduler

optimizer = Adam(model.parameters(), lr=3e-5)


num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# device = 'cpu'

model.to(device)

lr_scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7f27ce6a7a30>

Training the model:

In [51]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

#odel = TransformerBinaryClassifier(plm_name)
#model.to(device)

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        predictions = model(batch)
        #labels = labels.type(torch.LongTensor)
        loss = model.loss_fn(predictions, batch['labels'])
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    print(loss)
    #predict_accuracy(model, list(df_test_processed["processed_input"]))       

  0%|          | 0/329 [00:00<?, ?it/s]

  input = module(input)


tensor(0.8741, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8094, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7114, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7280, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6882, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6922, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6298, device='cuda:0', grad_fn=<NllLossBackward0>)


In [18]:
import numpy as np
def predict_accuracy(model, texts):
  model.eval()
  labelz = ["negative","neutral","positive"]
  encoded_texts = model.lmtokenizer(texts, truncation=True, padding=True, return_attention_mask=True, return_tensors='pt',add_special_tokens=True)
  with torch.no_grad():
    output = model(encoded_texts.to(device)).tolist()
    pred_labels = [labelz[np.argmax(p)] for p in output]
    return list(zip(texts, pred_labels))


We use the prediction method to label a couple of texts:

In [52]:
predz = predict_accuracy(model, list(df_test_processed["processed_input"]))

In [53]:
text, label = predz[0]
print(label)

positive


In [54]:
y_pred = []
for i in range(len(df_test_processed["processed_input"])):
  text, label = predz[i]
  if label == df_test_processed[0].iloc[i]:
    y_pred.append(1)
  else:
    y_pred.append(0)
print(np.sum(y_pred))
print(len(y_pred))
print("Accuracy : " + str(np.sum(y_pred)/len(y_pred)*100))


320
376
Accuracy : 85.1063829787234
