In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch

In [2]:
df = pd.read_csv('data\\traindata.csv',sep='\t',header=None)

In [3]:
df.columns = ['Polarity','Aspect_category','Aspect_term','Character_offset','Text']

In [4]:
df

Unnamed: 0,Polarity,Aspect_category,Aspect_term,Character_offset,Text
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."
...,...,...,...,...,...
1498,positive,DRINKS#QUALITY,expresso,29:37,One of us actually liked the expresso - that's...
1499,negative,SERVICE#GENERAL,waitress,20:28,The hostess and the waitress were incredibly r...
1500,positive,RESTAURANT#PRICES,place,12:17,this little place has a cute interior decor an...
1501,positive,RESTAURANT#GENERAL,restaurant,30:40,Nice Family owned traditional restaurant.


In [5]:
label_names = list(df.Aspect_category.unique())
label_names.sort()
dict_labels = {label_names[i]:i for i in range(len(label_names))}

In [6]:
def create_vector(row):
    label = 1 if row.Polarity == 'positive' else 0
    index_category = dict_labels[row.Aspect_category]
    result = [0]*(len(label_names)*2)
    if label == 1:
        idx_non_null = index_category
    else :
        idx_non_null = index_category + len(label_names)
    
    result[idx_non_null] = 1
    return np.array(result)

In [7]:
df['labels'] = df.apply(create_vector,axis=1)

In [8]:
df_dataset = df.groupby('Text').agg({'labels':'sum'}).reset_index()

In [9]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize(row):
    return tokenizer(row.Text,padding='max_length',truncation=True)

In [10]:
df_dataset['tokenize'] = df_dataset.apply(tokenize,axis=1)
df_dataset['input_ids'] = df_dataset.tokenize.apply(lambda dico : dico['input_ids'])
df_dataset['token_type_ids'] = df_dataset.tokenize.apply(lambda dico : dico['token_type_ids'])
df_dataset['attention_mask'] = df_dataset.tokenize.apply(lambda dico : dico['attention_mask'])

In [11]:
df_dataset.drop(columns=['tokenize','Text'],inplace=True)

In [12]:
df_dataset

Unnamed: 0,labels,input_ids,token_type_ids,attention_mask
0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 109, 1406, 1111, 1155, 1128, 1169, 3940,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 109, 127, 1105, 1175, 1110, 1277, 27629,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 113, 10672, 2367, 1103, 18343, 1111, 110...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 113, 1109, 1112, 17482, 28026, 1116, 117...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 138, 6888, 4382, 1149, 1104, 7738, 1219,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
...,...,...,...,...
1058,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1103, 19359, 1116, 1132, 13108, 117, 124...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1059,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[101, 1142, 1376, 1282, 1144, 170, 10509, 4604...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1060,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1142, 1282, 1125, 18589, 2162, 1103, 133...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1061,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1343, 12205, 1127, 1992, 117, 1133, 1136...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [13]:
torch_tensor = torch.tensor(df_dataset['input_ids'])

In [14]:
data_loader = DataLoader(df_dataset)

In [15]:
data_loader.dataset

Unnamed: 0,labels,input_ids,token_type_ids,attention_mask
0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 109, 1406, 1111, 1155, 1128, 1169, 3940,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 109, 127, 1105, 1175, 1110, 1277, 27629,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 113, 10672, 2367, 1103, 18343, 1111, 110...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 113, 1109, 1112, 17482, 28026, 1116, 117...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 138, 6888, 4382, 1149, 1104, 7738, 1219,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
...,...,...,...,...
1058,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1103, 19359, 1116, 1132, 13108, 117, 124...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1059,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[101, 1142, 1376, 1282, 1144, 170, 10509, 4604...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1060,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1142, 1282, 1125, 18589, 2162, 1103, 133...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1061,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1343, 12205, 1127, 1992, 117, 1133, 1136...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [16]:
class My_Dataset(Dataset):
    def __init__(self, df, transform = None):
    
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        labels = self.df['labels'].iloc[index]
        input_ids = self.df['input_ids'].iloc[index]
        token_type_ids = self.df['token_type_ids'].iloc[index]
        attention_mask = self.df['attention_mask'].iloc[index]

        return {'labels':torch.tensor(labels), 
                'input_ids': torch.tensor(input_ids), 
                'token_type_ids': torch.tensor(token_type_ids), 
                'attention_mask':torch.tensor(attention_mask)}

In [17]:
dataset = My_Dataset(df_dataset)

In [18]:
data_loader = DataLoader(dataset, batch_size=5)

In [19]:
for data in data_loader:
    print(data)
    break

{'labels': tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]],
       dtype=torch.int32), 'input_ids': tensor([[  101,   109,  1406,  ...,     0,     0,     0],
        [  101,   109,   127,  ...,     0,     0,     0],
        [  101,   113, 10672,  ...,     0,     0,     0],
        [  101,   113,  1109,  ...,     0,     0,     0],
        [  101,   138,  6888,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
    

In [20]:
from transformers import AutoModelForSequenceClassification


In [21]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased',num_labels=24)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [22]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [23]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(data_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [24]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [25]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in data_loader:

        batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}

        outputs = model(**batch)
        print(outputs)

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/639 [00:00<?, ?it/s]

torch.Size([5, 512])
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.3508,  0.3762,  0.2202,  0.1380,  0.9804,  0.2379, -0.1257, -0.7990,
          0.8973,  0.5323,  0.4577, -0.9001,  0.0751,  0.2186, -0.9703, -0.2591,
         -0.8446,  0.1080, -0.1358,  0.2235, -0.2563, -0.0274, -0.1187,  0.3050],
        [ 0.6165,  0.4959,  0.0921,  0.0694,  0.8838,  0.2068, -0.3353, -0.3062,
          0.7113,  0.2726,  0.2015, -0.5908,  0.0687,  0.3663, -0.8884,  0.1119,
         -0.7387, -0.1243, -0.0178,  0.3180, -0.0712, -0.1927, -0.2058,  0.1677],
        [ 0.5349,  0.4183,  0.1192,  0.0797,  0.7562,  0.4060, -0.2598, -0.5735,
          0.9369,  0.5139,  0.3894, -0.5890,  0.0316,  0.3922, -0.9587, -0.1093,
         -1.1459,  0.0450,  0.1029,  0.1920, -0.3037,  0.1778, -0.4098,  0.2952],
        [ 0.4903,  0.4486,  0.2353,  0.2811,  1.0546,  0.1728, -0.4883, -0.7570,
          0.7735,  0.5800,  0.3057, -0.5466,  0.3257,  0.3550, -0.9464, -0.0930,
         -0.9615, -0.2901,  0.1969,  0.420

AttributeError: 'NoneType' object has no attribute 'backward'

In [1]:
torch.cuda.get_device_name(0)

NameError: name 'torch' is not defined