In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, BertTokenizer,AutoModelForSequenceClassification,BertForSequenceClassification
from transformers import pipeline

In [5]:
df=pd.read_csv(r"/content/synthetic_absa_dataset.csv")
df.head()

Unnamed: 0,review,aspect,sentiment
0,The camera is terrible.,camera,negative
1,I'm disappointed with the price.,price,negative
2,The performance is neither good nor bad.,performance,neutral
3,Excellent design and worth the price.,design,positive
4,Nothing special about the screen.,screen,neutral


In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   aspect     1000 non-null   object
 2   sentiment  1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


In [7]:
df['Input_text']=df.apply(lambda x: f"what is the review {x['review']} [sep] for the aspect {x['aspect']}",axis=1)
df.head()

Unnamed: 0,review,aspect,sentiment,Input_text
0,The camera is terrible.,camera,negative,what is the review The camera is terrible. [se...
1,I'm disappointed with the price.,price,negative,what is the review I'm disappointed with the p...
2,The performance is neither good nor bad.,performance,neutral,what is the review The performance is neither ...
3,Excellent design and worth the price.,design,positive,what is the review Excellent design and worth ...
4,Nothing special about the screen.,screen,neutral,what is the review Nothing special about the s...


In [8]:
#import labelencoder
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['sentiment']=le.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,aspect,sentiment,Input_text
0,The camera is terrible.,camera,0,what is the review The camera is terrible. [se...
1,I'm disappointed with the price.,price,0,what is the review I'm disappointed with the p...
2,The performance is neither good nor bad.,performance,1,what is the review The performance is neither ...
3,Excellent design and worth the price.,design,2,what is the review Excellent design and worth ...
4,Nothing special about the screen.,screen,1,what is the review Nothing special about the s...


In [9]:
df1=df[['Input_text','sentiment']]
df1.head()

Unnamed: 0,Input_text,sentiment
0,what is the review The camera is terrible. [se...,0
1,what is the review I'm disappointed with the p...,0
2,what is the review The performance is neither ...,1
3,what is the review Excellent design and worth ...,2
4,what is the review Nothing special about the s...,1


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df1['Input_text'],df1['sentiment'],test_size=0.2,random_state=42)

In [11]:
print(x_train)

29     what is the review Highly satisfied with the p...
535    what is the review Worst experience with the p...
695    what is the review Worst experience with the p...
557    what is the review The screen is neither good ...
836    what is the review Highly satisfied with the d...
                             ...                        
106    what is the review Average camera, not too bad...
270    what is the review Excellent price and worth t...
860    what is the review Poor performance, not expec...
435    what is the review The camera is okay. [sep] f...
102    what is the review Average design, not too bad...
Name: Input_text, Length: 800, dtype: object


In [12]:
x_test

Unnamed: 0,Input_text
521,what is the review I'm disappointed with the p...
737,what is the review The performance is amazing!...
740,what is the review Excellent performance and w...
660,what is the review Worst experience with the b...
411,"what is the review Average design, not too bad..."
...,...
408,what is the review The battery is okay. [sep] ...
332,"what is the review Average design, not too bad..."
208,what is the review Highly satisfied with the s...
613,what is the review Highly satisfied with the s...


In [13]:
y_train

Unnamed: 0,sentiment
29,2
535,0
695,0
557,1
836,2
...,...
106,1
270,2
860,0
435,1


In [14]:
from transformers import BertTokenizer
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
train_encoding = tokenizer(list(x_train), truncation=True, padding=True, max_length=100)
test_encoding = tokenizer(list(x_test), truncation=True, padding=True, max_length=100)

In [16]:
#print(train_encoding.attention_mask)
print(list(train_encoding.keys()))
#print(train_encoding.items())
#print(len(y_train.values))


['input_ids', 'token_type_ids', 'attention_mask']


In [17]:
import torch
#train_dataset=torch.utils.data.TensorDataset(torch.tensor(train_encoding['input_ids']),torch.tensor(train_encoding['attention_mask']),torch.tensor(y_train.values))
#test_encoding=torch.utils.data.TensorDataset(torch.tensor(test_encoding['input_ids']),torch.tensor(test_encoding['attention_mask']),torch.tensor(y_test.values))
#we can use above meathod to create tensors and

# we can create a class as well
class model_data(torch.utils.data.Dataset):
  def __init__(self,encodings,labels):
    self.encodings=encodings
    self.labels=labels
  def __len__(self):
    return len(self.labels)
  def __getitem__(self,idx):
    item={key:torch.tensor(val[idx]) for key,val in self.encodings.items()}
    item['labels']=torch.tensor(self.labels[idx])
    return item

In [18]:
train_dataset=model_data(train_encoding,y_train.values)
test_dataset=model_data(test_encoding,y_test.values)

In [19]:
train_dataset[0]

{'input_ids': tensor([  101,  2054,  2003,  1996,  3319,  3811,  8510,  2007,  1996,  3976,
          1012,  1031, 19802,  1033,  2005,  1996,  7814,  3976,   102,     0,
             0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]),
 'labels': tensor(2)}

In [20]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model=BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3)

args=TrainingArguments(output_dir='./results',num_train_epochs=3,
                       per_device_train_batch_size=16,
                       per_device_eval_batch_size=64,warmup_steps=500,
                       weight_decay=0.01,logging_dir='./logs',
                       logging_steps=10)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
trainer=Trainer(model=model,args=args,train_dataset=train_dataset,eval_dataset=test_dataset)

In [22]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33majay-suprcool[0m ([33majay-suprcool-edureka[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.1673
20,1.0982
30,1.0996
40,1.0782
50,0.9993
60,0.9058
70,0.7921
80,0.6978
90,0.5594
100,0.48


TrainOutput(global_step=150, training_loss=0.6419486663738887, metrics={'train_runtime': 87.6081, 'train_samples_per_second': 27.395, 'train_steps_per_second': 1.712, 'total_flos': 25900227057600.0, 'train_loss': 0.6419486663738887, 'epoch': 3.0})

In [23]:
eval=trainer.evaluate()
print(eval)

{'eval_loss': 0.01948023959994316, 'eval_runtime': 0.3073, 'eval_samples_per_second': 650.787, 'eval_steps_per_second': 13.016, 'epoch': 3.0}


In [24]:
pred=trainer.predict(test_dataset)
print(pred)

PredictionOutput(predictions=array([[ 3.1707103, -1.8478428, -1.6253127],
       [-1.9045125, -1.4977285,  3.5218172],
       [-1.7260624, -1.6428754,  3.4678476],
       [ 3.1958487, -1.7954843, -1.6413969],
       [-1.6857259,  2.54159  , -1.2603887],
       [ 3.171596 , -1.8185548, -1.6515981],
       [-1.892623 , -1.4461577,  3.5314884],
       [ 3.1722646, -1.7903587, -1.7016352],
       [ 3.211609 , -1.8046563, -1.601573 ],
       [ 3.1679158, -1.8344667, -1.6122499],
       [-1.892623 , -1.4461577,  3.5314884],
       [-1.6889381,  2.6420062, -1.5098714],
       [-1.709166 ,  2.6330576, -1.48468  ],
       [-1.7260624, -1.6428754,  3.4678476],
       [-1.8561199, -1.5104988,  3.5406477],
       [-1.9045125, -1.4977285,  3.5218172],
       [-1.6840069,  2.5144703, -1.237733 ],
       [ 3.1722646, -1.7903587, -1.7016352],
       [-1.6916403,  2.650176 , -1.477316 ],
       [-1.7456621,  2.6436145, -1.4129196],
       [-1.7561723,  2.631862 , -1.4783663],
       [-1.6857259,  2.541

In [25]:
y_pred=pred.predictions.argmax(-1)
print(y_pred)
pred_label=pred.label_ids
print(pred_label)

[0 2 2 0 1 0 2 0 0 0 2 1 1 2 2 2 1 0 1 1 1 1 1 2 0 2 0 0 0 2 0 0 0 0 0 0 1
 1 0 1 1 0 1 0 0 1 0 0 1 1 2 0 0 0 1 2 1 1 1 2 0 2 1 1 0 1 1 1 2 0 2 0 2 1
 1 0 0 2 2 1 1 0 1 2 1 0 0 2 2 0 1 2 2 2 1 2 0 0 1 1 1 0 2 0 0 0 1 2 2 1 0
 0 2 1 1 2 2 0 2 1 0 1 0 1 2 0 2 1 2 0 0 2 2 0 2 2 1 1 2 2 1 1 2 1 1 1 2 1
 2 0 2 0 1 2 2 0 0 2 2 2 1 0 0 1 1 2 1 1 1 0 0 1 0 1 1 2 0 1 0 1 0 1 2 2 2
 2 0 0 0 2 2 1 2 2 0 1 1 2 2 0]
[0 2 2 0 1 0 2 0 0 0 2 1 1 2 2 2 1 0 1 1 1 1 1 2 0 2 0 0 0 2 0 0 0 0 0 0 1
 1 0 1 1 0 1 0 0 1 0 0 1 1 2 0 0 0 1 2 1 1 1 2 0 2 1 1 0 1 1 1 2 0 2 0 2 1
 1 0 0 2 2 1 1 0 1 2 1 0 0 2 2 0 1 2 2 2 1 2 0 0 1 1 1 0 2 0 0 0 1 2 2 1 0
 0 2 1 1 2 2 0 2 1 0 1 0 1 2 0 2 1 2 0 0 2 2 0 2 2 1 1 2 2 1 1 2 1 1 1 2 1
 2 0 2 0 1 2 2 0 0 2 2 2 1 0 0 1 1 2 1 1 1 0 0 1 0 1 1 2 0 1 0 1 0 1 2 2 2
 2 0 0 0 2 2 1 2 2 0 1 1 2 2 0]


In [26]:
id2label={0:'negative',1:'neutral',2:'positive'}
readable_pred_label=[id2label[i] for i in y_pred]
readable_actual_labels=[id2label[i] for i in y_test.values]


In [27]:
print(len(readable_pred_label))
print(len(readable_actual_labels))
print(len(x_test))

200
200
200


In [28]:
x_test_df=pd.DataFrame(x_test)
print(type(x_test_df))
x_test_df.columns=['review']
print(x_test_df)
x_test_df['actual_label']=readable_actual_labels
x_test_df['predicted_label']=readable_pred_label
x_test_df.head()

<class 'pandas.core.frame.DataFrame'>
                                                review
521  what is the review I'm disappointed with the p...
737  what is the review The performance is amazing!...
740  what is the review Excellent performance and w...
660  what is the review Worst experience with the b...
411  what is the review Average design, not too bad...
..                                                 ...
408  what is the review The battery is okay. [sep] ...
332  what is the review Average design, not too bad...
208  what is the review Highly satisfied with the s...
613  what is the review Highly satisfied with the s...
78   what is the review Worst experience with the d...

[200 rows x 1 columns]


Unnamed: 0,review,actual_label,predicted_label
521,what is the review I'm disappointed with the p...,negative,negative
737,what is the review The performance is amazing!...,positive,positive
740,what is the review Excellent performance and w...,positive,positive
660,what is the review Worst experience with the b...,negative,negative
411,"what is the review Average design, not too bad...",neutral,neutral


In [29]:
x_test.iloc[0]

"what is the review I'm disappointed with the price. [sep] for the aspect price"

In [30]:
test_dataset[0]

{'input_ids': tensor([  101,  2054,  2003,  1996,  3319,  1045,  1005,  1049,  9364,  2007,
          1996,  3976,  1012,  1031, 19802,  1033,  2005,  1996,  7814,  3976,
           102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor(0)}

Using Hugging face transfomer pipeline

In [31]:
from transformers import pipeline
classifier=pipeline('sentiment-analysis',model='yangheng/deberta-v3-base-absa-v1.1',tokenizer='yangheng/deberta-v3-base-absa-v1.1')
#apply classifier on whole x_train
aspects = ['camera','price','design','screen']
result=x_train.apply(lambda x: {a: classifier(x,text_pair=a)[0]['label'] for a in aspects})
print(result)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


29     {'camera': 'Positive', 'price': 'Positive', 'd...
535    {'camera': 'Negative', 'price': 'Negative', 'd...
695    {'camera': 'Negative', 'price': 'Negative', 'd...
557    {'camera': 'Neutral', 'price': 'Neutral', 'des...
836    {'camera': 'Positive', 'price': 'Positive', 'd...
                             ...                        
106    {'camera': 'Neutral', 'price': 'Neutral', 'des...
270    {'camera': 'Positive', 'price': 'Positive', 'd...
860    {'camera': 'Negative', 'price': 'Negative', 'd...
435    {'camera': 'Positive', 'price': 'Neutral', 'de...
102    {'camera': 'Neutral', 'price': 'Neutral', 'des...
Name: Input_text, Length: 800, dtype: object


In [32]:
print(result[435])

{'camera': 'Positive', 'price': 'Neutral', 'design': 'Positive', 'screen': 'Positive'}


In [33]:
result1=classifier(x_train[3],text_pair='design')
print(result1)

[{'label': 'Positive', 'score': 0.9978616833686829}]
