In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = stopwords.words('english')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
!pip install transformers==4.22.2

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lollo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lollo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lollo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [2]:
path = r'.\data\traindata.csv'
pathdev = r'.\data/devdata.csv'
df = pd.read_csv(path,sep='\t', header=None)
df_dev = pd.read_csv(pathdev,sep='\t', header=None)

In [3]:
df.columns = ['polarity', 'aspect_category', 'target_term', 'character_offsets', 'sentence']
df_dev.columns = ['polarity', 'aspect_category', 'target_term', 'character_offsets', 'sentence']

In [5]:
df.head()

Unnamed: 0,polarity,aspect_category,target_term,character_offsets,sentence
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very..."


In [6]:
df.polarity.value_counts()

positive    1055
negative     390
neutral       58
Name: polarity, dtype: int64

In [7]:
df.sentence.duplicated().sum()

440

In [4]:
#label encoding the polarity
lb = preprocessing.LabelEncoder()
df['polarity']=lb.fit_transform(df['polarity'])
df_dev['polarity']=lb.fit_transform(df_dev['polarity'])

In [5]:
def add_sep(text,offset):
    sep1 = ' <target_bos> '
    sep2 = ' <target_eos> '
    split = offset.split(':')
    sentence_left = text[:int(split[0])]
    target = text[int(split[0]):int(split[1])]
    sentence_right = text[int(split[1]):]
    concat = sentence_left+sep1+target+sep2+sentence_right
    return concat

In [6]:
def aspect_categories(aspect):
    
  if aspect == 'AMBIENCE#GENERAL':
    return "What do you think of the <target_bos> ambience <target_eos> ?"

  elif aspect == 'FOOD#QUALITY':
    return "What do you think of the <target_bos> food quality <target_eos> ?"

  elif aspect == 'SERVICE#GENERAL':
    return "What do you think of the <target_bos> service <target_eos> ?"

  elif aspect == 'FOOD#STYLE_OPTIONS':
    return "What do you think of the <target_bos> food choices <target_eos> ?"

  elif aspect == 'DRINKS#QUALITY':
    return "What do you think of the <target_bos> drinks quality <target_eos> ?"

  elif aspect == 'RESTAURANT#MISCELLANEOUS' or aspect == 'RESTAURANT#GENERAL':
    return "What do you think of the <target_bos> restaurant <target_eos> ?"

  elif aspect == 'LOCATION#GENERAL':
    return 'What do you think of the <target_bos> location <target_eos> ?'

  elif aspect == 'DRINKS#STYLE_OPTIONS':
    return "What do you think of the <target_bos> drink choices <target_eos> ?"
  
  else:
    return 'What do you think of the <target_bos> price <target_eos> ?'

In [7]:
df['sentence_sep'] = df.apply(lambda x: add_sep(x['sentence'],x['character_offsets']),axis = 1)
df.head()

Unnamed: 0,polarity,aspect_category,target_term,character_offsets,sentence,sentence_sep
0,2,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...,short and sweet – <target_bos> seating <targe...
1,2,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...,This quaint and romantic <target_bos> trattor...
2,2,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...,The have over 100 different beers to offer thi...
3,0,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.,THIS <target_bos> STAFF <target_eos> SHOULD ...
4,2,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very...",The <target_bos> menu <target_eos> looked gr...


In [8]:
df_dev['sentence_sep'] = df_dev.apply(lambda x: add_sep(x['sentence'],x['character_offsets']),axis = 1)


In [9]:
df['aspect_category'] = df.apply(lambda x: aspect_categories(x['aspect_category']),axis=1)
df_dev['aspect_category'] = df_dev.apply(lambda x: aspect_categories(x['aspect_category']),axis=1)

In [10]:
df['input'] = df.apply(lambda x: f"{x['aspect_category']} {x['sentence_sep']}", axis=1)
df_dev['input'] = df_dev.apply(lambda x: f"{x['aspect_category']} {x['sentence_sep']}", axis=1)

In [11]:
df.head()

Unnamed: 0,polarity,aspect_category,target_term,character_offsets,sentence,sentence_sep,input
0,2,What do you think of the <target_bos> ambience...,seating,18:25,short and sweet – seating is great:it's romant...,short and sweet – <target_bos> seating <targe...,What do you think of the <target_bos> ambience...
1,2,What do you think of the <target_bos> ambience...,trattoria,25:34,This quaint and romantic trattoria is at the t...,This quaint and romantic <target_bos> trattor...,What do you think of the <target_bos> ambience...
2,2,What do you think of the <target_bos> food qua...,food,98:102,The have over 100 different beers to offer thi...,The have over 100 different beers to offer thi...,What do you think of the <target_bos> food qua...
3,0,What do you think of the <target_bos> service ...,STAFF,5:10,THIS STAFF SHOULD BE FIRED.,THIS <target_bos> STAFF <target_eos> SHOULD ...,What do you think of the <target_bos> service ...
4,2,What do you think of the <target_bos> food cho...,menu,4:8,"The menu looked great, and the waiter was very...",The <target_bos> menu <target_eos> looked gr...,What do you think of the <target_bos> food cho...


In [12]:
# set the device to run the model on (GPU or CPU)
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


## Roberta

In [13]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from transformers import BertTokenizer, RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertModel, RobertaModel, RobertaConfig



# initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


roberta = RobertaModel.from_pretrained('roberta-base').to(device)

class CustomRobertaModel(torch.nn.Module):
    def __init__(self, roberta):
        super(CustomRobertaModel, self).__init__()
        self.roberta = roberta
        self.linear1 = torch.nn.Linear(768, 768)
        #self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.1)
        self.linear2 = torch.nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask=None):
        output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = output.pooler_output
        pooled_output = self.linear1(pooled_output)
        #pooled_output = self.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.linear2(pooled_output)
        return logits

model = CustomRobertaModel(roberta).to(device)
# combine the RoBERTa model and the classifier head into a single model

# tokenize the sentences in the training and validation sets
train_tokens = tokenizer.batch_encode_plus(df['input'].tolist(),
                                           max_length =128,
                                           padding='max_length',
                                           add_special_tokens = True,
                                           truncation=True, 
                                           return_tensors='pt')
val_tokens = tokenizer.batch_encode_plus(df_dev['input'].tolist(),
                                         max_length =128,
                                         padding='max_length',
                                         add_special_tokens = True,
                                         truncation=True, 
                                         return_tensors='pt')

# create PyTorch DataLoader objects for the training and validation sets
train_dataset = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'],torch.tensor(df['polarity'].tolist()))
val_dataset = TensorDataset(val_tokens['input_ids'], val_tokens['attention_mask'],torch.tensor(df_dev['polarity'].tolist()))
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=64)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=32)

# set the optimizer and learning rate scheduler
epochs = 20
# define the class weights as a tensor
class_weights = compute_class_weight('balanced', classes=np.unique(df['polarity']), y=df['polarity'])
class_weights = torch.FloatTensor(class_weights).to(device)

# define the loss function with the weighted cross entropy loss
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*epochs)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# define the training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids, attention_mask,  labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        print(f"logits {logits.size()} labels {labels.size()}")
        loss = loss_fn(logits, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    train_loss /= len(train_loader)
    
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            logits = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(logits, labels)
            val_loss += loss.item() * input_ids.size(0)
            val_preds += torch.argmax(logits, axis=1).tolist()
            val_labels += labels.tolist()
    val_loss /= len(val_loader.dataset)
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_precision = precision_score(val_labels, val_preds, average='weighted', zero_division=1)
    val_recall = recall_score(val_labels, val_preds, average='weighted')
    val_f1 = f1_score(val_labels, val_preds, average='weighted')
    report = classification_report(val_labels, val_preds, labels=[0, 1, 2], target_names=['Negative', 'Neutral', 'Positive'])
    print(f'Epoch {epoch+1} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.4f} - Val Precision: {val_precision:.4f} - Val Recall: {val_recall:.4f} - Val F1: {val_f1:4f}')
    print(report)

logits torch.Size([64, 3]) labels torch.Size([64])
logits torch.Size([64, 3]) labels torch.Size([64])


KeyboardInterrupt: 

In [53]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.utils.class_weight import compute_class_weight
from transformers import RobertaModel, RobertaConfig



# initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')


roberta = RobertaModel.from_pretrained('roberta-large').to(device)

class CustomRobertaModel(torch.nn.Module):
    def __init__(self, roberta):
        super(CustomRobertaModel, self).__init__()
        self.roberta = roberta
        self.linear1 = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.1)
        self.linear2 = torch.nn.Linear(1024, 3)

    def forward(self, input_ids, attention_mask=None):
        output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = output.pooler_output
        pooled_output = self.linear1(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.linear2(pooled_output)
        return logits

model = CustomRobertaModel(roberta).to(device)
# combine the RoBERTa model and the classifier head into a single model

# tokenize the sentences in the training and validation sets
train_tokens = tokenizer.batch_encode_plus(df['input'].tolist(),
                                           max_length =128,
                                           padding='max_length',
                                           add_special_tokens = True,
                                           truncation=True, 
                                           return_tensors='pt')
val_tokens = tokenizer.batch_encode_plus(df_dev['input'].tolist(),
                                         max_length =128,
                                         padding='max_length',
                                         add_special_tokens = True,
                                         truncation=True, 
                                         return_tensors='pt')

# create PyTorch DataLoader objects for the training and validation sets
train_dataset = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], torch.tensor(df['polarity'].tolist()))
val_dataset = TensorDataset(val_tokens['input_ids'], val_tokens['attention_mask'], torch.tensor(df_dev['polarity'].tolist()))
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=8)

# set the optimizer and learning rate scheduler
epochs = 15
# define the class weights as a tensor
class_weights = compute_class_weight('balanced', classes=np.unique(df['polarity']), y=df['polarity'])
class_weights = torch.FloatTensor(class_weights).to(device)
#class_weights = torch.tensor([3., 20., 1.])
# define the loss function with the weighted cross entropy loss
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*epochs)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)



Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [54]:
# define the training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(logits, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    train_loss /= len(train_loader)
    
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            logits = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(logits, labels)
            val_loss += loss.item() * input_ids.size(0)
            val_preds += torch.argmax(logits, axis=1).tolist()
            val_labels += labels.tolist()
    val_loss /= len(val_loader.dataset)
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_precision = precision_score(val_labels, val_preds, average='weighted', zero_division=1)
    val_recall = recall_score(val_labels, val_preds, average='weighted')
    val_f1 = f1_score(val_labels, val_preds, average='weighted')
    report = classification_report(val_labels, val_preds, labels=[0, 1, 2], target_names=['Negative', 'Neutral', 'Positive'])
    print(f'Epoch {epoch+1} - Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.4f} - Val Precision: {val_precision:.4f} - Val Recall: {val_recall:.4f} - Val F1: {val_f1:4f}')
    print(report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1 - Val Loss: 1.0272 - Val Accuracy: 0.7261 - Val Precision: 0.8443 - Val Recall: 0.7261 - Val F1: 0.730031
              precision    recall  f1-score   support

    Negative       0.49      0.95      0.65        98
     Neutral       0.00      0.00      0.00        14
    Positive       0.97      0.68      0.80       264

    accuracy                           0.73       376
   macro avg       0.49      0.54      0.48       376
weighted avg       0.81      0.73      0.73       376

Epoch 2 - Val Loss: 0.8083 - Val Accuracy: 0.8617 - Val Precision: 0.8538 - Val Recall: 0.8617 - Val F1: 0.856194
              precision    recall  f1-score   support

    Negative       0.73      0.86      0.79        98
     Neutral       0.00      0.00      0.00        14
    Positive       0.94      0.91      0.93       264

    accuracy                           0.86       376
   macro avg       0.56      0.59      0.57       376
weighted avg       0.85      0.86      0.86       376

Epoch 3 - 