In [13]:
from transformers import RobertaConfig, AutoModelForSequenceClassification,RobertaTokenizer, Trainer, TrainingArguments
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import os

In [14]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.label = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
       
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation = True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding =  'max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
   


        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'label': torch.tensor(self.label[index], dtype=torch.float)
        }

In [15]:

def encode_Lables(label):
    temp = np.zeros(5)
    temp[label-1] = 1
    return temp


    
def split_data(path):
    df = pd.read_csv(path)
    df.rename(columns={"Rating":"label"},inplace=True)
    df.rename(columns={"Review_text":"text"},inplace=True)
    df["label"] = df["label"].apply(encode_Lables)
    train_data,test_data = train_test_split(df,test_size=.20,random_state=64)
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    max_length = int(df["text"].str.split().str.len().mean())
    return train_data,test_data,max_length
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
filename = "data_set_1.csv"
path = parent_dir +"/model_data/" + filename
train,test,max_length= split_data(path)


In [16]:
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=5)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
training_set = SentimentData(train, tokenizer, max_len=max_length)
testing_set = SentimentData(test, tokenizer, max_len=max_length)

In [18]:
training_set.__getitem__(5)

{'input_ids': tensor([    0,  1178, 39462,     2,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]),
 'mask': tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'label': tensor([0., 0., 0., 0., 1.])}

In [None]:
def compute_metrics(pred):
    label = torch.argmax(torch.tensor(pred.label_ids))
    preds = torch.argmax(torch.tensor(pred.predictions.argmax(-1)))
    precision, recall, f1, _ = precision_recall_fscore_support(label, preds, average='binary')
    acc = accuracy_score(label, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [20]:
training_args = TrainingArguments(

    learning_rate=2e-5,
    fp16=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set,
    eval_dataset=testing_set,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss


TypeError: argmax(): argument 'input' (position 1) must be Tensor, not numpy.ndarray