In [3]:
import pandas as pd

In [54]:
# Load the saved model predictions for scoring.
# NOTE: this CSV is produced by the `test_dataset.to_csv(...)` cell in the
# "Inference" section further down, so on a fresh kernel that cell must have
# been run (in some earlier session) before this one can succeed.
df = pd.read_csv('../Data/quadruplet/model_inference_300_data.csv')
# Peek at a single row to check the available columns.
df.head(1)

Unnamed: 0,original_id,content,clean_tweet,final_sentiment,labels,quadruplet,spam,sentiment_label,model_prediction
0,1.64e+18,@tokopedia min aku udah bayar tapi kenapa diba...,min aku udah bayar tapi kenapa dibatalin pesan...,negative,payment; produk;,"(pesananku, udah bayar tapi kenapa dibatalin, ...",,sentiment,"(pesananku, sudah bayar tapi kenapa dibatalin,..."


# Utils

In [5]:
import re

In [41]:
def extract_quadruplet(sequence):
    """Parse every parenthesised quadruplet out of a label/prediction string.

    The expected layout is ``"(aspect term, opinion term, sentiment, aspect
    category); (...)"``. Each ``(...)`` group is split on ``', '`` and every
    field is stripped and lower-cased.

    Args:
        sequence: Text containing zero or more ``(...)`` groups. Anything that
            is not a ``str`` (for example a NaN cell read back from a CSV) is
            treated as containing no quadruplets.

    Returns:
        list[tuple[str, str, str, str]]: One tuple per ``(...)`` group, in
        order of appearance. A group that does not split into exactly four
        fields is kept as the placeholder ``('', '', '', '')`` so that it still
        counts as one extraction.
    """
    # Missing values are floats/None, which re.findall cannot scan.
    if not isinstance(sequence, str):
        return []

    extractions = []
    # Raw string: "\(" in a normal string literal is an invalid escape sequence.
    # The non-greedy body stops at the first ")" so adjacent groups stay separate.
    for quadruplet in re.findall(r"\(.*?\)", sequence):
        # Drop the surrounding "(" and ")".
        fields = quadruplet[1:-1].split(', ')
        if len(fields) != 4:
            # Malformed group (too few/many fields, e.g. a comma inside a term).
            fields = ['', '', '', '']
        extractions.append(tuple(field.strip().lower() for field in fields))
    return extractions

# Evaluator

In [52]:
import re

In [53]:
class Evaluator:
    """Micro-averaged precision / recall / F1 over extracted quadruplets.

    ``evaluate`` parses raw strings with ``extract_quadruplet`` and scores them;
    ``score`` works directly on already-parsed lists.
    """

    def __init__(self):
        # == Metrics ==
        # Exposed as instance attributes so existing callers of
        # ``evaluator.precision_fn(...)`` / ``recall_fn`` / ``f1_fn`` keep working.
        self.precision_fn = self._safe_ratio
        self.recall_fn = self._safe_ratio
        self.f1_fn = self._f1

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator as a float, or 0 when the denominator is 0."""
        return float(numerator) / float(denominator) if denominator != 0 else 0

    @staticmethod
    def _f1(precision, recall):
        """Return the harmonic mean of precision and recall, or 0 when both are 0."""
        if precision == 0 and recall == 0:
            return 0
        return (2 * precision * recall) / (precision + recall)

    def score(self, pred, gold):
        """Compute micro precision, recall and F1.

        Args:
            pred: Sequence of per-sample prediction lists.
            gold: Sequence of per-sample gold lists, aligned with ``pred``.

        Returns:
            dict with keys ``"precision"``, ``"recall"`` and ``"f1"``.

        Raises:
            AssertionError: if ``pred`` and ``gold`` differ in length.
        """
        assert len(pred) == len(gold)
        n_tp, n_gold, n_pred = 0, 0, 0

        # zip() iterates by position, so this also works for pandas Series whose
        # index is not 0..n-1 (``series[i]`` would be a label lookup there).
        for pred_items, gold_items in zip(pred, gold):
            n_gold += len(gold_items)
            n_pred += len(pred_items)

            # Each gold item may be matched at most once; otherwise a prediction
            # repeated k times would add k true positives and push recall above 1.
            unmatched = list(gold_items)
            for item in pred_items:
                if item in unmatched:
                    unmatched.remove(item)
                    n_tp += 1

        precision = self.precision_fn(n_tp, n_pred)
        recall = self.recall_fn(n_tp, n_gold)
        f1 = self.f1_fn(precision, recall)
        return {"precision": precision, "recall": recall, "f1": f1}

    # == Evaluation ==
    def evaluate(self, pred_seqs, gold_seqs):
        """Parse prediction and gold strings, then score them.

        Args:
            pred_seqs: Iterable of predicted quadruplet strings.
            gold_seqs: Iterable of gold quadruplet strings, aligned with
                ``pred_seqs``.

        Returns:
            tuple: ``(raw_scores, all_labels, all_preds)`` where ``raw_scores``
            is the dict from ``score`` and the other two are the parsed gold
            and predicted quadruplet lists, one entry per sample.

        Raises:
            AssertionError: if the two inputs differ in length.
        """
        assert len(pred_seqs) == len(gold_seqs)

        # Iterate values positionally rather than indexing with range(n).
        all_labels = [extract_quadruplet(seq) for seq in gold_seqs]
        all_preds = [extract_quadruplet(seq) for seq in pred_seqs]

        raw_scores = self.score(all_preds, all_labels)
        return raw_scores, all_labels, all_preds

In [55]:
# Score the saved predictions against the gold annotations.
# Argument order is (predictions, gold). Only the prediction column is cast to
# str here; the gold `quadruplet` column is passed through as loaded.
evaluator = Evaluator()
raw_scores, all_labels, all_preds = evaluator.evaluate(df['model_prediction'].astype('str'), df['quadruplet'])

In [56]:
# Display the precision / recall / F1 dict computed above.
raw_scores

{'precision': 0.3333333333333333,
 'recall': 0.3333333333333333,
 'f1': 0.3333333333333333}

# Preprocess

In [14]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import preprocessor as p
from src.slang_word import SLANG_WORDS
# not actually needed by the pipeline itself; imported only for testing
from transformers import (
    AutoTokenizer
)
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
class Preprocessor:
    """
    Clean raw text and tokenize it before it is handed to the fine tuner.

    Args:
        preprocess_type: which optional steps run after the basic cleaning:
            'p00' cleaning only, 'p01' + stopword removal, 'p02' + stemming,
            'p03' + stopword removal and stemming. Any other value behaves
            like 'p00'.
        tokenizer: Hugging Face tokenizer, called on the cleaned texts.
        tokenizer_max_length: ``max_length`` passed to the tokenizer
            (truncation is enabled).
        text_col: name of the column holding the raw text to clean.
        label_col: name of the column holding the target text to predict.

    ``preprocess_dataset`` returns whatever the tokenizer returns for the
    cleaned inputs and the targets.
    """

    # Variants that need each optional step. Both __init__ (which builds the
    # Sastrawi objects) and preprocess_dataset (which uses them) read these, so
    # the two can no longer disagree about which variant does what.
    _STOPWORD_TYPES = ('p01', 'p03')
    _STEM_TYPES = ('p02', 'p03')

    def __init__(self, 
                 preprocess_type:str , 
                 tokenizer, 
                 tokenizer_max_length:int, 
                 text_col:str,
                 label_col:str
        ):
        # Attribute name is misspelled ("prepocess"); kept as-is because other
        # code may already read it.
        self.prepocess_type = preprocess_type
        self.tokenizer = tokenizer
        self.tokenizer_max_length = tokenizer_max_length
        self.text_col = text_col
        self.label_col = label_col
        if self.prepocess_type in self._STOPWORD_TYPES:
            # create stopword remover
            self.stop_factory = StopWordRemoverFactory()
            self.stopword_remover = self.stop_factory.create_stop_word_remover()
        if self.prepocess_type in self._STEM_TYPES:
            # create stemmer
            self.factory = StemmerFactory()
            self.stemmer = self.factory.create_stemmer()

    def clean_repetitive(self, word):
        """Collapse every run of a repeated character to a single character.

        Returns the collapsed word, or ``''`` when fewer than two characters
        remain (e.g. ``"halooo"`` -> ``"halo"``, ``"aaa"`` -> ``""``).
        """
        collapsed = []
        for char in word:
            # Keep a character only when it differs from the one before it.
            if not collapsed or collapsed[-1] != char:
                collapsed.append(char)
        clean_word = ''.join(collapsed)
        # remove word if only 1 char left
        return clean_word if len(clean_word) > 1 else ''

    def clean_text(self, text):
        """Lower-case, run tweet-preprocessor, collapse repeats, expand slang."""
        # lower case
        text = text.lower()
        # clean text with tweet-preprocessor
        text = p.clean(text)
        # collapse repeated characters word by word (dropped words become '')
        text = " ".join([self.clean_repetitive(word) for word in text.split()])
        # replace slang words using the SLANG_WORDS dictionary; the second
        # split() also discards the empty strings left by dropped words
        text = " ".join([SLANG_WORDS[word] if word in SLANG_WORDS else word for word in text.split()])
        return text

    def stem(self, text):
        """Stem ``text`` with the Sastrawi stemmer built in ``__init__``."""
        return self.stemmer.stem(text)

    def stopword_removal(self, text):
        """Remove stopwords from ``text`` with the Sastrawi remover built in ``__init__``."""
        return self.stopword_remover.remove(text)

    def preprocess_dataset(self, examples):
        """Clean and tokenize one batch (for ``Dataset.map(..., batched=True)``).

        Args:
            examples: mapping of column name -> list of values for the batch.

        Returns:
            The tokenizer output for the cleaned ``text_col`` texts, with
            ``label_col`` passed as ``text_target``.
        """
        texts = [self.clean_text(text) for text in examples[self.text_col]]
        # Apply exactly the steps whose helper objects __init__ created.
        if self.prepocess_type in self._STOPWORD_TYPES:
            texts = [self.stopword_removal(text) for text in texts]
        if self.prepocess_type in self._STEM_TYPES:
            texts = [self.stem(text) for text in texts]
        targets = examples[self.label_col]
        tokenized_inputs = self.tokenizer(
            texts, text_target=targets, max_length=self.tokenizer_max_length, truncation=True
        )
        return tokenized_inputs
    

In [16]:
# Load the tokenizer that belongs to the pretrained checkpoint being fine-tuned.
model_pretrained_checkpoint = "Wikidepia/IndoT5-base"
tokenizer = AutoTokenizer.from_pretrained(model_pretrained_checkpoint)

In [19]:
# Preprocessing configuration.
max_length = 128          # passed to the tokenizer as max_length (truncation is on)
text_col = 'content'      # column with the raw tweet text
label_col = 'quadruplet'  # column with the target quadruplet string
# 'p00' matches none of the stopword/stemming branches in Preprocessor, so only
# the basic text cleaning is applied.
preprocessor = Preprocessor('p00', tokenizer, max_length, text_col, label_col)

In [21]:
# dataset
# Load the annotated CSV (load_dataset exposes it under the 'train' key).
raw_dataset = load_dataset('csv', data_files='../Data/quadruplet/quadruplet_only.csv')
# Hold out 20% as the test split; the fixed seed keeps the split reproducible.
splitted_dataset = raw_dataset['train'].train_test_split(test_size=0.2, seed=42)
# Clean + tokenize in batches and drop all original columns, leaving only the
# tokenizer outputs.
tokenized_dataset = splitted_dataset.map(preprocessor.preprocess_dataset, batched=True, remove_columns=splitted_dataset['train'].column_names)

Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-bf79bf034d0392b1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 999.60it/s]
Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-571d28bc462ba89a.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-24355355d1083472.arrow
                                                   

In [23]:
# Sanity check: decode the first tokenized test label back to text.
tokenizer.decode(tokenized_dataset['test']['labels'][0])

'(pesananku, udah bayar tapi kenapa dibatalin, negative, website&apps);</s>'

# Fine tune

In [24]:
from transformers import (
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    AutoTokenizer
)
import torch

In [26]:
class FineTuner:
    """Thin wrapper that fine-tunes a seq2seq model with ``Seq2SeqTrainer``.

    The model is moved to the first CUDA device when one is available
    (otherwise CPU), and the chosen device string is printed once.
    """

    def __init__(self, model, save_path, tokenizer, train_dataset, eval_dataset) -> None:
        # Pick the device first: the model is moved onto it right away.
        if torch.cuda.is_available():
            self.device = "cuda:0"
        else:
            self.device = "cpu"
        self.model = model.to(self.device)

        # Remember everything fine_tune() needs later.
        self.save_path = save_path
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset

        print(self.device)

    def fine_tune(self, arg):
        """Train with the given training arguments, then save the model.

        Args:
            arg: training arguments object handed to ``Seq2SeqTrainer``.

        The trained model is written to ``self.save_path`` via
        ``save_pretrained`` once training returns.
        """
        # The collator gets the model too, as in the standard seq2seq setup.
        collator = DataCollatorForSeq2Seq(self.tokenizer, model=self.model)
        seq2seq_trainer = Seq2SeqTrainer(
            model=self.model,
            args=arg,
            data_collator=collator,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self.tokenizer,
        )
        seq2seq_trainer.train()
        self.model.save_pretrained(self.save_path)

In [27]:
# constants
# NOTE(review): the f-prefix is unnecessary here, the string has no placeholders.
save_path = f'../models/test_fine_tuner'
model_pretrained_checkpoint = "Wikidepia/IndoT5-base"
# Tokenizer and model are both loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_pretrained_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_pretrained_checkpoint)
# training arguments
# Evaluate and checkpoint once per epoch; keep at most 2 checkpoints on disk.
# NOTE(review): FineTuner.fine_tune calls trainer.train() without arguments, so
# confirm whether resume_from_checkpoint=True has any effect on this run.
training_args = Seq2SeqTrainingArguments(
    save_path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    resume_from_checkpoint=True,
    num_train_epochs=5,
    save_total_limit=2,
)
# The held-out test split doubles as the evaluation set during training.
finetuner = FineTuner(model=model, save_path=save_path, tokenizer=tokenizer, 
                      train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['test'])
finetuner.fine_tune(training_args)



cuda:0


  0%|          | 0/195 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                
 20%|██        | 39/195 [00:09<00:29,  5.34it/s]

{'eval_loss': 3.2513034343719482, 'eval_runtime': 0.4241, 'eval_samples_per_second': 181.551, 'eval_steps_per_second': 23.578, 'epoch': 1.0}


                                                
 40%|████      | 78/195 [00:40<00:22,  5.31it/s]

{'eval_loss': 2.5867531299591064, 'eval_runtime': 0.4142, 'eval_samples_per_second': 185.921, 'eval_steps_per_second': 24.146, 'epoch': 2.0}


                                                 
 60%|██████    | 117/195 [01:15<00:12,  6.06it/s]

{'eval_loss': 2.3198323249816895, 'eval_runtime': 0.3976, 'eval_samples_per_second': 193.646, 'eval_steps_per_second': 25.149, 'epoch': 3.0}


                                                 
 80%|████████  | 156/195 [01:49<00:06,  6.34it/s]

{'eval_loss': 2.185319185256958, 'eval_runtime': 0.3817, 'eval_samples_per_second': 201.746, 'eval_steps_per_second': 26.201, 'epoch': 4.0}


                                                 
100%|██████████| 195/195 [02:23<00:00,  5.48it/s]

{'eval_loss': 2.147050619125366, 'eval_runtime': 0.4064, 'eval_samples_per_second': 189.472, 'eval_steps_per_second': 24.607, 'epoch': 5.0}


100%|██████████| 195/195 [02:53<00:00,  1.12it/s]


{'train_runtime': 173.2776, 'train_samples_per_second': 8.887, 'train_steps_per_second': 1.125, 'train_loss': 2.9481980543870194, 'epoch': 5.0}


# Inference

In [46]:
from transformers import (
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
)
import torch
from tqdm import tqdm

In [47]:
class ModelInference:
    """Generate text predictions for a tokenized dataset in fixed-size batches.

    Args:
        batch_size: number of rows sent to ``model.generate`` per call.
        dataset: tokenized dataset; must support ``dataset['input_ids']`` and
            integer row access ``dataset[i]``.
        model: seq2seq model exposing ``to`` and ``generate``.
        tokenizer: tokenizer used by the data collator and for decoding.
    """

    def __init__(self, batch_size, dataset, model, tokenizer) -> None:
        self.batch_size = batch_size
        # Use the first CUDA device when available, otherwise CPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = model.to(self.device)
        self.datacollator = DataCollatorForSeq2Seq(tokenizer, model=model)
        self.dataset = dataset
        self.tokenizer = tokenizer

    def inference(self, max_length=100):
        """Run generation over the whole dataset.

        Args:
            max_length: ``max_length`` forwarded to ``generate`` (default 100,
                the value that used to be hard-coded).

        Returns:
            list[str]: decoded predictions (special tokens skipped), one per
            dataset row, in dataset order.
        """
        # Read the column once; it is needed for the row count only.
        n_rows = len(self.dataset['input_ids'])
        print(n_rows)

        # Collate all rows in one go so every batch shares the same padded length.
        batch = self.datacollator([self.dataset[i] for i in range(n_rows)])
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask'] if 'attention_mask' in batch else None

        pred_text = []
        for start in tqdm(range(0, len(input_ids), self.batch_size)):
            stop = start + self.batch_size
            generate_kwargs = {'max_length': max_length}
            if attention_mask is not None:
                # Pass the padding mask explicitly instead of relying on
                # generate() to infer it from the pad token.
                generate_kwargs['attention_mask'] = attention_mask[start:stop].to(self.device)
            # Use this instance's model and device: the previous code reached for
            # a global `model` and a hard-coded 'cuda', which fails without a
            # GPU or without a notebook-level `model` variable.
            generated_text = self.model.generate(input_ids[start:stop].to(self.device), **generate_kwargs)
            pred_text += self.tokenizer.batch_decode(generated_text, skip_special_tokens=True)
        return pred_text

In [48]:
# constants
# NOTE: this rebinds `save_path` to a different directory than the one used by
# the fine-tuning cell above ('../models/test_fine_tuner'), so the weights
# loaded here are not the ones that cell saved.
# NOTE(review): the f-prefix is unnecessary, the string has no placeholders.
save_path = f'../models/pt-indot5'
# .to('cuda') requires a GPU to be present.
model = T5ForConditionalGeneration.from_pretrained(save_path).to('cuda')

In [50]:
# Generate predictions for the tokenized test split, 8 rows per generate() call.
# Relies on `tokenizer` and `tokenized_dataset` from the earlier cells.
model_inference = ModelInference(batch_size=8, dataset=tokenized_dataset['test'], model=model, tokenizer=tokenizer)
pred_text = model_inference.inference()

77


100%|██████████| 10/10 [00:08<00:00,  1.21it/s]


In [51]:
# Attach the predictions to the untokenized test rows and save them; the
# evaluation cell at the top of the notebook reads this CSV back.
# Assumes `pred_text` is in the same row order as splitted_dataset['test'],
# which the tokenized test split was mapped from.
test_dataset = splitted_dataset['test']
# NOTE(review): the f-prefix is unnecessary, the string has no placeholders.
test_dataset = test_dataset.add_column(f'model_prediction', pred_text)
test_dataset.to_csv('../Data/quadruplet/model_inference_300_data.csv')

Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-37557ea6cb933a33.arrow
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 142.94ba/s]


37874

# Test pipeline dengan data sedikit
pakai data yg cuma 300an

In [1]:
from src.finetuner import FineTuner
from src.preprocessor import Preprocessor
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    Seq2SeqTrainingArguments
)
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Dataset

In [2]:
# constants
max_length = 128          # tokenizer max_length
text_col = 'content'      # column with the raw text
label_col = 'quadruplet'  # column with the target quadruplet string
preprocess_type = 'p00'   # preprocessing variant handed to Preprocessor
# NOTE(review): the f-prefix is unnecessary, the string has no placeholders.
SAVE_PATH = f'../models/test_fine_tuner_with_300_data'
PRETRAINED_MODEL = "Wikidepia/IndoT5-base"
DATA_PATH = '../Data/quadruplet/quadruplet_only.csv'

In [3]:
# Tokenizer of the pretrained checkpoint, plus the preprocessor built around it
# (here the packaged version imported from src.preprocessor).
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
preprocessor = Preprocessor(preprocess_type, tokenizer, max_length, text_col, label_col)

In [4]:
# Load the CSV (exposed under the 'train' key), hold out 20% as the test split
# with a fixed seed, then preprocess in batches and drop the original columns.
raw_dataset = load_dataset('csv', data_files=DATA_PATH)
splitted_dataset = raw_dataset['train'].train_test_split(test_size=0.2, seed=42)
tokenized_dataset = splitted_dataset.map(preprocessor.preprocess_dataset, batched=True, remove_columns=splitted_dataset['train'].column_names)

Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-bf79bf034d0392b1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 677.48it/s]
Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-571d28bc462ba89a.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-24355355d1083472.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6635831735983f73.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853b

In [5]:
# Show the splits, their remaining feature columns and row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 308
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 77
    })
})

In [6]:
# Raw text of the first test example, for comparison with its cleaned version below.
splitted_dataset['test']['content'][0]

'@tokopedia min aku udah bayar tapi kenapa dibatalin pesanan ku?'

In [7]:
# Decode the first tokenized test input to see what the preprocessing produced.
tokenizer.decode(tokenized_dataset['test']['input_ids'][0])

'min aku sudah bayar tapi kenapa dibatalin pesanan ku?</s>'

## Model training

In [8]:
# Start from the pretrained checkpoint weights.
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)

In [9]:
#training argument
training_args = Seq2SeqTrainingArguments(
    SAVE_PATH,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    resume_from_checkpoint=True,
    num_train_epochs=100,
    save_total_limit=2,
)

In [10]:
# The held-out test split doubles as the evaluation set during training.
finetuner = FineTuner(model=model, save_path=SAVE_PATH, tokenizer=tokenizer, 
                      train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['test'])

cuda:0


In [11]:
# Launch training. In the recorded run this was stopped by hand
# (KeyboardInterrupt) after epoch 57; eval_loss was lowest around epoch 37
# (about 0.841) and rose afterwards.
finetuner.fine_tune(training_args)

  0%|          | 0/3900 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
  1%|          | 39/3900 [00:09<12:09,  5.29it/s]

{'eval_loss': 3.16196346282959, 'eval_runtime': 0.4602, 'eval_samples_per_second': 167.325, 'eval_steps_per_second': 21.73, 'epoch': 1.0}


                                                   
  2%|▏         | 78/3900 [00:40<10:59,  5.79it/s]

{'eval_loss': 2.395500659942627, 'eval_runtime': 0.4113, 'eval_samples_per_second': 187.215, 'eval_steps_per_second': 24.314, 'epoch': 2.0}


                                                   
  3%|▎         | 117/3900 [01:10<11:20,  5.56it/s]

{'eval_loss': 2.009577512741089, 'eval_runtime': 0.4159, 'eval_samples_per_second': 185.134, 'eval_steps_per_second': 24.043, 'epoch': 3.0}


                                                    
  4%|▍         | 156/3900 [01:40<10:52,  5.74it/s]

{'eval_loss': 1.7476584911346436, 'eval_runtime': 0.398, 'eval_samples_per_second': 193.452, 'eval_steps_per_second': 25.124, 'epoch': 4.0}


                                                    
  5%|▌         | 195/3900 [02:14<11:21,  5.43it/s]

{'eval_loss': 1.6053522825241089, 'eval_runtime': 0.4155, 'eval_samples_per_second': 185.309, 'eval_steps_per_second': 24.066, 'epoch': 5.0}


                                                    
  6%|▌         | 234/3900 [02:46<11:03,  5.52it/s]

{'eval_loss': 1.5093351602554321, 'eval_runtime': 0.4228, 'eval_samples_per_second': 182.139, 'eval_steps_per_second': 23.654, 'epoch': 6.0}


                                                    
  7%|▋         | 273/3900 [03:15<10:49,  5.59it/s]

{'eval_loss': 1.451578140258789, 'eval_runtime': 0.399, 'eval_samples_per_second': 193.0, 'eval_steps_per_second': 25.065, 'epoch': 7.0}


                                                    
  8%|▊         | 312/3900 [03:49<10:14,  5.84it/s]

{'eval_loss': 1.394089698791504, 'eval_runtime': 0.4039, 'eval_samples_per_second': 190.627, 'eval_steps_per_second': 24.757, 'epoch': 8.0}


                                                    
  9%|▉         | 351/3900 [04:19<10:16,  5.76it/s]

{'eval_loss': 1.341862678527832, 'eval_runtime': 0.414, 'eval_samples_per_second': 186.005, 'eval_steps_per_second': 24.157, 'epoch': 9.0}


                                                    
 10%|█         | 390/3900 [04:50<11:14,  5.20it/s]

{'eval_loss': 1.3095558881759644, 'eval_runtime': 0.4258, 'eval_samples_per_second': 180.833, 'eval_steps_per_second': 23.485, 'epoch': 10.0}


                                                    
 11%|█         | 429/3900 [05:24<10:54,  5.31it/s]

{'eval_loss': 1.2794133424758911, 'eval_runtime': 0.409, 'eval_samples_per_second': 188.245, 'eval_steps_per_second': 24.447, 'epoch': 11.0}


                                                    
 12%|█▏        | 468/3900 [05:55<10:37,  5.39it/s]

{'eval_loss': 1.244598388671875, 'eval_runtime': 0.4159, 'eval_samples_per_second': 185.126, 'eval_steps_per_second': 24.042, 'epoch': 12.0}


 13%|█▎        | 501/3900 [06:22<12:40,  4.47it/s]  

{'loss': 1.8139, 'learning_rate': 1.7435897435897438e-05, 'epoch': 12.82}


                                                  
 13%|█▎        | 507/3900 [06:24<10:27,  5.41it/s]

{'eval_loss': 1.2117669582366943, 'eval_runtime': 0.4101, 'eval_samples_per_second': 187.775, 'eval_steps_per_second': 24.386, 'epoch': 13.0}


                                                    
 14%|█▍        | 546/3900 [06:56<09:42,  5.75it/s]

{'eval_loss': 1.1925514936447144, 'eval_runtime': 0.411, 'eval_samples_per_second': 187.361, 'eval_steps_per_second': 24.333, 'epoch': 14.0}


                                                    
 15%|█▌        | 585/3900 [07:25<09:16,  5.96it/s]

{'eval_loss': 1.1650327444076538, 'eval_runtime': 0.408, 'eval_samples_per_second': 188.703, 'eval_steps_per_second': 24.507, 'epoch': 15.0}


                                                    
 16%|█▌        | 624/3900 [07:56<10:06,  5.40it/s]

{'eval_loss': 1.1526316404342651, 'eval_runtime': 0.4189, 'eval_samples_per_second': 183.83, 'eval_steps_per_second': 23.874, 'epoch': 16.0}


                                                    
 17%|█▋        | 663/3900 [08:28<09:19,  5.79it/s]

{'eval_loss': 1.1317026615142822, 'eval_runtime': 0.4245, 'eval_samples_per_second': 181.37, 'eval_steps_per_second': 23.555, 'epoch': 17.0}


                                                    
 18%|█▊        | 702/3900 [08:57<09:13,  5.77it/s]

{'eval_loss': 1.1117113828659058, 'eval_runtime': 0.4079, 'eval_samples_per_second': 188.767, 'eval_steps_per_second': 24.515, 'epoch': 18.0}


                                                    
 19%|█▉        | 741/3900 [09:27<09:27,  5.57it/s]

{'eval_loss': 1.0954102277755737, 'eval_runtime': 0.4609, 'eval_samples_per_second': 167.078, 'eval_steps_per_second': 21.698, 'epoch': 19.0}


                                                    
 20%|██        | 780/3900 [09:57<08:58,  5.79it/s]

{'eval_loss': 1.0710582733154297, 'eval_runtime': 0.405, 'eval_samples_per_second': 190.145, 'eval_steps_per_second': 24.694, 'epoch': 20.0}


                                                    
 21%|██        | 819/3900 [10:28<09:21,  5.49it/s]

{'eval_loss': 1.0458658933639526, 'eval_runtime': 0.4134, 'eval_samples_per_second': 186.245, 'eval_steps_per_second': 24.188, 'epoch': 21.0}


                                                    
 22%|██▏       | 858/3900 [10:59<09:46,  5.18it/s]

{'eval_loss': 1.0285292863845825, 'eval_runtime': 0.4139, 'eval_samples_per_second': 186.033, 'eval_steps_per_second': 24.16, 'epoch': 22.0}


                                                    
 23%|██▎       | 897/3900 [11:30<10:45,  4.66it/s]

{'eval_loss': 1.0133144855499268, 'eval_runtime': 0.4114, 'eval_samples_per_second': 187.157, 'eval_steps_per_second': 24.306, 'epoch': 23.0}


                                                    
 24%|██▍       | 936/3900 [12:01<08:30,  5.81it/s]

{'eval_loss': 0.9914776682853699, 'eval_runtime': 0.4048, 'eval_samples_per_second': 190.199, 'eval_steps_per_second': 24.701, 'epoch': 24.0}


                                                    
 25%|██▌       | 975/3900 [12:30<08:07,  6.00it/s]

{'eval_loss': 0.9681766033172607, 'eval_runtime': 0.4067, 'eval_samples_per_second': 189.336, 'eval_steps_per_second': 24.589, 'epoch': 25.0}


 26%|██▌       | 1001/3900 [13:00<09:26,  5.11it/s] 

{'loss': 0.778, 'learning_rate': 1.4871794871794874e-05, 'epoch': 25.64}


                                                   
 26%|██▌       | 1014/3900 [13:03<08:23,  5.73it/s]

{'eval_loss': 0.9572198987007141, 'eval_runtime': 0.4066, 'eval_samples_per_second': 189.398, 'eval_steps_per_second': 24.597, 'epoch': 26.0}


                                                     
 27%|██▋       | 1053/3900 [13:34<07:57,  5.96it/s]

{'eval_loss': 0.9366672039031982, 'eval_runtime': 0.4081, 'eval_samples_per_second': 188.659, 'eval_steps_per_second': 24.501, 'epoch': 27.0}


                                                     
 28%|██▊       | 1092/3900 [14:04<08:31,  5.49it/s]

{'eval_loss': 0.9187056422233582, 'eval_runtime': 0.4222, 'eval_samples_per_second': 182.398, 'eval_steps_per_second': 23.688, 'epoch': 28.0}


                                                     
 29%|██▉       | 1131/3900 [14:35<09:07,  5.06it/s]

{'eval_loss': 0.8971647620201111, 'eval_runtime': 0.477, 'eval_samples_per_second': 161.424, 'eval_steps_per_second': 20.964, 'epoch': 29.0}


                                                     
 30%|███       | 1170/3900 [15:06<07:54,  5.76it/s]

{'eval_loss': 0.8863543272018433, 'eval_runtime': 0.3908, 'eval_samples_per_second': 197.037, 'eval_steps_per_second': 25.589, 'epoch': 30.0}


                                                     
 31%|███       | 1209/3900 [15:35<07:50,  5.72it/s]

{'eval_loss': 0.8811765313148499, 'eval_runtime': 0.4054, 'eval_samples_per_second': 189.931, 'eval_steps_per_second': 24.666, 'epoch': 31.0}


                                                     
 32%|███▏      | 1248/3900 [16:05<08:01,  5.51it/s]

{'eval_loss': 0.8534305095672607, 'eval_runtime': 0.3997, 'eval_samples_per_second': 192.666, 'eval_steps_per_second': 25.022, 'epoch': 32.0}


                                                     
 33%|███▎      | 1287/3900 [16:36<07:42,  5.65it/s]

{'eval_loss': 0.8523573279380798, 'eval_runtime': 0.4093, 'eval_samples_per_second': 188.14, 'eval_steps_per_second': 24.434, 'epoch': 33.0}


                                                     
 34%|███▍      | 1326/3900 [17:04<07:27,  5.76it/s]

{'eval_loss': 0.8454421162605286, 'eval_runtime': 0.3896, 'eval_samples_per_second': 197.623, 'eval_steps_per_second': 25.665, 'epoch': 34.0}


                                                     
 35%|███▌      | 1365/3900 [17:35<06:55,  6.09it/s]

{'eval_loss': 0.8551997542381287, 'eval_runtime': 0.4136, 'eval_samples_per_second': 186.181, 'eval_steps_per_second': 24.179, 'epoch': 35.0}


                                                     
 36%|███▌      | 1404/3900 [18:14<08:20,  4.99it/s]

{'eval_loss': 0.8509798645973206, 'eval_runtime': 0.4287, 'eval_samples_per_second': 179.605, 'eval_steps_per_second': 23.325, 'epoch': 36.0}


                                                     
 37%|███▋      | 1443/3900 [18:44<07:24,  5.53it/s]

{'eval_loss': 0.8410384654998779, 'eval_runtime': 0.4292, 'eval_samples_per_second': 179.414, 'eval_steps_per_second': 23.3, 'epoch': 37.0}


                                                     
 38%|███▊      | 1482/3900 [19:16<06:59,  5.76it/s]

{'eval_loss': 0.8459141850471497, 'eval_runtime': 0.4052, 'eval_samples_per_second': 190.017, 'eval_steps_per_second': 24.678, 'epoch': 38.0}


 38%|███▊      | 1501/3900 [19:44<08:57,  4.47it/s]  

{'loss': 0.3848, 'learning_rate': 1.230769230769231e-05, 'epoch': 38.46}


                                                   
 39%|███▉      | 1521/3900 [19:48<06:58,  5.69it/s]

{'eval_loss': 0.848955512046814, 'eval_runtime': 0.4193, 'eval_samples_per_second': 183.652, 'eval_steps_per_second': 23.851, 'epoch': 39.0}


                                                     
 40%|████      | 1560/3900 [20:19<06:39,  5.86it/s]

{'eval_loss': 0.8472787141799927, 'eval_runtime': 0.3993, 'eval_samples_per_second': 192.846, 'eval_steps_per_second': 25.045, 'epoch': 40.0}


                                                     
 41%|████      | 1599/3900 [20:53<06:40,  5.74it/s]

{'eval_loss': 0.8640817403793335, 'eval_runtime': 0.4037, 'eval_samples_per_second': 190.758, 'eval_steps_per_second': 24.774, 'epoch': 41.0}


                                                     
 42%|████▏     | 1638/3900 [21:25<06:43,  5.61it/s]

{'eval_loss': 0.872225284576416, 'eval_runtime': 0.4043, 'eval_samples_per_second': 190.454, 'eval_steps_per_second': 24.734, 'epoch': 42.0}


                                                     
 43%|████▎     | 1677/3900 [21:55<06:36,  5.61it/s]

{'eval_loss': 0.8672143220901489, 'eval_runtime': 0.4521, 'eval_samples_per_second': 170.301, 'eval_steps_per_second': 22.117, 'epoch': 43.0}


                                                     
 44%|████▍     | 1716/3900 [22:28<07:53,  4.61it/s]

{'eval_loss': 0.8820900917053223, 'eval_runtime': 0.4879, 'eval_samples_per_second': 157.831, 'eval_steps_per_second': 20.498, 'epoch': 44.0}


                                                     
 45%|████▌     | 1755/3900 [23:05<06:48,  5.25it/s]

{'eval_loss': 0.8892061114311218, 'eval_runtime': 0.4312, 'eval_samples_per_second': 178.574, 'eval_steps_per_second': 23.191, 'epoch': 45.0}


                                                     
 46%|████▌     | 1794/3900 [23:34<06:49,  5.14it/s]

{'eval_loss': 0.9008430242538452, 'eval_runtime': 0.4127, 'eval_samples_per_second': 186.586, 'eval_steps_per_second': 24.232, 'epoch': 46.0}


                                                     
 47%|████▋     | 1833/3900 [24:05<06:46,  5.08it/s]

{'eval_loss': 0.9102492928504944, 'eval_runtime': 0.4249, 'eval_samples_per_second': 181.24, 'eval_steps_per_second': 23.538, 'epoch': 47.0}


                                                     
 48%|████▊     | 1872/3900 [24:36<05:43,  5.90it/s]

{'eval_loss': 0.9198119044303894, 'eval_runtime': 0.4034, 'eval_samples_per_second': 190.867, 'eval_steps_per_second': 24.788, 'epoch': 48.0}


                                                     
 49%|████▉     | 1911/3900 [25:10<05:53,  5.62it/s]

{'eval_loss': 0.9303978085517883, 'eval_runtime': 0.4053, 'eval_samples_per_second': 190.0, 'eval_steps_per_second': 24.675, 'epoch': 49.0}


                                                     
 50%|█████     | 1950/3900 [25:41<05:34,  5.83it/s]

{'eval_loss': 0.922232985496521, 'eval_runtime': 0.4028, 'eval_samples_per_second': 191.144, 'eval_steps_per_second': 24.824, 'epoch': 50.0}


                                                     
 51%|█████     | 1989/3900 [26:13<05:52,  5.42it/s]

{'eval_loss': 0.9388132691383362, 'eval_runtime': 0.4191, 'eval_samples_per_second': 183.71, 'eval_steps_per_second': 23.858, 'epoch': 51.0}


 51%|█████▏    | 2001/3900 [26:42<11:16,  2.81it/s]  

{'loss': 0.2081, 'learning_rate': 9.743589743589744e-06, 'epoch': 51.28}


                                                   
 52%|█████▏    | 2028/3900 [26:47<05:31,  5.65it/s]

{'eval_loss': 0.9570145010948181, 'eval_runtime': 0.4013, 'eval_samples_per_second': 191.863, 'eval_steps_per_second': 24.917, 'epoch': 52.0}


                                                     
 53%|█████▎    | 2067/3900 [27:19<05:23,  5.66it/s]

{'eval_loss': 0.959748387336731, 'eval_runtime': 0.4058, 'eval_samples_per_second': 189.771, 'eval_steps_per_second': 24.646, 'epoch': 53.0}


                                                     
 54%|█████▍    | 2106/3900 [27:50<05:23,  5.55it/s]

{'eval_loss': 0.9719874262809753, 'eval_runtime': 0.4025, 'eval_samples_per_second': 191.288, 'eval_steps_per_second': 24.843, 'epoch': 54.0}


                                                     
 55%|█████▌    | 2145/3900 [28:26<05:37,  5.19it/s]

{'eval_loss': 0.9694761037826538, 'eval_runtime': 0.4258, 'eval_samples_per_second': 180.85, 'eval_steps_per_second': 23.487, 'epoch': 55.0}


                                                     
 56%|█████▌    | 2184/3900 [28:59<05:47,  4.94it/s]

{'eval_loss': 0.9796651601791382, 'eval_runtime': 0.4595, 'eval_samples_per_second': 167.561, 'eval_steps_per_second': 21.761, 'epoch': 56.0}


                                                     
 57%|█████▋    | 2223/3900 [29:32<05:32,  5.05it/s]

{'eval_loss': 0.9842897057533264, 'eval_runtime': 0.4498, 'eval_samples_per_second': 171.2, 'eval_steps_per_second': 22.234, 'epoch': 57.0}


KeyboardInterrupt: 

In [12]:
# Training above was interrupted, so save the current in-memory weights by hand.
# Presumably fine_tune would otherwise have saved them itself after training;
# verify against src.finetuner.
model.save_pretrained(SAVE_PATH)

## Inference

In [13]:
from src.inference import ModelInference
from src.preprocessor import Preprocessor
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
)
from datasets import load_dataset

In [24]:
# Configuration constants for the inference / evaluation run.
max_length = 128  # passed to Preprocessor
text_col = 'content'  # name of the input text column
label_col = 'quadruplet'  # name of the gold-label column
preprocess_type = 'p00'  # passed to Preprocessor; also prefixes the prediction column name
# Plain string literal: the previous f-string had no placeholders to interpolate.
SAVE_PATH = '../models/pt-indot5'
PRETRAINED_MODEL = "Wikidepia/IndoT5-base"
DATA_PATH = '../Data/quadruplet/quadruplet_only.csv'

In [25]:
# Load the model saved at SAVE_PATH. Pick the GPU when one is available and
# fall back to CPU otherwise, instead of failing on a hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = T5ForConditionalGeneration.from_pretrained(SAVE_PATH).to(device)
# The tokenizer is loaded from the base checkpoint name, not from SAVE_PATH.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

In [26]:
# Build the preprocessor from the configuration constants.
preprocessor = Preprocessor(
    preprocess_type,
    tokenizer,
    max_length,
    text_col,
    label_col,
)

# Load the CSV, hold out 20% of the rows as the test split (fixed seed), then
# apply preprocessor.preprocess_dataset to both splits in batches, dropping the
# original columns from the result.
raw_dataset = load_dataset('csv', data_files=DATA_PATH)
splitted_dataset = raw_dataset['train'].train_test_split(test_size=0.2, seed=42)
tokenized_dataset = splitted_dataset.map(
    preprocessor.preprocess_dataset,
    batched=True,
    remove_columns=splitted_dataset['train'].column_names,
)

Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-bf79bf034d0392b1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 497.37it/s]
Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-571d28bc462ba89a.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-24355355d1083472.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6635831735983f73.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853b

In [27]:
# Run the model over the preprocessed test split in batches of 8 and collect
# the predicted text for each row.
model_inference = ModelInference(
    model=model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset['test'],
    batch_size=8,
)
pred_text = model_inference.inference()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 10/10 [00:08<00:00,  1.16it/s]


In [28]:
# Attach the predictions as a new column on the un-preprocessed test split,
# then write the result to CSV.
test_dataset = splitted_dataset['test'].add_column(
    f'{preprocess_type}_model_prediction',
    pred_text,
)
test_dataset.to_csv('../Data/quadruplet/model_inference_300_data.csv')

Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-bf79bf034d0392b1\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-37557ea6cb933a33.arrow
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 139.76ba/s]


37878

## Evaluate

In [29]:
import pandas as pd
from src.evaluator import Evaluator
from src.postprocessor import PostProcessor

In [36]:
# Reload the saved predictions and preview the first five rows.
df = pd.read_csv('../Data/quadruplet/model_inference_300_data.csv')
df.head()

Unnamed: 0,original_id,content,clean_tweet,final_sentiment,labels,quadruplet,spam,sentiment_label,p00_model_prediction
0,1.64e+18,@tokopedia min aku udah bayar tapi kenapa diba...,min aku udah bayar tapi kenapa dibatalin pesan...,negative,payment; produk;,"(pesananku, udah bayar tapi kenapa dibatalin, ...",,sentiment,"(pesananku, sudah bayar tapi kenapa dibatalin,..."
1,1.64e+18,Seperti biasa menghibungi call center @sicepat...,Seperti biasa menghibungi call center hanya be...,negative,delivery; produk;,"(call center, hanya berbelit belit, negative, ...",,sentiment,"(cal center, hanya berbelit belit, negative, c..."
2,1.64e+18,@tanyakanrl alfa kalo jumat-minggu ada promo t...,alfa kalo jumat-minggu ada promo tuh jd murah ...,neutral,price; produk;,"(alfa, kalo jumat-minggu ada promo, positive, ...",,sentiment,"(alfa, kalau jumat-mingu ada promo, positive, ..."
3,1.64e+18,@tokopedia apakah tokopedia care sedang gangguan?,apakah tokopedia care sedang gangguan?,negative,website&apps; produk;,"(tokopedia, gangguan, negative, website&apps);",,sentiment,"(tokopedia care, sedang ganguan, neutral, webs..."
4,1.64e+18,"Beli token listrik lewat tokped error, direjec...","Beli token listrik lewat tokped error, direjec...",negative,website&apps; payment; produk;,"(token listrik, lewat tokped error, negative, ...",,sentiment,"(token listrik, lewat tokped eror, negative, w..."


In [31]:
# Set up the quadruplet evaluator with postprocessing switched off.
postprocessor = PostProcessor(use_postprocess=False)
evaluator = Evaluator(
    postprocessor=postprocessor,
    task_type='quadruplet',
)

In [32]:
# Compare the predicted sequences against the gold sequences; the evaluator
# returns the scores together with the labels and predictions it compared.
raw_scores, all_labels, all_preds = evaluator.evaluate(
    gold_seqs=df['quadruplet'],
    pred_seqs=df['p00_model_prediction'],
)

100%|██████████| 77/77 [00:00<00:00, 76913.89it/s]


In [33]:
# Display the scores returned by evaluator.evaluate (precision, recall, f1).
raw_scores

{'precision': 0.3333333333333333,
 'recall': 0.3333333333333333,
 'f1': 0.3333333333333333}

In [35]:
# Display the labels returned by evaluator.evaluate for the gold sequences
# (rendered below as one list of 4-tuples per row).
all_labels

[[('pesananku',
   'udah bayar tapi kenapa dibatalin',
   'negative',
   'website&apps')],
 [('call center', 'hanya berbelit belit', 'negative', 'customerservice'),
  ('transaksi', 'tdk bisa di cancel', 'negative', 'delivery')],
 [('alfa', 'kalo jumat-minggu ada promo', 'positive', 'price')],
 [('tokopedia', 'gangguan', 'negative', 'website&apps')],
 [('token listrik', 'lewat tokped error', 'negative', 'website&apps')],
 [('default paymentnya', 'tanpa ada konfirmasi ulang', 'negative', 'payment')],
 [('payday', 'bulan ini gak dapet apa-apa', 'neutral', 'product')],
 [('tiktok shop', 'dapat harga plg murah', 'positive', 'price')],
 [('tokopedia', 'makin ke sini makin pelit aja', 'negative', 'price')],
 [('ghd', 'lebi lancar nyatok rambutnya', 'positive', 'product'),
  ('hairbeauron', 'lebih tersendat', 'negative', 'product'),
  ('catokan mahal', 'gak bau gosong', 'positive', 'product')],
 [('tokopedia', 'ongkir kelewat mahal', 'negative', 'delivery'),
  ('tokopedia', 'sistem gak aman', 

In [34]:
# Display the predictions returned by evaluator.evaluate for the model output
# (rendered below as one list of 4-tuples per row).
all_preds

[[('pesananku',
   'sudah bayar tapi kenapa dibatalin',
   'negative',
   'website&apps')],
 [('cal center', 'hanya berbelit belit', 'negative', 'customerservice'),
  ('transaksi',
   'tidak bisa di cancel karena sudah status menungu',
   'negative',
   'delivery')],
 [('alfa', 'kalau jumat-mingu ada promo', 'positive', 'price')],
 [('tokopedia care', 'sedang ganguan', 'neutral', 'website&apps')],
 [('token listrik', 'lewat tokped eror', 'negative', 'website&apps')],
 [('default paymentnya', 'tanpa ada konfirmasi ulang', 'negative', 'payment')],
 [('payday', 'bulan ini gak dapat apa-apa', 'neutral', 'product')],
 [('tokopedia', 'dapat harga paling murah', 'positive', 'price'),
  ('shopee', 'dapat harga paling murah', 'positive', 'price')],
 [('tokopedia', 'makin ke sini makin pelit saja', 'negative', 'price')],
 [('ghd', 'lebi lancar nyatok rambutnya', 'positive', 'product'),
  ('hairbeauron', 'lebih tersendat', 'negative', 'product')],
 [('tokopedia', 'ongkos kirim kelewat mahal', 'ne