In [1]:
from src.finetuner import FineTuner
from src.preprocessor import Preprocessor
from transformers import (
    T5ForConditionalGeneration,
    #BartForConditionalGeneration,
    AutoTokenizer,
    Seq2SeqTrainingArguments
)
from datasets import load_dataset
#from indobenchmark import IndoNLGTokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

In [2]:
# --- Experiment configuration -------------------------------------------
# CSV columns: raw text and the target quintuplet string.
text_col = 'content'
label_col = 'postprocess_quintuplet'

# Sequence length handed to the Preprocessor.
max_length = 30

# Preprocessing variant, and the experiment tag used to name the output folder.
preprocess_type = 'p02'
experiment_type = 'p22'

# Input data, tokenizer source, starting weights, and output location.
DATA_PATH = '../Data/quintuplet/quintuplet_postprocessed.csv'
TOKENIZER_PATH = "Wikidepia/IndoT5-base"  # alternative: "indobenchmark/indobart-v2"
# Other starting points tried: "indobenchmark/indobart-v2",
# "Wikidepia/IndoT5-base", "../models/pt-indot5-MLM_PT"
PRETRAINED_MODEL = "../models/pt-indot5-MLM_TA_PT"
SAVE_PATH = f'../models/{experiment_type}_quintuplet'

In [3]:
# Load the tokenizer, then build the project Preprocessor around it.
# (BART alternative: IndoNLGTokenizer.from_pretrained(TOKENIZER_PATH))
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
preprocessor = Preprocessor(
    preprocess_type,
    tokenizer,
    max_length,
    text_col,
    label_col,
)

In [4]:
# Read the single CSV (exposed under the 'train' key), hold out 100 rows for
# evaluation, then run the preprocessor over both splits, keeping only the
# columns it produces.
#
# Alternatives kept for reference:
#   - pre-split files: load_dataset('csv', data_files={'train': f'{DATA_PATH}_train.csv',
#                                                      'test': f'{DATA_PATH}_test.csv',
#                                                      'val': f'{DATA_PATH}_val.csv'})
#   - drop comparative rows: raw_dataset.filter(lambda x: x['is_comparative']==False)
raw_dataset = load_dataset('csv', data_files=DATA_PATH)
source_columns = raw_dataset['train'].column_names
splitted_dataset = raw_dataset['train'].train_test_split(test_size=100, seed=42)
tokenized_dataset = splitted_dataset.map(
    preprocessor.preprocess_dataset,
    batched=True,
    remove_columns=source_columns,
)

Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-7c3f713988208aa5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 399.23it/s]
Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-06cdaa4abbd48a43.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-475e0560816dc728.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-294dd2bf963c7f67.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853b

In [5]:
# Display the splits, their columns, and row counts after tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1894
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [6]:
# Record, for every training example, how many token ids its labels and its
# inputs contain.
len_labels, len_inputs = [], []
for row in tokenized_dataset['train']:
    len_labels.append(len(row['labels']))
    len_inputs.append(len(row['input_ids']))

In [7]:
# Mean number of label token ids and mean number of input token ids per
# training example, shown as a (labels, inputs) tuple.
sum(len_labels)/len(len_labels), sum(len_inputs)/len(len_inputs)

(97.55227032734952, 30.0)

In [8]:
# Raw text of the first training example. Index the row first, then the
# column, so only one row is read instead of the whole 'content' column.
splitted_dataset['train'][0]['content']

'Tapi enaknya akun baru Shopee itu belum ada biaya adminnya 😂 meskipun masih nyeseq ga bisa dibuka https://t.co/3pQxvTTlnI'

In [9]:
# Decode the first training example's input ids back to text to eyeball the
# preprocessing. Row-first indexing avoids loading the whole 'input_ids' column.
tokenizer.decode(tokenized_dataset['train'][0]['input_ids'])

'tapi enak akun baru shope itu belum ada biaya adminya meski masih nyeseq ga bisa buka</s><pad><pad><pad><pad><pad><pad><pad>'

In [10]:
# Decode the first training example's label ids back to text to check the
# target format. Row-first indexing avoids loading the whole 'labels' column.
tokenizer.decode(tokenized_dataset['train'][0]['labels'])

'(shopee,akun baru shopee,belum ada biaya adminnya,positive,payment)</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

# Model Training

In [32]:
# from transformers import Seq2SeqTrainer
# from transformers import DataCollatorForSeq2Seq

In [11]:
# Load the T5 weights to fine-tune from PRETRAINED_MODEL.
# NOTE(review): the tokenizer comes from TOKENIZER_PATH, not from this
# checkpoint -- presumably both share the same vocabulary; confirm.
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)
# BART alternative (disabled), including the embedding resize it would need:
#model = BartForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)
#model.resize_token_embeddings(len(tokenizer))

In [33]:
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# trainer = Seq2SeqTrainer(
#     model,
#     training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     data_collator=data_collator,
# )

# trainer.train()

In [12]:
# Training arguments for the quintuplet fine-tuning run.
# Evaluation and checkpointing both happen once per epoch, which is what
# `load_best_model_at_end` requires (the two strategies must match).
training_args = Seq2SeqTrainingArguments(
    SAVE_PATH,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.0003,
    #weight_decay=0.01,
    # Stored on the arguments object only; Trainer does not act on it by
    # itself. It matters only if FineTuner reads it -- TODO confirm.
    resume_from_checkpoint=True,
    num_train_epochs=10,
    # The best checkpoint is always retained alongside the most recent one.
    save_total_limit=2,
    # Validation loss bottoms out after a few epochs and then climbs while the
    # training loss keeps falling, so restore the lowest-eval-loss checkpoint
    # at the end instead of keeping the final-epoch weights.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    seed=42
)

In [13]:
# Hand the model, tokenizer, output path, and both tokenized splits to the
# project's FineTuner.
finetuner = FineTuner(
    model=model,
    save_path=SAVE_PATH,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

cuda:0


In [14]:
# Start fine-tuning with `training_args`. The training loop itself lives in
# the project's FineTuner.fine_tune and is not visible in this notebook.
finetuner.fine_tune(training_args)

  0%|          | 0/2370 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
 10%|█         | 237/2370 [00:56<07:41,  4.62it/s]

{'eval_loss': 0.2741573750972748, 'eval_runtime': 0.5273, 'eval_samples_per_second': 189.638, 'eval_steps_per_second': 24.653, 'epoch': 1.0}


                                                    
 20%|██        | 474/2370 [02:10<07:24,  4.26it/s]

{'eval_loss': 0.24826639890670776, 'eval_runtime': 0.5883, 'eval_samples_per_second': 169.978, 'eval_steps_per_second': 22.097, 'epoch': 2.0}


 21%|██        | 500/2370 [02:39<08:09,  3.82it/s]  

{'loss': 2.1959, 'learning_rate': 0.00023670886075949365, 'epoch': 2.11}


                                                  
 30%|███       | 711/2370 [03:28<05:56,  4.65it/s]

{'eval_loss': 0.24687694013118744, 'eval_runtime': 0.5207, 'eval_samples_per_second': 192.034, 'eval_steps_per_second': 24.964, 'epoch': 3.0}


                                                    
 40%|████      | 948/2370 [04:46<05:09,  4.60it/s]

{'eval_loss': 0.2510281205177307, 'eval_runtime': 0.5293, 'eval_samples_per_second': 188.93, 'eval_steps_per_second': 24.561, 'epoch': 4.0}


 42%|████▏     | 1000/2370 [05:22<05:05,  4.48it/s] 

{'loss': 0.102, 'learning_rate': 0.00017341772151898733, 'epoch': 4.22}


                                                   
 50%|█████     | 1185/2370 [06:04<04:14,  4.66it/s]

{'eval_loss': 0.27470269799232483, 'eval_runtime': 0.5321, 'eval_samples_per_second': 187.919, 'eval_steps_per_second': 24.43, 'epoch': 5.0}


                                                     
 60%|██████    | 1422/2370 [07:23<03:23,  4.66it/s]

{'eval_loss': 0.2975703477859497, 'eval_runtime': 0.537, 'eval_samples_per_second': 186.215, 'eval_steps_per_second': 24.208, 'epoch': 6.0}


 63%|██████▎   | 1500/2370 [08:07<07:44,  1.87it/s]  

{'loss': 0.0553, 'learning_rate': 0.000110126582278481, 'epoch': 6.33}


                                                   
 70%|███████   | 1659/2370 [08:45<02:32,  4.65it/s]

{'eval_loss': 0.31456196308135986, 'eval_runtime': 0.5408, 'eval_samples_per_second': 184.909, 'eval_steps_per_second': 24.038, 'epoch': 7.0}


                                                     
 80%|████████  | 1896/2370 [10:06<01:48,  4.36it/s]

{'eval_loss': 0.33110669255256653, 'eval_runtime': 0.588, 'eval_samples_per_second': 170.058, 'eval_steps_per_second': 22.107, 'epoch': 8.0}


 84%|████████▍ | 2000/2370 [10:53<01:21,  4.51it/s]

{'loss': 0.0325, 'learning_rate': 4.683544303797468e-05, 'epoch': 8.44}


                                                   
 90%|█████████ | 2133/2370 [11:24<00:50,  4.67it/s]

{'eval_loss': 0.35090363025665283, 'eval_runtime': 0.5259, 'eval_samples_per_second': 190.143, 'eval_steps_per_second': 24.719, 'epoch': 9.0}


                                                   
100%|██████████| 2370/2370 [12:45<00:00,  4.72it/s]

{'eval_loss': 0.35784682631492615, 'eval_runtime': 0.5221, 'eval_samples_per_second': 191.525, 'eval_steps_per_second': 24.898, 'epoch': 10.0}


100%|██████████| 2370/2370 [13:11<00:00,  2.99it/s]


{'train_runtime': 791.6358, 'train_samples_per_second': 23.925, 'train_steps_per_second': 2.994, 'train_loss': 0.5070209623892096, 'epoch': 10.0}


In [15]:
# Write the in-memory model to SAVE_PATH.
# NOTE(review): this saves whatever weights `model` holds once fine_tune has
# returned; check the per-epoch eval_loss log to confirm these are the
# weights you want to keep.
model.save_pretrained(SAVE_PATH)

# Training Pipeline

In [2]:
# Sweep over the four preprocessing variants (p00..p03): for each one,
# tokenize the dataset, split it 80/20, and fine-tune a fresh copy of the
# base model.

#constant
SAVE_PATH = '../models/preprocess_T5'
PRETRAINED_MODEL = "Wikidepia/IndoT5-base"
DATA_PATH = '../Data/quadruplet/quadruplet_annottated_sample_dataset_clean.csv'
#variable
max_length = 128
text_col = 'content'
label_col = 'quadruplet'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# Hyperparameters shared by every experiment. The output directory is
# deliberately NOT part of this dict: it is set per experiment below so the
# runs never share a checkpoint folder.
TRAINING_KWARGS = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # Stored on the arguments object only; Trainer does not act on it by
    # itself. It matters only if FineTuner reads it -- TODO confirm.
    resume_from_checkpoint=True,
    num_train_epochs=20,
    save_total_limit=2,
)
#read dataset
raw_dataset = load_dataset('csv', data_files=DATA_PATH)
#model training
for i in range(4):
    preprocess_type = f'p0{i}'
    experiment_path = f"{SAVE_PATH}_{preprocess_type}"
    print(f"[INFO] preprocessing dataset for experiment {preprocess_type}")
    #preprocess dataset
    preprocessor = Preprocessor(preprocess_type, tokenizer, max_length, text_col, label_col)
    tokenized_dataset = raw_dataset.map(preprocessor.preprocess_dataset, batched=True, remove_columns=raw_dataset['train'].column_names)
    splitted_dataset = tokenized_dataset['train'].train_test_split(test_size=0.2, seed=42)
    print(f"[INFO] training model for experiment {preprocess_type}")
    # Each experiment gets its own output_dir. With a single shared directory
    # and save_total_limit=2, the experiments would rotate out each other's
    # checkpoints, and a later run could pick up an earlier run's checkpoint.
    training_args = Seq2SeqTrainingArguments(experiment_path, **TRAINING_KWARGS)
    # Reload the base weights so no experiment starts from a previous one.
    model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)
    finetuner = FineTuner(model=model, save_path=experiment_path, tokenizer=tokenizer,
                          train_dataset=splitted_dataset['train'], eval_dataset=splitted_dataset['test'])
    finetuner.fine_tune(training_args)

Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-564c7144e159da05/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 420.36it/s]
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-564c7144e159da05\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0619265f74d98b9e.arrow
Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-564c7144e159da05\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-5d08aae3a84c90c6.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-564c7144e159da05\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4bae551a5fe5658f.arrow


[INFO] preprocessing dataset for experiment p0
[INFO] training model for experiment p0




cuda:0
cuda:0


  0%|          | 0/2260 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
  5%|▌         | 113/2260 [00:21<06:26,  5.56it/s]

{'eval_loss': 2.2919058799743652, 'eval_runtime': 1.0607, 'eval_samples_per_second': 213.077, 'eval_steps_per_second': 27.342, 'epoch': 1.0}


                                                    
 10%|█         | 226/2260 [01:01<05:36,  6.04it/s]

{'eval_loss': 1.6935251951217651, 'eval_runtime': 1.0732, 'eval_samples_per_second': 210.577, 'eval_steps_per_second': 27.021, 'epoch': 2.0}


                                                    
 15%|█▌        | 339/2260 [01:49<05:20,  5.99it/s]

{'eval_loss': 1.5075335502624512, 'eval_runtime': 1.0707, 'eval_samples_per_second': 211.073, 'eval_steps_per_second': 27.085, 'epoch': 3.0}


                                                    
 20%|██        | 452/2260 [02:31<05:03,  5.95it/s]

{'eval_loss': 1.3850293159484863, 'eval_runtime': 1.2142, 'eval_samples_per_second': 186.125, 'eval_steps_per_second': 23.883, 'epoch': 4.0}


 22%|██▏       | 501/2260 [03:02<05:13,  5.61it/s]  

{'loss': 2.05, 'learning_rate': 1.5575221238938054e-05, 'epoch': 4.42}


                                                  
 25%|██▌       | 565/2260 [03:16<05:58,  4.72it/s]

{'eval_loss': 1.2826519012451172, 'eval_runtime': 1.2596, 'eval_samples_per_second': 179.425, 'eval_steps_per_second': 23.024, 'epoch': 5.0}


                                                    
 30%|███       | 678/2260 [04:02<04:11,  6.29it/s]

{'eval_loss': 1.204469919204712, 'eval_runtime': 1.0775, 'eval_samples_per_second': 209.749, 'eval_steps_per_second': 26.915, 'epoch': 6.0}


                                                    
 35%|███▌      | 791/2260 [04:47<04:13,  5.80it/s]

{'eval_loss': 1.1226141452789307, 'eval_runtime': 1.0959, 'eval_samples_per_second': 206.225, 'eval_steps_per_second': 26.463, 'epoch': 7.0}


                                                    
 40%|████      | 904/2260 [05:30<03:48,  5.95it/s]

{'eval_loss': 1.0495717525482178, 'eval_runtime': 1.0682, 'eval_samples_per_second': 211.579, 'eval_steps_per_second': 27.149, 'epoch': 8.0}


 44%|████▍     | 1001/2260 [06:10<03:44,  5.62it/s] 

{'loss': 1.045, 'learning_rate': 1.1150442477876106e-05, 'epoch': 8.85}


                                                   
 45%|████▌     | 1017/2260 [06:13<03:23,  6.12it/s]

{'eval_loss': 0.9821604490280151, 'eval_runtime': 1.0893, 'eval_samples_per_second': 207.474, 'eval_steps_per_second': 26.623, 'epoch': 9.0}


                                                     
 50%|█████     | 1130/2260 [06:56<03:08,  6.00it/s]

{'eval_loss': 0.9229294061660767, 'eval_runtime': 1.0675, 'eval_samples_per_second': 211.715, 'eval_steps_per_second': 27.167, 'epoch': 10.0}


                                                     
 55%|█████▌    | 1243/2260 [07:41<02:40,  6.33it/s]

{'eval_loss': 0.8679027557373047, 'eval_runtime': 1.0689, 'eval_samples_per_second': 211.441, 'eval_steps_per_second': 27.132, 'epoch': 11.0}


                                                     
 60%|██████    | 1356/2260 [08:28<02:28,  6.08it/s]

{'eval_loss': 0.8150554895401001, 'eval_runtime': 1.0805, 'eval_samples_per_second': 209.163, 'eval_steps_per_second': 26.839, 'epoch': 12.0}


                                                     
 65%|██████▌   | 1469/2260 [09:12<02:17,  5.76it/s]

{'eval_loss': 0.7708607316017151, 'eval_runtime': 1.0873, 'eval_samples_per_second': 207.849, 'eval_steps_per_second': 26.671, 'epoch': 13.0}


 66%|██████▋   | 1501/2260 [09:40<02:18,  5.47it/s]  

{'loss': 0.7085, 'learning_rate': 6.72566371681416e-06, 'epoch': 13.27}


                                                   
 70%|███████   | 1582/2260 [09:54<02:02,  5.54it/s]

{'eval_loss': 0.7399626970291138, 'eval_runtime': 1.1075, 'eval_samples_per_second': 204.063, 'eval_steps_per_second': 26.185, 'epoch': 14.0}


                                                     
 75%|███████▌  | 1695/2260 [10:42<01:31,  6.21it/s]

{'eval_loss': 0.7208396196365356, 'eval_runtime': 1.0364, 'eval_samples_per_second': 218.056, 'eval_steps_per_second': 27.981, 'epoch': 15.0}


                                                     
 80%|████████  | 1808/2260 [11:25<01:28,  5.13it/s]

{'eval_loss': 0.709347128868103, 'eval_runtime': 1.1228, 'eval_samples_per_second': 201.283, 'eval_steps_per_second': 25.828, 'epoch': 16.0}


                                                   
 85%|████████▌ | 1921/2260 [12:09<01:02,  5.39it/s]

{'eval_loss': 0.7004119753837585, 'eval_runtime': 1.1687, 'eval_samples_per_second': 193.384, 'eval_steps_per_second': 24.815, 'epoch': 17.0}


 89%|████████▊ | 2001/2260 [12:49<00:48,  5.39it/s]

{'loss': 0.5465, 'learning_rate': 2.3008849557522127e-06, 'epoch': 17.7}


                                                   
 90%|█████████ | 2034/2260 [12:56<00:35,  6.36it/s]

{'eval_loss': 0.6968504190444946, 'eval_runtime': 1.0491, 'eval_samples_per_second': 215.42, 'eval_steps_per_second': 27.642, 'epoch': 18.0}


                                                   
 95%|█████████▌| 2147/2260 [13:40<00:18,  6.25it/s]

{'eval_loss': 0.6941794157028198, 'eval_runtime': 1.0858, 'eval_samples_per_second': 208.147, 'eval_steps_per_second': 26.709, 'epoch': 19.0}


                                                   
100%|██████████| 2260/2260 [14:22<00:00,  5.82it/s]

{'eval_loss': 0.6934356689453125, 'eval_runtime': 1.0919, 'eval_samples_per_second': 206.971, 'eval_steps_per_second': 26.558, 'epoch': 20.0}


100%|██████████| 2260/2260 [14:48<00:00,  2.54it/s]


{'train_runtime': 888.7706, 'train_samples_per_second': 20.32, 'train_steps_per_second': 2.543, 'train_loss': 1.0219785707186808, 'epoch': 20.0}
[INFO] preprocessing dataset for experiment p1


                                                                 

[INFO] training model for experiment p1




cuda:0
cuda:0


  5%|▌         | 113/2260 [00:18<05:34,  6.42it/s]
  5%|▌         | 113/2260 [00:20<05:34,  6.42it/s]

{'eval_loss': 2.2919058799743652, 'eval_runtime': 1.0349, 'eval_samples_per_second': 218.388, 'eval_steps_per_second': 28.023, 'epoch': 1.0}


 10%|█         | 226/2260 [01:01<05:29,  6.18it/s]  
 10%|█         | 226/2260 [01:02<05:29,  6.18it/s]

{'eval_loss': 1.6935251951217651, 'eval_runtime': 1.0327, 'eval_samples_per_second': 218.839, 'eval_steps_per_second': 28.081, 'epoch': 2.0}


 15%|█▌        | 339/2260 [01:46<05:30,  5.81it/s]  
 15%|█▌        | 339/2260 [01:47<05:30,  5.81it/s]

{'eval_loss': 1.5075335502624512, 'eval_runtime': 1.1001, 'eval_samples_per_second': 205.441, 'eval_steps_per_second': 26.362, 'epoch': 3.0}


 20%|██        | 452/2260 [02:33<05:25,  5.55it/s]  
 20%|██        | 452/2260 [02:34<05:25,  5.55it/s]

{'eval_loss': 1.3850293159484863, 'eval_runtime': 1.1684, 'eval_samples_per_second': 193.431, 'eval_steps_per_second': 24.821, 'epoch': 4.0}


 22%|██▏       | 501/2260 [03:06<05:20,  5.48it/s]  

{'loss': 2.05, 'learning_rate': 1.5575221238938054e-05, 'epoch': 4.42}


 25%|██▌       | 565/2260 [03:17<04:33,  6.19it/s]
 25%|██▌       | 565/2260 [03:18<04:33,  6.19it/s]

{'eval_loss': 1.2826519012451172, 'eval_runtime': 1.0303, 'eval_samples_per_second': 219.356, 'eval_steps_per_second': 28.147, 'epoch': 5.0}


 30%|███       | 678/2260 [04:01<04:19,  6.10it/s]  
 30%|███       | 678/2260 [04:02<04:19,  6.10it/s]

{'eval_loss': 1.204469919204712, 'eval_runtime': 1.1313, 'eval_samples_per_second': 199.773, 'eval_steps_per_second': 25.635, 'epoch': 6.0}


 35%|███▌      | 791/2260 [04:47<03:59,  6.13it/s]  
 35%|███▌      | 791/2260 [04:48<03:59,  6.13it/s]

{'eval_loss': 1.1226141452789307, 'eval_runtime': 1.0519, 'eval_samples_per_second': 214.851, 'eval_steps_per_second': 27.569, 'epoch': 7.0}


 40%|████      | 904/2260 [05:30<03:56,  5.73it/s]  
 40%|████      | 904/2260 [05:31<03:56,  5.73it/s]

{'eval_loss': 1.0495717525482178, 'eval_runtime': 1.0573, 'eval_samples_per_second': 213.749, 'eval_steps_per_second': 27.428, 'epoch': 8.0}


 44%|████▍     | 1001/2260 [06:11<03:51,  5.44it/s] 

{'loss': 1.045, 'learning_rate': 1.1150442477876106e-05, 'epoch': 8.85}


 45%|████▌     | 1017/2260 [06:14<03:29,  5.93it/s]
 45%|████▌     | 1017/2260 [06:15<03:29,  5.93it/s]

{'eval_loss': 0.9821604490280151, 'eval_runtime': 1.0913, 'eval_samples_per_second': 207.1, 'eval_steps_per_second': 26.575, 'epoch': 9.0}


 50%|█████     | 1130/2260 [07:00<03:09,  5.95it/s]  
 50%|█████     | 1130/2260 [07:01<03:09,  5.95it/s]

{'eval_loss': 0.9229294061660767, 'eval_runtime': 1.056, 'eval_samples_per_second': 214.025, 'eval_steps_per_second': 27.463, 'epoch': 10.0}


 55%|█████▌    | 1243/2260 [07:44<02:51,  5.92it/s]  
 55%|█████▌    | 1243/2260 [07:46<02:51,  5.92it/s]

{'eval_loss': 0.8679027557373047, 'eval_runtime': 1.0721, 'eval_samples_per_second': 210.809, 'eval_steps_per_second': 27.051, 'epoch': 11.0}


 60%|██████    | 1356/2260 [08:26<02:27,  6.14it/s]  
 60%|██████    | 1356/2260 [08:27<02:27,  6.14it/s]

{'eval_loss': 0.8150554895401001, 'eval_runtime': 1.0697, 'eval_samples_per_second': 211.267, 'eval_steps_per_second': 27.11, 'epoch': 12.0}


 65%|██████▌   | 1469/2260 [09:15<02:14,  5.90it/s]  
 65%|██████▌   | 1469/2260 [09:16<02:14,  5.90it/s]

{'eval_loss': 0.7708607316017151, 'eval_runtime': 1.0648, 'eval_samples_per_second': 212.245, 'eval_steps_per_second': 27.235, 'epoch': 13.0}


 66%|██████▋   | 1501/2260 [09:44<02:16,  5.54it/s]  

{'loss': 0.7085, 'learning_rate': 6.72566371681416e-06, 'epoch': 13.27}


 70%|███████   | 1582/2260 [09:58<01:57,  5.76it/s]
 70%|███████   | 1582/2260 [09:59<01:57,  5.76it/s]

{'eval_loss': 0.7399626970291138, 'eval_runtime': 1.0733, 'eval_samples_per_second': 210.569, 'eval_steps_per_second': 27.02, 'epoch': 14.0}


 75%|███████▌  | 1695/2260 [10:40<01:36,  5.86it/s]  
 75%|███████▌  | 1695/2260 [10:41<01:36,  5.86it/s]

{'eval_loss': 0.7208396196365356, 'eval_runtime': 1.0949, 'eval_samples_per_second': 206.409, 'eval_steps_per_second': 26.486, 'epoch': 15.0}


 80%|████████  | 1808/2260 [11:30<01:14,  6.07it/s]  
 80%|████████  | 1808/2260 [11:31<01:14,  6.07it/s]

{'eval_loss': 0.709347128868103, 'eval_runtime': 1.0885, 'eval_samples_per_second': 207.623, 'eval_steps_per_second': 26.642, 'epoch': 16.0}


 85%|████████▌ | 1921/2260 [12:13<00:58,  5.76it/s]
 85%|████████▌ | 1921/2260 [12:14<00:58,  5.76it/s]

{'eval_loss': 0.7004119753837585, 'eval_runtime': 1.0818, 'eval_samples_per_second': 208.912, 'eval_steps_per_second': 26.807, 'epoch': 17.0}


 89%|████████▊ | 2001/2260 [12:53<00:48,  5.38it/s]

{'loss': 0.5465, 'learning_rate': 2.3008849557522127e-06, 'epoch': 17.7}


 90%|█████████ | 2034/2260 [12:59<00:37,  6.04it/s]
 90%|█████████ | 2034/2260 [13:00<00:37,  6.04it/s]

{'eval_loss': 0.6968504190444946, 'eval_runtime': 1.1267, 'eval_samples_per_second': 200.579, 'eval_steps_per_second': 25.738, 'epoch': 18.0}


 95%|█████████▌| 2147/2260 [13:48<00:18,  6.08it/s]
 95%|█████████▌| 2147/2260 [13:49<00:18,  6.08it/s]

{'eval_loss': 0.6941794157028198, 'eval_runtime': 1.0729, 'eval_samples_per_second': 210.637, 'eval_steps_per_second': 27.029, 'epoch': 19.0}


100%|██████████| 2260/2260 [14:31<00:00,  6.03it/s]
100%|██████████| 2260/2260 [14:32<00:00,  6.03it/s]

{'eval_loss': 0.6934356689453125, 'eval_runtime': 1.0964, 'eval_samples_per_second': 206.125, 'eval_steps_per_second': 26.45, 'epoch': 20.0}


100%|██████████| 2260/2260 [14:54<00:00,  2.53it/s]


{'train_runtime': 894.7418, 'train_samples_per_second': 20.185, 'train_steps_per_second': 2.526, 'train_loss': 1.0219785707186808, 'epoch': 20.0}
[INFO] preprocessing dataset for experiment p2


                                                                 

[INFO] training model for experiment p2




cuda:0
cuda:0


  5%|▌         | 113/2260 [00:19<05:43,  6.24it/s]
  5%|▌         | 113/2260 [00:20<05:43,  6.24it/s]

{'eval_loss': 2.2919058799743652, 'eval_runtime': 1.029, 'eval_samples_per_second': 219.637, 'eval_steps_per_second': 28.184, 'epoch': 1.0}


 10%|█         | 226/2260 [01:07<06:09,  5.50it/s]  
 10%|█         | 226/2260 [01:08<06:09,  5.50it/s]

{'eval_loss': 1.6935251951217651, 'eval_runtime': 1.2246, 'eval_samples_per_second': 184.554, 'eval_steps_per_second': 23.682, 'epoch': 2.0}


 15%|█▌        | 339/2260 [01:50<05:28,  5.84it/s]  
 15%|█▌        | 339/2260 [01:51<05:28,  5.84it/s]

{'eval_loss': 1.5075335502624512, 'eval_runtime': 1.0791, 'eval_samples_per_second': 209.424, 'eval_steps_per_second': 26.873, 'epoch': 3.0}


 20%|██        | 452/2260 [02:34<05:10,  5.83it/s]  
 20%|██        | 452/2260 [02:35<05:10,  5.83it/s]

{'eval_loss': 1.3850293159484863, 'eval_runtime': 1.072, 'eval_samples_per_second': 210.814, 'eval_steps_per_second': 27.051, 'epoch': 4.0}


 22%|██▏       | 501/2260 [03:11<05:29,  5.34it/s]  

{'loss': 2.05, 'learning_rate': 1.5575221238938054e-05, 'epoch': 4.42}


 25%|██▌       | 565/2260 [03:23<05:08,  5.49it/s]
 25%|██▌       | 565/2260 [03:24<05:08,  5.49it/s]

{'eval_loss': 1.2826519012451172, 'eval_runtime': 1.1115, 'eval_samples_per_second': 203.333, 'eval_steps_per_second': 26.091, 'epoch': 5.0}


 30%|███       | 678/2260 [04:07<04:32,  5.80it/s]  
 30%|███       | 678/2260 [04:09<04:32,  5.80it/s]

{'eval_loss': 1.204469919204712, 'eval_runtime': 1.1775, 'eval_samples_per_second': 191.932, 'eval_steps_per_second': 24.628, 'epoch': 6.0}


 35%|███▌      | 791/2260 [04:50<04:11,  5.84it/s]  
 35%|███▌      | 791/2260 [04:51<04:11,  5.84it/s]

{'eval_loss': 1.1226141452789307, 'eval_runtime': 1.0925, 'eval_samples_per_second': 206.857, 'eval_steps_per_second': 26.544, 'epoch': 7.0}


 40%|████      | 904/2260 [05:37<03:55,  5.75it/s]  
 40%|████      | 904/2260 [05:38<03:55,  5.75it/s]

{'eval_loss': 1.0495717525482178, 'eval_runtime': 1.1311, 'eval_samples_per_second': 199.809, 'eval_steps_per_second': 25.639, 'epoch': 8.0}


 44%|████▍     | 1001/2260 [06:18<03:47,  5.54it/s] 

{'loss': 1.045, 'learning_rate': 1.1150442477876106e-05, 'epoch': 8.85}


 45%|████▌     | 1017/2260 [06:21<03:30,  5.90it/s]
 45%|████▌     | 1017/2260 [06:22<03:30,  5.90it/s]

{'eval_loss': 0.9821604490280151, 'eval_runtime': 1.1075, 'eval_samples_per_second': 204.06, 'eval_steps_per_second': 26.185, 'epoch': 9.0}


 50%|█████     | 1130/2260 [07:06<03:13,  5.84it/s]  
 50%|█████     | 1130/2260 [07:07<03:13,  5.84it/s]

{'eval_loss': 0.9229294061660767, 'eval_runtime': 1.1007, 'eval_samples_per_second': 205.333, 'eval_steps_per_second': 26.348, 'epoch': 10.0}


 55%|█████▌    | 1243/2260 [07:52<03:02,  5.59it/s]  
 55%|█████▌    | 1243/2260 [07:53<03:02,  5.59it/s]

{'eval_loss': 0.8679027557373047, 'eval_runtime': 1.1222, 'eval_samples_per_second': 201.396, 'eval_steps_per_second': 25.843, 'epoch': 11.0}


 60%|██████    | 1356/2260 [08:38<02:44,  5.51it/s]  
 60%|██████    | 1356/2260 [08:39<02:44,  5.51it/s]

{'eval_loss': 0.8150554895401001, 'eval_runtime': 1.1029, 'eval_samples_per_second': 204.92, 'eval_steps_per_second': 26.295, 'epoch': 12.0}


 65%|██████▌   | 1469/2260 [09:22<02:14,  5.87it/s]  
 65%|██████▌   | 1469/2260 [09:23<02:14,  5.87it/s]

{'eval_loss': 0.7708607316017151, 'eval_runtime': 1.0803, 'eval_samples_per_second': 209.208, 'eval_steps_per_second': 26.845, 'epoch': 13.0}


 66%|██████▋   | 1501/2260 [09:55<02:19,  5.44it/s]  

{'loss': 0.7085, 'learning_rate': 6.72566371681416e-06, 'epoch': 13.27}


 70%|███████   | 1582/2260 [10:09<01:59,  5.65it/s]
 70%|███████   | 1582/2260 [10:10<01:59,  5.65it/s]

{'eval_loss': 0.7399626970291138, 'eval_runtime': 1.1943, 'eval_samples_per_second': 189.231, 'eval_steps_per_second': 24.282, 'epoch': 14.0}


 75%|███████▌  | 1695/2260 [10:56<01:33,  6.04it/s]  
 75%|███████▌  | 1695/2260 [10:57<01:33,  6.04it/s]

{'eval_loss': 0.7208396196365356, 'eval_runtime': 1.1384, 'eval_samples_per_second': 198.532, 'eval_steps_per_second': 25.475, 'epoch': 15.0}


 80%|████████  | 1808/2260 [11:39<01:20,  5.63it/s]  
 80%|████████  | 1808/2260 [11:40<01:20,  5.63it/s]

{'eval_loss': 0.709347128868103, 'eval_runtime': 1.1043, 'eval_samples_per_second': 204.658, 'eval_steps_per_second': 26.261, 'epoch': 16.0}


 85%|████████▌ | 1921/2260 [12:26<01:01,  5.52it/s]  
 85%|████████▌ | 1921/2260 [12:27<01:01,  5.52it/s]

{'eval_loss': 0.7004119753837585, 'eval_runtime': 1.1255, 'eval_samples_per_second': 200.795, 'eval_steps_per_second': 25.766, 'epoch': 17.0}


 89%|████████▊ | 2001/2260 [13:05<00:48,  5.36it/s]

{'loss': 0.5465, 'learning_rate': 2.3008849557522127e-06, 'epoch': 17.7}


 90%|█████████ | 2034/2260 [13:11<00:38,  5.89it/s]
 90%|█████████ | 2034/2260 [13:12<00:38,  5.89it/s]

{'eval_loss': 0.6968504190444946, 'eval_runtime': 1.1239, 'eval_samples_per_second': 201.085, 'eval_steps_per_second': 25.803, 'epoch': 18.0}


 95%|█████████▌| 2147/2260 [13:56<00:19,  5.94it/s]
 95%|█████████▌| 2147/2260 [13:57<00:19,  5.94it/s]

{'eval_loss': 0.6941794157028198, 'eval_runtime': 1.1049, 'eval_samples_per_second': 204.537, 'eval_steps_per_second': 26.246, 'epoch': 19.0}


100%|██████████| 2260/2260 [14:43<00:00,  5.82it/s]
100%|██████████| 2260/2260 [14:44<00:00,  5.82it/s]

{'eval_loss': 0.6934356689453125, 'eval_runtime': 1.1093, 'eval_samples_per_second': 203.73, 'eval_steps_per_second': 26.142, 'epoch': 20.0}


100%|██████████| 2260/2260 [15:08<00:00,  2.49it/s]


{'train_runtime': 909.6609, 'train_samples_per_second': 19.854, 'train_steps_per_second': 2.484, 'train_loss': 1.0219785707186808, 'epoch': 20.0}
[INFO] preprocessing dataset for experiment p3


                                                                 

[INFO] training model for experiment p3




cuda:0
cuda:0


  5%|▌         | 113/2260 [00:20<05:55,  6.04it/s]
  5%|▌         | 113/2260 [00:21<05:55,  6.04it/s]

{'eval_loss': 2.2919058799743652, 'eval_runtime': 1.0951, 'eval_samples_per_second': 206.369, 'eval_steps_per_second': 26.481, 'epoch': 1.0}


 10%|█         | 226/2260 [01:03<05:43,  5.93it/s]  
 10%|█         | 226/2260 [01:04<05:43,  5.93it/s]

{'eval_loss': 1.6935251951217651, 'eval_runtime': 1.0952, 'eval_samples_per_second': 206.348, 'eval_steps_per_second': 26.478, 'epoch': 2.0}


 15%|█▌        | 339/2260 [01:50<05:31,  5.79it/s]  
 15%|█▌        | 339/2260 [01:51<05:31,  5.79it/s]

{'eval_loss': 1.5075335502624512, 'eval_runtime': 1.099, 'eval_samples_per_second': 205.641, 'eval_steps_per_second': 26.388, 'epoch': 3.0}


 20%|██        | 452/2260 [02:36<05:14,  5.74it/s]  
 20%|██        | 452/2260 [02:37<05:14,  5.74it/s]

{'eval_loss': 1.3850293159484863, 'eval_runtime': 1.0881, 'eval_samples_per_second': 207.701, 'eval_steps_per_second': 26.652, 'epoch': 4.0}


 22%|██▏       | 501/2260 [03:10<05:24,  5.42it/s]  

{'loss': 2.05, 'learning_rate': 1.5575221238938054e-05, 'epoch': 4.42}


 25%|██▌       | 565/2260 [03:21<04:48,  5.87it/s]
 25%|██▌       | 565/2260 [03:22<04:48,  5.87it/s]

{'eval_loss': 1.2826519012451172, 'eval_runtime': 1.0873, 'eval_samples_per_second': 207.858, 'eval_steps_per_second': 26.672, 'epoch': 5.0}


 30%|███       | 678/2260 [04:07<04:25,  5.96it/s]  
 30%|███       | 678/2260 [04:08<04:25,  5.96it/s]

{'eval_loss': 1.204469919204712, 'eval_runtime': 1.0891, 'eval_samples_per_second': 207.51, 'eval_steps_per_second': 26.627, 'epoch': 6.0}


 35%|███▌      | 791/2260 [04:54<04:11,  5.84it/s]  
 35%|███▌      | 791/2260 [04:55<04:11,  5.84it/s]

{'eval_loss': 1.1226141452789307, 'eval_runtime': 1.1001, 'eval_samples_per_second': 205.434, 'eval_steps_per_second': 26.361, 'epoch': 7.0}


 40%|████      | 904/2260 [05:39<03:52,  5.84it/s]  
 40%|████      | 904/2260 [05:40<03:52,  5.84it/s]

{'eval_loss': 1.0495717525482178, 'eval_runtime': 1.0918, 'eval_samples_per_second': 207.004, 'eval_steps_per_second': 26.563, 'epoch': 8.0}


 44%|████▍     | 1001/2260 [06:21<03:59,  5.26it/s] 

{'loss': 1.045, 'learning_rate': 1.1150442477876106e-05, 'epoch': 8.85}


 45%|████▌     | 1017/2260 [06:24<03:37,  5.71it/s]
 45%|████▌     | 1017/2260 [06:25<03:37,  5.71it/s]

{'eval_loss': 0.9821604490280151, 'eval_runtime': 1.1376, 'eval_samples_per_second': 198.67, 'eval_steps_per_second': 25.493, 'epoch': 9.0}


 50%|█████     | 1130/2260 [07:08<03:25,  5.49it/s]  
 50%|█████     | 1130/2260 [07:09<03:25,  5.49it/s]

{'eval_loss': 0.9229294061660767, 'eval_runtime': 1.1407, 'eval_samples_per_second': 198.123, 'eval_steps_per_second': 25.423, 'epoch': 10.0}


 55%|█████▌    | 1243/2260 [07:53<02:49,  5.99it/s]  
 55%|█████▌    | 1243/2260 [07:54<02:49,  5.99it/s]

{'eval_loss': 0.8679027557373047, 'eval_runtime': 1.0878, 'eval_samples_per_second': 207.766, 'eval_steps_per_second': 26.66, 'epoch': 11.0}


 60%|██████    | 1356/2260 [08:40<02:26,  6.17it/s]  
 60%|██████    | 1356/2260 [08:42<02:26,  6.17it/s]

{'eval_loss': 0.8150554895401001, 'eval_runtime': 1.1085, 'eval_samples_per_second': 203.877, 'eval_steps_per_second': 26.161, 'epoch': 12.0}


 65%|██████▌   | 1469/2260 [09:24<02:12,  5.98it/s]  
 65%|██████▌   | 1469/2260 [09:26<02:12,  5.98it/s]

{'eval_loss': 0.7708607316017151, 'eval_runtime': 1.0683, 'eval_samples_per_second': 211.542, 'eval_steps_per_second': 27.145, 'epoch': 13.0}


 66%|██████▋   | 1501/2260 [09:54<02:14,  5.66it/s]  

{'loss': 0.7085, 'learning_rate': 6.72566371681416e-06, 'epoch': 13.27}


 70%|███████   | 1582/2260 [10:08<01:55,  5.90it/s]
 70%|███████   | 1582/2260 [10:09<01:55,  5.90it/s]

{'eval_loss': 0.7399626970291138, 'eval_runtime': 1.0748, 'eval_samples_per_second': 210.267, 'eval_steps_per_second': 26.981, 'epoch': 14.0}


 75%|███████▌  | 1695/2260 [10:51<01:31,  6.16it/s]  
 75%|███████▌  | 1695/2260 [10:52<01:31,  6.16it/s]

{'eval_loss': 0.7208396196365356, 'eval_runtime': 1.0813, 'eval_samples_per_second': 209.009, 'eval_steps_per_second': 26.82, 'epoch': 15.0}


 80%|████████  | 1808/2260 [11:35<01:18,  5.74it/s]  
 80%|████████  | 1808/2260 [11:36<01:18,  5.74it/s]

{'eval_loss': 0.709347128868103, 'eval_runtime': 1.0783, 'eval_samples_per_second': 209.59, 'eval_steps_per_second': 26.894, 'epoch': 16.0}


 85%|████████▌ | 1921/2260 [12:20<00:56,  6.01it/s]
 85%|████████▌ | 1921/2260 [12:21<00:56,  6.01it/s]

{'eval_loss': 0.7004119753837585, 'eval_runtime': 1.0659, 'eval_samples_per_second': 212.024, 'eval_steps_per_second': 27.207, 'epoch': 17.0}


 89%|████████▊ | 2001/2260 [13:03<00:46,  5.57it/s]

{'loss': 0.5465, 'learning_rate': 2.3008849557522127e-06, 'epoch': 17.7}


 90%|█████████ | 2034/2260 [13:09<00:37,  6.01it/s]
 90%|█████████ | 2034/2260 [13:10<00:37,  6.01it/s]

{'eval_loss': 0.6968504190444946, 'eval_runtime': 1.161, 'eval_samples_per_second': 194.665, 'eval_steps_per_second': 24.979, 'epoch': 18.0}


 95%|█████████▌| 2147/2260 [13:52<00:20,  5.63it/s]
 95%|█████████▌| 2147/2260 [13:54<00:20,  5.63it/s]

{'eval_loss': 0.6941794157028198, 'eval_runtime': 1.3898, 'eval_samples_per_second': 162.617, 'eval_steps_per_second': 20.867, 'epoch': 19.0}


100%|██████████| 2260/2260 [14:37<00:00,  5.71it/s]
100%|██████████| 2260/2260 [14:38<00:00,  5.71it/s]

{'eval_loss': 0.6934356689453125, 'eval_runtime': 1.2494, 'eval_samples_per_second': 180.889, 'eval_steps_per_second': 23.211, 'epoch': 20.0}


100%|██████████| 2260/2260 [15:05<00:00,  2.49it/s]


{'train_runtime': 905.9792, 'train_samples_per_second': 19.934, 'train_steps_per_second': 2.495, 'train_loss': 1.0219785707186808, 'epoch': 20.0}


# Inference

In [16]:
from src.inference import ModelInference
from src.preprocessor import Preprocessor
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
)
from datasets import load_dataset

In [2]:
# Constants for the inference section.
max_length = 30                                # passed to Preprocessor as its max_length argument
text_col = 'content'                           # name of the input-text column passed to Preprocessor
label_col = 'postprocess_quintuplet'           # name of the gold-label column passed to Preprocessor
preprocess_type = 'p00'                        # preprocessing variant; also prefixes prediction column names
SAVE_PATH = '../models/quintuplet_t5_MLM-PT'   # directory the inference model is loaded from
PRETRAINED_MODEL = "Wikidepia/IndoT5-base"     # alternative used previously: "../models/pt-indot5-MLM_PT"
TOKENIZER_PATH = "Wikidepia/IndoT5-base"
DATA_PATH = '../Data/quintuplet/quintuplet_postprocessed_1000.csv'
# NOTE(review): the inference cells in this section pass inference_len=512 as a
# literal and never read this constant -- confirm which value is intended.
inference_length = 128

In [3]:
# Load the model weights stored under SAVE_PATH.
model = T5ForConditionalGeneration.from_pretrained(SAVE_PATH)
# Load the tokenizer from the dedicated TOKENIZER_PATH constant, as the training
# section does, so that pointing PRETRAINED_MODEL at a local weights directory
# does not silently change where the vocabulary is read from.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [4]:
# Build the preprocessing helper from the tokenizer and the column settings.
preprocessor = Preprocessor(
    preprocess_type,
    tokenizer,
    max_length,
    text_col,
    label_col,
)

In [5]:
# Read the CSV at DATA_PATH; its rows are accessed through the 'train' split.
raw_dataset = load_dataset('csv', data_files=DATA_PATH)
full_split = raw_dataset['train']

# Hold out 10% of the rows, with a fixed seed so the split is repeatable.
splitted_dataset = full_split.train_test_split(test_size=0.1, seed=42)

# Run the preprocessor over both splits in batches and drop the original CSV columns.
tokenized_dataset = splitted_dataset.map(
    preprocessor.preprocess_dataset,
    batched=True,
    remove_columns=full_split.column_names,
)

Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-e45b5c90cd924547/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 500.04it/s]
Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-e45b5c90cd924547\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-ff1816f6c2c3ba58.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-e45b5c90cd924547\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-64790f2f7fbaf197.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-e45b5c90cd924547\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b27f5c338b779ada.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-e45b5c90cd924547\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853b

## Using the original test data

In [18]:
# Generate predictions for the held-out test split.
model_inference = ModelInference(
    batch_size=8,
    dataset=tokenized_dataset['test'],
    model=model,
    tokenizer=tokenizer,
    inference_len=512,
)
pred_text = model_inference.inference()

100%|██████████| 13/13 [00:07<00:00,  1.84it/s]


In [19]:
# Attach the predictions to the untokenized test rows and export them as CSV.
prediction_column = f'{preprocess_type}_model_prediction'
test_dataset = splitted_dataset['test'].add_column(prediction_column, pred_text)
test_dataset.to_csv('../Data/quintuplet/test_data_predictions.csv')

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 16.52ba/s]


59610

## Create dummy data

In [29]:
# Create the dummy dataset.
# The commented-out code below takes 400 rows from the train split and halves
# them: 200 rows are written to val_data.csv and 200 rows to test_data.csv.
# NOTE(review): presumably kept disabled so the exported CSVs are not regenerated -- confirm.
# dummy_dataset = splitted_dataset['train'].train_test_split(test_size=400, seed=42)
# test_dataset = dummy_dataset['test'].train_test_split(test_size=200, seed=42)
# test_dataset['train'].to_csv('../Data/quintuplet/val_data.csv') 
# test_dataset['test'].to_csv('../Data/quintuplet/test_data.csv') 
#.map(preprocessor.preprocess_dataset, batched=True, remove_columns=raw_dataset['train'].column_names)

Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-2c42dbc24aa38187.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-f5003ba921be7c84.arrow
Loading cached split indices for dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-52fb2de3f9cb2504.arrow and C:\Users\danendra\.cache\huggingface\datasets\csv\default-7c3f713988208aa5\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-29b8e9a42cc5be2e.arrow
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 86.83ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 117.52ba/s]


105907

In [17]:
#raw dataset
val_dataset = load_dataset('csv', data_files='../Data/quintuplet/val_data.csv')
test_dataset = load_dataset('csv', data_files='../Data/quintuplet/test_data.csv')
#tokenized dataset
tokenized_val_dataset = val_dataset.map(preprocessor.preprocess_dataset, batched=True, remove_columns=val_dataset['train'].column_names)
tokenized_test_dataset = test_dataset.map(preprocessor.preprocess_dataset, batched=True, remove_columns=test_dataset['train'].column_names)

Downloading and preparing dataset csv/default to C:/Users/danendra/.cache/huggingface/datasets/csv/default-b8dcae55ca6bd5c3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 51.24it/s]
                                                        

Dataset csv downloaded and prepared to C:/Users/danendra/.cache/huggingface/datasets/csv/default-b8dcae55ca6bd5c3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 49.95it/s]
Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-ae888e11fe36b0d0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 500.81it/s]
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-ae888e11fe36b0d0\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-71127a19e7f0cb1f.arrow


In [18]:
# Generate predictions for the validation rows.
model_inference = ModelInference(
    batch_size=8,
    dataset=tokenized_val_dataset['train'],
    model=model,
    tokenizer=tokenizer,
    inference_len=512,
)
pred_text = model_inference.inference()

  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:17<00:00,  1.47it/s]


In [19]:
# Store this experiment's predictions alongside the validation rows and write
# them back to val_data.csv.
prediction_column = f'{experiment_type}_model_prediction'
val_split = val_dataset['train']
# val_data.csv is both the file this section loads and the file it writes, so on
# a re-run the prediction column is already present and adding it again would
# produce a duplicate column name. Drop the stale column first so the cell can
# be executed repeatedly.
if prediction_column in val_split.column_names:
    val_split = val_split.remove_columns(prediction_column)
val_dataset['train'] = val_split.add_column(prediction_column, pred_text)
val_dataset['train'].to_csv('../Data/quintuplet/val_data.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 34.90ba/s]


509672

## using train data

In [24]:
# Generate predictions for the training split.
model_inference = ModelInference(
    batch_size=8,
    dataset=tokenized_dataset['train'],
    model=model,
    tokenizer=tokenizer,
    inference_len=512,
)
pred_text = model_inference.inference()

  3%|▎         | 8/237 [00:06<02:54,  1.31it/s]


KeyboardInterrupt: 

In [64]:
# Attach the predictions to the untokenized training rows and export them as CSV.
prediction_column = f'{preprocess_type}_model_prediction'
train_dataset = splitted_dataset['train'].add_column(prediction_column, pred_text)
train_dataset.to_csv('../Data/quintuplet/model_1000-data_predictions_train.csv')

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 39.96ba/s]


543439

# Evaluate

In [20]:
import pandas as pd
from src.evaluator import Evaluator
from src.postprocessor import PostProcessor
from src.utils import extract_triplet

In [21]:
# Read the validation rows together with the stored model predictions.
df = pd.read_csv('../Data/quintuplet/val_data.csv')
# Lower-case the gold labels with the vectorized string accessor. Unlike
# apply(lambda x: x.lower()), this leaves a missing label as NaN instead of
# raising AttributeError on the float value.
df['postprocess_quintuplet'] = df['postprocess_quintuplet'].str.lower()
df.head(3)

Unnamed: 0,aoriginal_id,content,final_sentiment,baseline_aspect_category,tweet_type,clean_tweet,label,corrected_label,keterangan,quintuplet_label,postprocess_quintuplet,p00_model_prediction,p01_model_prediction,p02_model_prediction,p03_model_prediction,p02_bart_model_prediction,p20_model_prediction,p21_model_prediction,p22_model_prediction
0,1648822375587381249,"@JNE_ID JT78364921062 ,, Ma,af no resi itu ,, ...",negative,price; produk;,SUBJECTIVE,"JT78364921062 ,, Ma,af no resi itu ,, order d...","(shopee, order, blm sampe ke alamat penerimany...",,,"(shopee, order, blm sampe ke alamat penerimany...","(shopee,order,blm sampe ke alamat penerimanya,...","(shopee,order,blm sampe ke alamat penerimanya,...","(shopee,order,blm sampe ke alamat penerimanya,...","(shopee,order,blm sampe ke alamat penerimanya,...","(shopee,order,blm sampe ke alamat penerimanya,...","(shopee,shopee express,cepet,positive,deliver...","(shopee,order,blm sampe ke alamat penerimanya,...","(shopee,order,blm sampe ke alamat penerimanya,...","(shopee,order,blm sampe ke alamat penerimanya,..."
1,1639829371166015489,@sbtcon Jodantae sama shopee express soalnya a...,negative,produk;,SUBJECTIVE,Jodantae sama shopee express soalnya abang ab...,"(shopee, shopee express, cepet, positive, deli...",,,"(shopee, shopee express, cepet, positive, deli...","(shopee,shopee express,cepet,positive,delivery)","(shopee,shopee express,cepet,positive,delivery)","(shopee,shopee express,cepet,positive,delivery)","(shopee,shopee express,cepet,positive,delivery)","(shopee,shopee express,cepet,positive,delivery)","(shopee,shopee express,cepet,positive,deliver...","(shopee,shopee express,cepet,positive,delivery)","(shopee,shopee express,cepet,positive,delivery)","(shopee,shopee express,cepet,positive,delivery)"
2,1640221429517156352,Berubah ketika barang sampai.yg tadinya ada ja...,negative,produk;,SUBJECTIVE,Berubah ketika barang sampai.yg tadinya ada ja...,"(tokopedia, invoice, bisa ubah invoice seenak ...",,,"(tokopedia, invoice, bisa ubah invoice seenak ...","(tokopedia,invoice,bisa ubah invoice seenak me...","(_,barang,sampai skrng,negative,delivery)","(_,barang,ubah barang dari yg tadinya jadi ila...","(tokopedia,barang,sampai yg tadi ada jd ilang,...","(tokopedia,pengubah barang,jadi lenyap,negativ...","(shopee,shopee express,cepet,positive,deliver...","(tokopedia,barang,sampai yg tadi ada jadi leny...","(tokopedia,barang,tiba2 jadi lenyap,negative,d...","(tokopedia,barang,nyasar,negative,delivery)"


In [22]:
# Set up the quintuplet evaluator with post-processing switched off.
postprocessor = PostProcessor(use_postprocess=False)
evaluator = Evaluator(
    task_type='quintuplet',
    postprocessor=postprocessor,
)

In [23]:
# Compare the current experiment's predictions with the gold quintuplets.
prediction_column = f'{experiment_type}_model_prediction'
raw_scores, all_labels, all_preds = evaluator.evaluate(
    pred_seqs=df[prediction_column],
    gold_seqs=df['postprocess_quintuplet'],
)

100%|██████████| 200/200 [00:00<00:00, 100054.96it/s]


In [25]:
# Display the scores returned by evaluator.evaluate for the validation predictions.
raw_scores

{'precision': 0.858, 'recall': 0.85, 'f1': 0.854}

## Train score

In [72]:
# Read the training rows together with the stored model predictions.
df = pd.read_csv('../Data/quintuplet/model_1000-data_predictions_train.csv')
# Lower-case the gold labels with the vectorized string accessor. Unlike
# apply(lambda x: x.lower()), this leaves a missing label as NaN instead of
# raising AttributeError on the float value.
df['postprocess_quintuplet'] = df['postprocess_quintuplet'].str.lower()
df.head(3)

Unnamed: 0,aoriginal_id,content,final_sentiment,baseline_aspect_category,tweet_type,clean_tweet,label,corrected_label,keterangan,quintuplet_label,postprocess_quintuplet,p00_model_prediction
0,1647261067511496704,eh mau lebaran kyk gini ekspedisi kan lagi hec...,negative,produk;,SUBJECTIVE,eh mau lebaran kyk gini ekspedisi kan lagi hec...,"(shopee, shopee xpress, ekpsedisi kan lagi hec...",,,"(shopee, shopee xpress, ekpsedisi kan lagi hec...","(shopee,shopee xpress,ekpsedisi kan lagi hecti...","(shopee,shopee xpress,ekspedisi kan lagi hecti..."
1,1646546316204597248,@tanyakanrl asli pernah lewat beranda tbtb ada...,negative,delivery; website&apps; produk;,SUBJECTIVE,asli pernah lewat beranda tbtb ada yg jual ko...,"(shopee, konten dewasa, parah si kok bisa lolo...",,,"(shopee, konten dewasa, parah si kok bisa lolo...","(shopee,konten dewasa,parah si kok bisa lolos ...","(shopee,konten dewasa,parah si kok bisa lolos ..."
2,1647243563443388416,"Adeuh, ini barang gua dri shopee blm nyampai2 ...",negative,delivery; customerservice; produk;,SUBJECTIVE,"Adeuh, ini barang gua dri shopee blm nyampai2 ...","(shopee, barang, blm nyampai2 jg woy, negative...",,,"(shopee, barang, blm nyampai2 jg woy, negative...","(shopee,barang,blm nyampai2 jg woy,negative,de...","(shopee,barang,blm nyampai2 jg woy,negative,de..."


In [73]:
# Score the training-split predictions against the gold quintuplets.
# The column name is derived from preprocess_type, the same way it was built
# when the predictions were written, instead of repeating the 'p00' literal.
prediction_column = f'{preprocess_type}_model_prediction'
raw_scores, all_labels, all_preds = evaluator.evaluate(
    pred_seqs=df[prediction_column],
    gold_seqs=df['postprocess_quintuplet'],
)

100%|██████████| 930/930 [00:00<00:00, 109199.15it/s]


In [74]:
# Display the scores returned by evaluator.evaluate for the training-split predictions.
raw_scores

{'precision': 0.6540483701366983,
 'recall': 0.6032977691561591,
 'f1': 0.627648839556004}