In [19]:
from src.finetuner import FineTuner
from src.inference import ModelInference
from src.preprocessor import Preprocessor
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    Seq2SeqTrainingArguments
)
from datasets import load_dataset

In [20]:
#constant
SAVE_PATH = f'../models/pt-indot5-TA_PT'
PRETRAINED_MODEL = "Wikidepia/IndoT5-base"

# Dataset

In [21]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

In [22]:
max_length = 128
text_col = 'input'
label_col = 'triplet'
preprocessor = Preprocessor('p1', tokenizer, max_length, text_col, label_col)

In [23]:
raw_dataset = load_dataset('csv', data_files={'train':'../Data/post-train/ASTE/train.csv'
                                                        , 'validation':'../Data/post-train/ASTE/dev.csv',
                                                        'test' : '../Data/post-train/ASTE/test.csv'})
preprocessor = Preprocessor('p1', tokenizer, max_length, text_col, label_col)
tokenized_dataset = raw_dataset.map(preprocessor.preprocess_dataset, batched=True, remove_columns=raw_dataset['train'].column_names)

Found cached dataset csv (C:/Users/danendra/.cache/huggingface/datasets/csv/default-ecae22b8b998e95e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 749.96it/s]
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-ecae22b8b998e95e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-db44143562293862.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-ecae22b8b998e95e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-ecf9d05cf85cad4f.arrow
Loading cached processed dataset at C:\Users\danendra\.cache\huggingface\datasets\csv\default-ecae22b8b998e95e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-f4bfb634d23e25b1.arrow


In [24]:
tokenizer.decode(tokenized_dataset['train']['input_ids'][0])

'kamar saya ada kendala di ac tidak berfungsi optimal dan juga wifi koneksi kurang stabil</s>'

In [25]:
tokenizer.decode(tokenized_dataset['train']['labels'][0])

'(ac,tidak berfungsi optimal,NEG);(wifi koneksi,kurang stabil,NEG)</s>'

# Model Training

In [26]:
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)

In [27]:
#training argument
training_args = Seq2SeqTrainingArguments(
    SAVE_PATH,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    resume_from_checkpoint=True,
    num_train_epochs=50,
    save_total_limit=2,
)

In [28]:
finetuner = FineTuner(model=model, save_path=SAVE_PATH, tokenizer=tokenizer, 
                      train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['validation'])

cuda:0


In [29]:
finetuner.fine_tune(training_args)



cuda:0


  0%|          | 0/17750 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                     
  2%|▏         | 355/17750 [01:39<1:10:19,  4.12it/s]

{'eval_loss': 0.9452595710754395, 'eval_runtime': 6.2612, 'eval_samples_per_second': 154.443, 'eval_steps_per_second': 19.325, 'epoch': 1.0}


  3%|▎         | 501/17750 [02:38<1:01:48,  4.65it/s] 

{'loss': 1.596, 'learning_rate': 1.943661971830986e-05, 'epoch': 1.41}


                                                     
  4%|▍         | 710/17750 [03:29<54:34,  5.20it/s]

{'eval_loss': 0.6241455674171448, 'eval_runtime': 5.6087, 'eval_samples_per_second': 172.411, 'eval_steps_per_second': 21.574, 'epoch': 2.0}


  6%|▌         | 1000/17750 [05:08<1:05:58,  4.23it/s]

{'loss': 0.6577, 'learning_rate': 1.887323943661972e-05, 'epoch': 2.82}


                                                      
  6%|▌         | 1065/17750 [05:32<1:21:42,  3.40it/s]

{'eval_loss': 0.3459356427192688, 'eval_runtime': 6.1248, 'eval_samples_per_second': 157.883, 'eval_steps_per_second': 19.756, 'epoch': 3.0}


                                                       
  8%|▊         | 1420/17750 [07:23<1:00:53,  4.47it/s]

{'eval_loss': 0.26861268281936646, 'eval_runtime': 5.899, 'eval_samples_per_second': 163.925, 'eval_steps_per_second': 20.512, 'epoch': 4.0}


  8%|▊         | 1500/17750 [08:09<1:01:28,  4.41it/s] 

{'loss': 0.3614, 'learning_rate': 1.830985915492958e-05, 'epoch': 4.23}


                                                      
 10%|█         | 1775/17750 [09:25<55:14,  4.82it/s]

{'eval_loss': 0.2513056695461273, 'eval_runtime': 5.9933, 'eval_samples_per_second': 161.348, 'eval_steps_per_second': 20.189, 'epoch': 5.0}


 11%|█▏        | 2001/17750 [10:37<1:05:35,  4.00it/s] 

{'loss': 0.2648, 'learning_rate': 1.774647887323944e-05, 'epoch': 5.63}


                                                      
 12%|█▏        | 2130/17750 [11:13<55:13,  4.71it/s]

{'eval_loss': 0.23803502321243286, 'eval_runtime': 5.9692, 'eval_samples_per_second': 161.997, 'eval_steps_per_second': 20.271, 'epoch': 6.0}


                                                       
 14%|█▍        | 2485/17750 [13:06<52:07,  4.88it/s]

{'eval_loss': 0.22920143604278564, 'eval_runtime': 6.4663, 'eval_samples_per_second': 149.544, 'eval_steps_per_second': 18.712, 'epoch': 7.0}


 14%|█▍        | 2501/17750 [13:36<1:10:53,  3.59it/s] 

{'loss': 0.2247, 'learning_rate': 1.7183098591549297e-05, 'epoch': 7.04}


                                                      
 16%|█▌        | 2840/17750 [15:03<1:00:36,  4.10it/s]

{'eval_loss': 0.22656704485416412, 'eval_runtime': 6.0351, 'eval_samples_per_second': 160.229, 'eval_steps_per_second': 20.049, 'epoch': 8.0}


 17%|█▋        | 3001/17750 [15:58<53:47,  4.57it/s]   

{'loss': 0.1883, 'learning_rate': 1.6619718309859155e-05, 'epoch': 8.45}


                                                      
 18%|█▊        | 3195/17750 [16:45<48:46,  4.97it/s]

{'eval_loss': 0.22051884233951569, 'eval_runtime': 5.6625, 'eval_samples_per_second': 170.773, 'eval_steps_per_second': 21.369, 'epoch': 9.0}


 20%|█▉        | 3501/17750 [18:15<53:42,  4.42it/s]   

{'loss': 0.1687, 'learning_rate': 1.6056338028169017e-05, 'epoch': 9.86}


                                                    
 20%|██        | 3550/17750 [18:32<47:51,  4.94it/s]

{'eval_loss': 0.21989645063877106, 'eval_runtime': 5.6465, 'eval_samples_per_second': 171.256, 'eval_steps_per_second': 21.429, 'epoch': 10.0}


                                                       
 22%|██▏       | 3905/17750 [20:26<44:19,  5.20it/s]

{'eval_loss': 0.21874482929706573, 'eval_runtime': 5.635, 'eval_samples_per_second': 171.606, 'eval_steps_per_second': 21.473, 'epoch': 11.0}


 23%|██▎       | 4001/17750 [21:09<53:23,  4.29it/s]   

{'loss': 0.1496, 'learning_rate': 1.5492957746478872e-05, 'epoch': 11.27}


                                                      
 24%|██▍       | 4260/17750 [22:15<44:54,  5.01it/s]

{'eval_loss': 0.2170586735010147, 'eval_runtime': 5.5552, 'eval_samples_per_second': 174.07, 'eval_steps_per_second': 21.781, 'epoch': 12.0}


 25%|██▌       | 4500/17750 [23:38<44:00,  5.02it/s]   

{'loss': 0.1345, 'learning_rate': 1.4929577464788734e-05, 'epoch': 12.68}


                                                      
 26%|██▌       | 4615/17750 [24:15<1:01:32,  3.56it/s]

{'eval_loss': 0.2221137434244156, 'eval_runtime': 6.7261, 'eval_samples_per_second': 143.769, 'eval_steps_per_second': 17.99, 'epoch': 13.0}


                                                       
 28%|██▊       | 4970/17750 [26:15<48:14,  4.41it/s]

{'eval_loss': 0.21863305568695068, 'eval_runtime': 6.0217, 'eval_samples_per_second': 160.587, 'eval_steps_per_second': 20.094, 'epoch': 14.0}


 28%|██▊       | 5000/17750 [26:46<1:07:21,  3.15it/s] 

{'loss': 0.1219, 'learning_rate': 1.4366197183098594e-05, 'epoch': 14.08}


                                                      
 30%|███       | 5325/17750 [28:23<47:01,  4.40it/s]

{'eval_loss': 0.22647476196289062, 'eval_runtime': 6.6947, 'eval_samples_per_second': 144.442, 'eval_steps_per_second': 18.074, 'epoch': 15.0}


 31%|███       | 5501/17750 [29:34<51:29,  3.96it/s]   

{'loss': 0.1122, 'learning_rate': 1.380281690140845e-05, 'epoch': 15.49}


                                                      
 32%|███▏      | 5680/17750 [30:24<44:00,  4.57it/s]

{'eval_loss': 0.21989354491233826, 'eval_runtime': 6.1325, 'eval_samples_per_second': 157.684, 'eval_steps_per_second': 19.731, 'epoch': 16.0}


 34%|███▍      | 6001/17750 [32:07<43:14,  4.53it/s]   

{'loss': 0.1025, 'learning_rate': 1.323943661971831e-05, 'epoch': 16.9}


                                                    
 34%|███▍      | 6035/17750 [32:21<52:25,  3.72it/s]

{'eval_loss': 0.22559808194637299, 'eval_runtime': 6.4573, 'eval_samples_per_second': 149.753, 'eval_steps_per_second': 18.738, 'epoch': 17.0}


                                                       
 36%|███▌      | 6390/17750 [34:22<40:59,  4.62it/s]

{'eval_loss': 0.2229899913072586, 'eval_runtime': 6.2743, 'eval_samples_per_second': 154.122, 'eval_steps_per_second': 19.285, 'epoch': 18.0}


 37%|███▋      | 6500/17750 [35:17<43:54,  4.27it/s]   

{'loss': 0.094, 'learning_rate': 1.2676056338028171e-05, 'epoch': 18.31}


                                                    
 38%|███▊      | 6745/17750 [36:17<43:44,  4.19it/s]

{'eval_loss': 0.23363301157951355, 'eval_runtime': 5.8057, 'eval_samples_per_second': 166.561, 'eval_steps_per_second': 20.842, 'epoch': 19.0}


 39%|███▉      | 7001/17750 [37:42<39:05,  4.58it/s]   

{'loss': 0.0877, 'learning_rate': 1.211267605633803e-05, 'epoch': 19.72}


                                                      
 40%|████      | 7100/17750 [38:12<35:29,  5.00it/s]

{'eval_loss': 0.22683662176132202, 'eval_runtime': 6.2178, 'eval_samples_per_second': 155.52, 'eval_steps_per_second': 19.46, 'epoch': 20.0}


                                                       
 42%|████▏     | 7455/17750 [40:03<35:19,  4.86it/s]

{'eval_loss': 0.23168307542800903, 'eval_runtime': 5.8409, 'eval_samples_per_second': 165.556, 'eval_steps_per_second': 20.716, 'epoch': 21.0}


 42%|████▏     | 7501/17750 [40:42<45:14,  3.78it/s]   

{'loss': 0.0839, 'learning_rate': 1.1549295774647888e-05, 'epoch': 21.13}


                                                    
 44%|████▍     | 7810/17750 [41:58<35:55,  4.61it/s]

{'eval_loss': 0.23700390756130219, 'eval_runtime': 5.8874, 'eval_samples_per_second': 164.25, 'eval_steps_per_second': 20.552, 'epoch': 22.0}


 45%|████▌     | 8001/17750 [43:07<36:24,  4.46it/s]   

{'loss': 0.0768, 'learning_rate': 1.0985915492957748e-05, 'epoch': 22.54}


                                                    
 46%|████▌     | 8165/17750 [43:50<34:05,  4.69it/s]

{'eval_loss': 0.2363223433494568, 'eval_runtime': 5.9608, 'eval_samples_per_second': 162.227, 'eval_steps_per_second': 20.299, 'epoch': 23.0}


 48%|████▊     | 8500/17750 [45:33<34:35,  4.46it/s]   

{'loss': 0.0706, 'learning_rate': 1.0422535211267606e-05, 'epoch': 23.94}


                                                    
 48%|████▊     | 8520/17750 [45:44<36:34,  4.21it/s]

{'eval_loss': 0.23977282643318176, 'eval_runtime': 6.1663, 'eval_samples_per_second': 156.821, 'eval_steps_per_second': 19.623, 'epoch': 24.0}


                                                       
 50%|█████     | 8875/17750 [47:40<30:49,  4.80it/s]

{'eval_loss': 0.24121585488319397, 'eval_runtime': 5.9182, 'eval_samples_per_second': 163.394, 'eval_steps_per_second': 20.445, 'epoch': 25.0}


 51%|█████     | 9001/17750 [48:35<33:08,  4.40it/s]   

{'loss': 0.0666, 'learning_rate': 9.859154929577466e-06, 'epoch': 25.35}


                                                      
 52%|█████▏    | 9230/17750 [49:41<29:12,  4.86it/s]

{'eval_loss': 0.24184198677539825, 'eval_runtime': 5.9077, 'eval_samples_per_second': 163.686, 'eval_steps_per_second': 20.482, 'epoch': 26.0}


 54%|█████▎    | 9501/17750 [51:03<29:38,  4.64it/s]   

{'loss': 0.0654, 'learning_rate': 9.295774647887325e-06, 'epoch': 26.76}


                                                    
 54%|█████▍    | 9585/17750 [51:29<26:59,  5.04it/s]

{'eval_loss': 0.2450026273727417, 'eval_runtime': 5.9017, 'eval_samples_per_second': 163.851, 'eval_steps_per_second': 20.503, 'epoch': 27.0}


                                                       
 56%|█████▌    | 9940/17750 [53:24<26:51,  4.85it/s]

{'eval_loss': 0.25158265233039856, 'eval_runtime': 5.9162, 'eval_samples_per_second': 163.449, 'eval_steps_per_second': 20.452, 'epoch': 28.0}


 56%|█████▋    | 10001/17750 [54:03<30:16,  4.27it/s]  

{'loss': 0.0594, 'learning_rate': 8.732394366197183e-06, 'epoch': 28.17}


                                                     
 58%|█████▊    | 10295/17750 [55:17<26:51,  4.63it/s]

{'eval_loss': 0.2507924735546112, 'eval_runtime': 5.9134, 'eval_samples_per_second': 163.528, 'eval_steps_per_second': 20.462, 'epoch': 29.0}


 59%|█████▉    | 10500/17750 [56:27<25:27,  4.75it/s]   

{'loss': 0.0573, 'learning_rate': 8.169014084507043e-06, 'epoch': 29.58}


                                                     
 60%|██████    | 10650/17750 [57:08<26:51,  4.40it/s]

{'eval_loss': 0.24997927248477936, 'eval_runtime': 5.8682, 'eval_samples_per_second': 164.787, 'eval_steps_per_second': 20.62, 'epoch': 30.0}


 62%|██████▏   | 11001/17750 [58:56<24:59,  4.50it/s]   

{'loss': 0.0556, 'learning_rate': 7.6056338028169015e-06, 'epoch': 30.99}


                                                     
 62%|██████▏   | 11005/17750 [59:03<25:40,  4.38it/s]

{'eval_loss': 0.2558487355709076, 'eval_runtime': 5.9134, 'eval_samples_per_second': 163.527, 'eval_steps_per_second': 20.462, 'epoch': 31.0}


                                                        
 64%|██████▍   | 11360/17750 [1:00:52<24:35,  4.33it/s]

{'eval_loss': 0.25271841883659363, 'eval_runtime': 5.8862, 'eval_samples_per_second': 164.283, 'eval_steps_per_second': 20.557, 'epoch': 32.0}


 65%|██████▍   | 11500/17750 [1:01:46<24:15,  4.30it/s]   

{'loss': 0.0535, 'learning_rate': 7.042253521126761e-06, 'epoch': 32.39}


                                                       
 66%|██████▌   | 11715/17750 [1:02:41<20:49,  4.83it/s]

{'eval_loss': 0.2572874426841736, 'eval_runtime': 5.8682, 'eval_samples_per_second': 164.787, 'eval_steps_per_second': 20.62, 'epoch': 33.0}


 68%|██████▊   | 12000/17750 [1:04:15<21:32,  4.45it/s]   

{'loss': 0.0493, 'learning_rate': 6.478873239436621e-06, 'epoch': 33.8}


                                                       
 68%|██████▊   | 12070/17750 [1:04:37<18:27,  5.13it/s]

{'eval_loss': 0.2619326710700989, 'eval_runtime': 5.846, 'eval_samples_per_second': 165.411, 'eval_steps_per_second': 20.698, 'epoch': 34.0}


                                                          
 70%|███████   | 12425/17750 [1:06:31<19:34,  4.54it/s]

{'eval_loss': 0.26079344749450684, 'eval_runtime': 6.1088, 'eval_samples_per_second': 158.297, 'eval_steps_per_second': 19.808, 'epoch': 35.0}


 70%|███████   | 12501/17750 [1:07:14<18:44,  4.67it/s]   

{'loss': 0.0478, 'learning_rate': 5.915492957746479e-06, 'epoch': 35.21}


                                                       
 72%|███████▏  | 12780/17750 [1:08:25<17:57,  4.61it/s]

{'eval_loss': 0.2652977705001831, 'eval_runtime': 6.0964, 'eval_samples_per_second': 158.618, 'eval_steps_per_second': 19.848, 'epoch': 36.0}


 73%|███████▎  | 13001/17750 [1:09:42<18:05,  4.37it/s]   

{'loss': 0.0468, 'learning_rate': 5.352112676056338e-06, 'epoch': 36.62}


                                                       
 74%|███████▍  | 13135/17750 [1:10:21<16:16,  4.72it/s]

{'eval_loss': 0.26342713832855225, 'eval_runtime': 5.9639, 'eval_samples_per_second': 162.144, 'eval_steps_per_second': 20.289, 'epoch': 37.0}


                                                          
 76%|███████▌  | 13490/17750 [1:12:38<19:26,  3.65it/s]

{'eval_loss': 0.2680544853210449, 'eval_runtime': 23.1069, 'eval_samples_per_second': 41.849, 'eval_steps_per_second': 5.237, 'epoch': 38.0}


 76%|███████▌  | 13501/17750 [1:13:05<44:33,  1.59it/s]   

{'loss': 0.0456, 'learning_rate': 4.788732394366197e-06, 'epoch': 38.03}


                                                         
 78%|███████▊  | 13845/17750 [1:14:33<14:14,  4.57it/s]

{'eval_loss': 0.27221858501434326, 'eval_runtime': 5.8579, 'eval_samples_per_second': 165.077, 'eval_steps_per_second': 20.656, 'epoch': 39.0}


 79%|███████▉  | 14000/17750 [1:15:37<13:49,  4.52it/s]   

{'loss': 0.0438, 'learning_rate': 4.225352112676057e-06, 'epoch': 39.44}


                                                       
 80%|████████  | 14200/17750 [1:16:29<11:53,  4.98it/s]

{'eval_loss': 0.2720315158367157, 'eval_runtime': 5.9055, 'eval_samples_per_second': 163.746, 'eval_steps_per_second': 20.489, 'epoch': 40.0}


 82%|████████▏ | 14501/17750 [1:18:06<10:57,  4.94it/s]   

{'loss': 0.0427, 'learning_rate': 3.6619718309859158e-06, 'epoch': 40.85}


                                                       
 82%|████████▏ | 14555/17750 [1:18:23<10:29,  5.08it/s]

{'eval_loss': 0.2733479142189026, 'eval_runtime': 5.563, 'eval_samples_per_second': 173.828, 'eval_steps_per_second': 21.751, 'epoch': 41.0}


                                                         
 84%|████████▍ | 14910/17750 [1:20:08<09:00,  5.25it/s]

{'eval_loss': 0.27329060435295105, 'eval_runtime': 5.5833, 'eval_samples_per_second': 173.195, 'eval_steps_per_second': 21.672, 'epoch': 42.0}


 85%|████████▍ | 15001/17750 [1:20:58<09:40,  4.73it/s]  

{'loss': 0.0419, 'learning_rate': 3.0985915492957746e-06, 'epoch': 42.25}


                                                       
 86%|████████▌ | 15265/17750 [1:22:07<08:04,  5.13it/s]

{'eval_loss': 0.2756756544113159, 'eval_runtime': 5.9561, 'eval_samples_per_second': 162.354, 'eval_steps_per_second': 20.315, 'epoch': 43.0}


 87%|████████▋ | 15500/17750 [1:23:32<14:45,  2.54it/s]  

{'loss': 0.0421, 'learning_rate': 2.535211267605634e-06, 'epoch': 43.66}


                                                       
 88%|████████▊ | 15620/17750 [1:24:05<07:13,  4.91it/s]

{'eval_loss': 0.27593791484832764, 'eval_runtime': 6.2098, 'eval_samples_per_second': 155.721, 'eval_steps_per_second': 19.485, 'epoch': 44.0}


                                                         
 90%|█████████ | 15975/17750 [1:25:57<06:01,  4.91it/s]

{'eval_loss': 0.2755855917930603, 'eval_runtime': 5.9145, 'eval_samples_per_second': 163.497, 'eval_steps_per_second': 20.458, 'epoch': 45.0}


 90%|█████████ | 16001/17750 [1:26:32<06:52,  4.24it/s]  

{'loss': 0.0402, 'learning_rate': 1.971830985915493e-06, 'epoch': 45.07}


                                                       
 92%|█████████▏| 16330/17750 [1:27:54<04:36,  5.13it/s]

{'eval_loss': 0.27625027298927307, 'eval_runtime': 5.4938, 'eval_samples_per_second': 176.016, 'eval_steps_per_second': 22.025, 'epoch': 46.0}


 93%|█████████▎| 16501/17750 [1:28:57<04:18,  4.83it/s]  

{'loss': 0.0391, 'learning_rate': 1.4084507042253523e-06, 'epoch': 46.48}


                                                       
 94%|█████████▍| 16685/17750 [1:29:42<03:29,  5.08it/s]

{'eval_loss': 0.2762194871902466, 'eval_runtime': 5.5193, 'eval_samples_per_second': 175.204, 'eval_steps_per_second': 21.923, 'epoch': 47.0}


 96%|█████████▌| 17001/17750 [1:31:22<03:02,  4.10it/s]  

{'loss': 0.0392, 'learning_rate': 8.450704225352114e-07, 'epoch': 47.89}


                                                       
 96%|█████████▌| 17040/17750 [1:31:37<02:45,  4.29it/s]

{'eval_loss': 0.27872234582901, 'eval_runtime': 5.6704, 'eval_samples_per_second': 170.535, 'eval_steps_per_second': 21.339, 'epoch': 48.0}


                                                         
 98%|█████████▊| 17395/17750 [1:33:26<01:15,  4.69it/s]

{'eval_loss': 0.2780137062072754, 'eval_runtime': 5.5599, 'eval_samples_per_second': 173.925, 'eval_steps_per_second': 21.763, 'epoch': 49.0}


 99%|█████████▊| 17500/17750 [1:34:16<01:17,  3.22it/s]

{'loss': 0.0371, 'learning_rate': 2.8169014084507043e-07, 'epoch': 49.3}


                                                       
100%|██████████| 17750/17750 [1:35:44<00:00,  1.38s/it]

{'eval_loss': 0.2777468264102936, 'eval_runtime': 6.0533, 'eval_samples_per_second': 159.747, 'eval_steps_per_second': 19.989, 'epoch': 50.0}


100%|██████████| 17750/17750 [1:36:11<00:00,  3.08it/s]


{'train_runtime': 5771.7034, 'train_samples_per_second': 24.568, 'train_steps_per_second': 3.075, 'train_loss': 0.1517695594035404, 'epoch': 50.0}


# Inference

In [30]:
model = T5ForConditionalGeneration.from_pretrained(SAVE_PATH).to('cuda')

In [31]:
model_inference = ModelInference(batch_size=8, dataset=tokenized_dataset['test'], model=model, tokenizer=tokenizer)
pred_text = model_inference.inference()

100%|██████████| 117/117 [01:54<00:00,  1.03it/s]


In [32]:
test_dataset = raw_dataset['test']
test_dataset = test_dataset.add_column('triplet_model_prediction', pred_text)
test_dataset.to_csv('../Data/post-train/ASTE/model_inference.csv')

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 17.66ba/s]


265327

# Evaluate

In [33]:
import pandas as pd
from src.evaluator import Evaluator

In [34]:
df = pd.read_csv('../Data/post-train/ASTE/model_inference.csv')

In [35]:
evaluator = Evaluator(task_type='triplet')
raw_scores, all_labels, all_preds = evaluator.evaluate(df['triplet_model_prediction'].astype('str'), df['triplet'].astype('str'))

100%|██████████| 935/935 [00:00<00:00, 74731.30it/s]


In [36]:
raw_scores

{'precision': 0.746765974127793,
 'recall': 0.775020341741253,
 'f1': 0.7606308644440007}

# Inference with zili anotated data

In [1]:
from src.inference import ModelInference
from src.preprocessor import Preprocessor
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
)
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#constant
max_length = 20
text_col = 'content'
label_col = 'triplet_label'
preprocess_type = 'p00'
SAVE_PATH = f'../models/pt-indot5-TA_PT'
PRETRAINED_MODEL = "Wikidepia/IndoT5-base"
DATA_PATH = '../Data/quadruplet/quadruplet_2200-data_annottated_clean.csv'
inference_length = 128

In [3]:
model = T5ForConditionalGeneration.from_pretrained(SAVE_PATH)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

In [6]:
preprocessor = Preprocessor(preprocess_type, tokenizer, max_length, text_col, label_col)
raw_dataset = load_dataset('csv', data_files=DATA_PATH)
tokenized_dataset = raw_dataset.map(preprocessor.preprocess_dataset, batched=True, remove_columns=raw_dataset['train'].column_names)

Downloading and preparing dataset csv/default to C:/Users/danendra/.cache/huggingface/datasets/csv/default-01756b12cba475bf/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00,  7.51it/s]
                                                        

Dataset csv downloaded and prepared to C:/Users/danendra/.cache/huggingface/datasets/csv/default-01756b12cba475bf/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 90.41it/s]
                                                                 

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2298
    })
})

In [11]:
tokenizer.decode(tokenized_dataset['train']['labels'][0])

'(hp, harga terlalu tinggi, negative)</s>'

In [12]:
tokenizer.decode(tokenized_dataset['train']['input_ids'][0])

'nder hpnya kamu pakai berapa lama? kayaknya jt masih terlalu tingi</s>'

In [13]:
model_inference = ModelInference(batch_size=8, dataset=tokenized_dataset['train'], model=model, 
                                 tokenizer=tokenizer, inference_len=inference_length)
pred_text = model_inference.inference()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 288/288 [02:19<00:00,  2.07it/s]


In [15]:
raw_dataset = raw_dataset['train'].add_column(f'{preprocess_type}_model_prediction', pred_text)
raw_dataset.to_csv('../Data/quadruplet/model_2200_data_semi_supervised_labels.csv')

Creating CSV from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating CSV from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 47.03ba/s]


908982

# Evaluate

In [21]:
import pandas as pd
from src.evaluator import Evaluator
from src.postprocessor import PostProcessor
from src.utils import extract_triplet

In [25]:
df = pd.read_csv('../Data/quadruplet/model_2200_data_semi_supervised_labels.csv')
df['triplet_label'] = df['triplet_label'].apply(lambda x:x.lower())
df['p00_model_prediction'] = df['p00_model_prediction'].apply(lambda x:x.lower())
df.head(5)

Unnamed: 0,content,final_sentiment,baseline_aspect_category,tweet_type,clean_tweet,quadruplet,triplet_label,p00_model_prediction
0,@JUALAN_BASE @fleurlovincs Nder hpnya kamu pak...,neutral,delivery; produk;,SUBJECTIVE,Nder hpnya kamu pake brp lama? Kayaknya jt ms...,"(hp, harga terlalu tinggi, negative, produk)","(hp, harga terlalu tinggi, negative)","(hpnya,berapa lama,neg)"
1,@teh_manis__ @ezash coba cari di olx mba kalo ...,neutral,produk;,SUBJECTIVE,coba cari di olx mba kalo baru pertama kali m...,"(motor bekas, pertama kali, positive, produk)","(motor bekas, pertama kali, positive)","(motor,bekas,neg)"
2,@tanyakanrl Kalo mau beli second jangan di tok...,positive,delivery; produk;,SUBJECTIVE,Kalo mau beli second jangan di toko oren atau...,"(beli second, biar lebih aman, negative, produ...","(beli second, biar lebih aman, negative);(olx,...","(f,jangan di market place,neg)"
3,kalo jam 1 malem gabisa tidur pengen makan nas...,neutral,customerservice; produk;,SUBJECTIVE,kalo jam malem gabisa tidur pengen makan nasi ...,"(nasi goreng, enak, positive, produk)","(nasi goreng, enak, positive)","(nasi goreng,enak,pos)"
4,"@tirta_cipeng Di luar dunia fashion, saya seri...",neutral,produk;,SUBJECTIVE,"Di luar dunia fashion, saya sering banget nem...","(olx, reseller, negative, customerservice)","(olx, reseller, negative)","(budget,mengakui,pos)"


In [18]:
postprocessor = PostProcessor(use_postprocess=False)
evaluator = Evaluator(task_type='triplet', postprocessor=postprocessor)

In [19]:
raw_scores, all_labels, all_preds = evaluator.evaluate(pred_seqs=df['p00_model_prediction'],
                   gold_seqs=df['triplet_label'])

100%|██████████| 2298/2298 [00:00<00:00, 114761.93it/s]


post-process

In [31]:
sen_map = {
    'neg' : 'negative',
    'pos' : 'positive',
    'net' : 'neutral'
}
errors_idx = []
for i in range(len(df)):
    try:
        row = df.iloc[i]
        triplets = extract_triplet(row['p00_model_prediction'])
        new_triplet = []
        for triplet in triplets:
            aspect, opinion, sentiment = triplet
            sentiment = sen_map[sentiment]
            new_triplet.append(f"({aspect},{opinion},{sentiment})")
        new_triplet_str = ";".join(new_triplet)
        df.at[i, 'p00_model_prediction'] = new_triplet_str
    except:
        errors_idx.append(i)

In [34]:
clean_df = df.drop(index=errors_idx)

In [40]:
clean_df

Unnamed: 0,content,final_sentiment,baseline_aspect_category,tweet_type,clean_tweet,quadruplet,triplet_label,p00_model_prediction
17,Trus waktu kantorku hampir pindah di daerah ka...,neutral,produk;,SUBJECTIVE,Trus waktu kantorku hampir pindah di daerah ka...,"(mamikos, nyari kosan/kontrakan, olx, produk);...","(mamikos, nyari kosan/kontrakan, olx);(fesbuk,...",
23,@beautydyah Bisa kok hanya saja yang lebih dir...,positive,delivery; website&apps; payment; produk;,SUBJECTIVE,Bisa kok hanya saja yang lebih direkomendasik...,"(lebih direkomendasikan, keluar, centang biru,...","(lebih direkomendasikan, keluar, centang biru)","(biru,bisa,positive)"
24,"Btw ga cantumin foto soalnya bukan punyaku, jd...",negative,produk;,SUBJECTIVE,"Btw ga cantumin foto soalnya bukan punyaku, jd...","(penjual, ngajak cod-an, positive, produk)","(penjual, ngajak cod-an, positive)","(foto,ga cantumin,negative)"
25,"@babyjxxxoxo Wkwk lebay sih kl email, kecuali ...",neutral,produk;,SUBJECTIVE,"Wkwk lebay sih kl email, kecuali nomor wasaf ...","(pasang iklan, nomornya lupa hide, negative, w...","(pasang iklan, nomornya lupa hide, negative)","(email,berlebihan,negative)"
26,@SeputarTetangga Buat referensi coba liat2 aja...,neutral,price; produk;,SUBJECTIVE,Buat referensi coba liat2 aja harga rumah di ...,"(harga rumah, murah2, positive, produk); (kont...","(harga rumah, murah2, positive);(kontrakan, di...","(harga rumah,murah,positive)"
...,...,...,...,...,...,...,...,...
2293,@indomilkyourway @alfamart @alfamidi_ku @Indom...,positive,produk;,SUBJECTIVE,PINK BLOSSOM ENAK BGTIIII,"(pink blossom, enak bgt, positive, produk)","(pink blossom, enak bgt, positive)","(bgti,pink,positive);(bgti,enak bgti,positive)"
2294,shopee gw doang apa emang lagi error sih. tiap...,negative,produk;,SUBJECTIVE,shopee gw doang apa emang lagi error sih. tiap...,"(shopee, lagi error, negative, website&apps)","(shopee, lagi error, negative)","(shope,doang,negative)"
2295,@indomilkyourway @alfamart @alfamidi_ku @Indom...,positive,produk;,SUBJECTIVE,"Yang black latte enak banget sumpilll, plis j...","(black latte, enak banget sumpilll, positive, ...","(black latte, enak banget sumpilll, positive)","(varian,enak banget,positive);(varian,sumpil,p..."
2296,@sbtcon Di deskripsi link Shopee sllu tulis ka...,negative,produk;,SUBJECTIVE,Di deskripsi link Shopee sllu tulis kalo rusa...,"(packing, diluar tanggung jawab seller, negati...","(packing, diluar tanggung jawab seller, negative)","(seler,rusak,negative)"


In [41]:
raw_scores, all_labels, all_preds = evaluator.evaluate(pred_seqs=df['p00_model_prediction'],
                   gold_seqs=df['triplet_label'])

100%|██████████| 2298/2298 [00:00<00:00, 7800.50it/s]


In [43]:
clean_df.to_csv('../Data/quadruplet/model_2200_data_semi_supervised_labels.csv', index=False)