In [13]:
'''
magic command from IPython extension to reload modules before executing user code
https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
'''

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import numpy as np
import pandas as pd
from io import StringIO
import csv
import sys
from tqdm.notebook import tqdm

In [15]:
'''
collecting corpus, split by double end
'''

collection = []

with open("./data/Indonesian_Manually_Tagged_Corpus.tsv", "r") as f:
    txt_file = f.read()

for sentence in txt_file.split('\n\n'):
    temp = pd.read_csv(StringIO(sentence), delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
    collection.append(temp.to_numpy())

collection = np.array(collection, dtype=object)

In [16]:
TRAINING_PERCENTAGE = 70
VALIDATION_PERCENTAGE = 10
TESTING_PERCENTAGE = 20

In [17]:
'''
shuffling collection data, and divide the data into training set, validation set, and testing set
'''

np.random.shuffle(collection)

training_count = round(len(collection) * TRAINING_PERCENTAGE / 100)
validation_count = round((len(collection) - training_count) * VALIDATION_PERCENTAGE / (100 - TRAINING_PERCENTAGE))

training = collection[0 : training_count]
validation = collection[training_count : training_count + validation_count]
testing = collection[training_count + validation_count : ]

In [102]:
from PosTagging import PosTagging
tagger = PosTagging(training)
_,_,_ = tagger.train()

[7021/7021]	|██████████████████████████████████████████████████|
finished...

In [None]:
lambdas = tagger.validate_0(validation)

2022-03-26 14:40:57,337	INFO worker.py:878 -- Calling ray.init() again after it has already been called.


  0%|          | 0/100 [00:00<?, ?it/s]

0.27701716314855
0.5540343262971


In [58]:
lambdas

[(0.0, 0.1335174590649043),
 (0.1, 0.8166107713552969),
 (0.2, 0.8492404813572697),
 (0.30000000000000004, 0.8658512527125666),
 (0.4, 0.8767015190372854),
 (0.5, 0.8860130203195896),
 (0.6, 0.8933912014203985),
 (0.7, 0.9013217597159203),
 (0.7999999999999999, 0.908029197080292),
 (0.8999999999999999, 0.9129216808048924)]

In [9]:
def get_tag_accuracy(__lambda, data):
    wrong = 0
    right = 0
    for i in tqdm(range(len(data)), colour="YELLOW"):
        tagger.set_lambda(__lambda)
        tags = tagger.predict(data[i])
        for j in range(len(tags)):
            if tags[j] == data[i][j][1]: right += 1 
            else: wrong += 1
    return float((right/(right+wrong)))

In [10]:
__lambda = 0.0
step_up = 0.01
lambda_values = []
ctr = 0
for i in tqdm(range(int(1.0/step_up))):
    lambda_values.append(get_tag_accuracy(__lambda, validation))
    __lambda +=step_up
lambda_values

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

[0.14047379422466516,
 0.6282805605016645,
 0.7053108306882403,
 0.7362390647983278,
 0.7611674537431292,
 0.7780444375629016,
 0.7907795927847023,
 0.8000696756212743,
 0.8070759464271889,
 0.8132306263064178,
 0.8188820933653325,
 0.8237980955330185,
 0.8282108848803902,
 0.8323914221568476,
 0.8361074552914763,
 0.8393976929627622,
 0.841758922350391,
 0.8453588294495626,
 0.8479136022296199,
 0.8505070836881629,
 0.8529070217542774,
 0.8553456684988775,
 0.8573585197801347,
 0.8594100797398777,
 0.8616551830920492,
 0.8628551521251064,
 0.8646744600139351,
 0.8660679724394209,
 0.8673840675079353,
 0.8690872493613068,
 0.8705194704652783,
 0.8714484787489355,
 0.8722226523186498,
 0.8741967949214213,
 0.8751645118835643,
 0.8763257722381358,
 0.8780289540915073,
 0.8789192536966788,
 0.8802740574436789,
 0.8815514438337075,
 0.8820546566540218,
 0.8828288302237361,
 0.8838739645428505,
 0.8848416815049934,
 0.8856932724316792,
 0.8865448633583649,
 0.8871254935356507,
 0.8879770844

In [104]:
best_lambda = np.amax(lambda_values)
best_lambda

0.9189440272509096

In [None]:
accuracy = get_tag_accuracy(best_lambda, testing)
print("Accuracy: {:.2%}".format(accuracy))