In [1]:
'''
magic command from IPython extension to reload modules before executing user code
https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
'''

%load_ext autoreload
%autoreload 2

In [31]:
import numpy as np
import pandas as pd
from io import StringIO
import csv
import sys
from tqdm.notebook import tqdm

In [3]:
'''
collecting corpus, split by double end
'''

collection = []

with open("./data/Indonesian_Manually_Tagged_Corpus.tsv", "r") as f:
    txt_file = f.read()

for sentence in txt_file.split('\n\n'):
    temp = pd.read_csv(StringIO(sentence), delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
    collection.append(temp.to_numpy())

collection = np.array(collection, dtype=object)

In [4]:
TRAINING_PERCENTAGE = 70
VALIDATION_PERCENTAGE = 10
TESTING_PERCENTAGE = 20

In [5]:
'''
shuffling collection data, and divide the data into training set, validation set, and testing set
'''

np.random.shuffle(collection)

training_count = round(len(collection) * TRAINING_PERCENTAGE / 100)
validation_count = round((len(collection) - training_count) * VALIDATION_PERCENTAGE / (100 - TRAINING_PERCENTAGE))

training = collection[0 : training_count]
validation = collection[training_count : training_count + validation_count]
testing = collection[training_count + validation_count : ]

In [67]:
from PosTagging import PosTagging
tagger = PosTagging(training)
emit, transition, context = tagger.evaluateFeed()

[7021/7021]	|██████████████████████████████████████████████████|
finished...

In [70]:
i = 0
tags = tagger.predict(validation[i])
print(tags)

['NN', 'NN', 'PR', 'NN', 'NN', 'NN', 'SC', 'VB', 'IN', 'NN', 'OD', 'CD', 'Z', 'CC', 'NN', 'PRP', 'MD', 'VB', 'IN', 'NN', 'OD', 'CD', 'Z', 'VB', 'NNP', 'NNP', 'Z', 'SC', 'VB', 'IN', 'NNP', 'Z', 'NNP', 'NNP']


In [43]:
validation[0]

array([['Penurunan', 'NN'],
       ['produksi', 'NN'],
       ['ini', 'PR'],
       ['akibat', 'NN'],
       ['dampak', 'NN'],
       ['musim kering', 'NN'],
       ['yang', 'SC'],
       ['terjadi', 'VB'],
       ['pada', 'IN'],
       ['semester', 'NN'],
       ['II', 'OD'],
       ['2006', 'CD'],
       [',', 'Z'],
       ['dan', 'CC'],
       ['dampak', 'NN'],
       ['-nya', 'PRP'],
       ['masih', 'MD'],
       ['dirasakan', 'VB'],
       ['pada', 'IN'],
       ['kuartal', 'NN'],
       ['pertama', 'OD'],
       ['2007', 'CD'],
       [',', 'Z'],
       ['kata', 'VB'],
       ['Direktur', 'NNP'],
       ['AALI', 'NNP'],
       [',', 'Z'],
       ['Julie', 'NNP'],
       ['Syaftari', 'NNP'],
       ['kepada', 'IN'],
       ['BEJ', 'NNP'],
       [',', 'Z'],
       ['Kamis', 'NNP'],
       ['.', 'Z']], dtype=object)

In [90]:
def get_tag_accuracy(__lambda, data):
    wrong = 0
    right = 0
    for i in tqdm(range(len(data)), colour="YELLOW"):
        tagger.set_lambda(__lambda)
        tags = tagger.predict(data[i])
        for j in range(len(tags)):
            if tags[j] == data[i][j][1]: right += 1 
            else: wrong += 1
    return float((right/(right+wrong)))

In [96]:
__lambda = 0.0
step_up = 0.1
lambda_values = []
ctr = 0
for i in tqdm(range(int(1.0/step_up))):
    lambda_values.append(get_tag_accuracy(__lambda, validation))
    __lambda +=step_up
lambda_values

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

  0%|          | 0/1003 [00:00<?, ?it/s]

[0.1363323124042879,
 0.8134762633996937,
 0.8504977029096478,
 0.8674961715160796,
 0.8796324655436447,
 0.8875957120980091,
 0.8958269525267993,
 0.9033690658499234,
 0.9088820826952527,
 0.9136676875957122]

In [99]:
best_lambda = np.amax(lambda_values)
best_lambda

0.9136676875957122

In [102]:
accuracy = get_tag_accuracy(best_lambda, testing)
display(accuracy)

  0%|          | 0/2006 [00:00<?, ?it/s]

0.9152820673208492