In [1]:
'''
magic command from IPython extension to reload modules before executing user code
https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
'''

%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from io import StringIO
import csv
import sys
from tqdm.notebook import tqdm

In [3]:
'''
collecting corpus, split by double end
'''

collection = []

with open("./data/Indonesian_Manually_Tagged_Corpus.tsv", "r") as f:
    txt_file = f.read()

for sentence in txt_file.split('\n\n'):
    temp = pd.read_csv(StringIO(sentence), delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
    collection.append(temp.to_numpy())

collection = np.array(collection, dtype=object)

In [4]:
TRAINING_PERCENTAGE = 70
VALIDATION_PERCENTAGE = 10
TESTING_PERCENTAGE = 20

In [5]:
'''
shuffling collection data, and divide the data into training set, validation set, and testing set
'''

np.random.shuffle(collection)

training_count = round(len(collection) * TRAINING_PERCENTAGE / 100)
validation_count = round((len(collection) - training_count) * VALIDATION_PERCENTAGE / (100 - TRAINING_PERCENTAGE))

training = collection[0 : training_count]
validation = collection[training_count : training_count + validation_count]
testing = collection[training_count + validation_count : ]

In [6]:
from PosTagging import PosTagging
tagger = PosTagging(training)
_,_,_ = tagger.train()

[7021/7021]	|██████████████████████████████████████████████████|
finished...

In [7]:
lambdas = tagger.validate(validation, 0.01)

  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
lambdas

[(0.0, 0.13436979455857856),
 (0.01, 0.6072420084080273),
 (0.02, 0.6878718172443881),
 (0.03, 0.7209090187990799),
 (0.04, 0.7457761561037519),
 (0.05, 0.7628301737130165),
 (0.060000000000000005, 0.7767113508368366),
 (0.07, 0.7869041008963275),
 (0.08, 0.794875862616007),
 (0.09, 0.8011422225747601),
 (0.09999999999999999, 0.8083207741730785),
 (0.10999999999999999, 0.8130800349012454),
 (0.11999999999999998, 0.8177996351233442),
 (0.12999999999999998, 0.82176568573015),
 (0.13999999999999999, 0.8256524153248196),
 (0.15, 0.8291822003648767),
 (0.16, 0.8317204727532324),
 (0.17, 0.8353295788054256),
 (0.18000000000000002, 0.8382644562544618),
 (0.19000000000000003, 0.8411200126913619),
 (0.20000000000000004, 0.8440152296343302),
 (0.21000000000000005, 0.8465535020226858),
 (0.22000000000000006, 0.8489331323867693),
 (0.23000000000000007, 0.8508764971841041),
 (0.24000000000000007, 0.8523042754025542),
 (0.25000000000000006, 0.8546839057666376),
 (0.26000000000000006, 0.8561513444911

In [9]:
best_lambdas = np.amax(lambdas)
test_accuracy = tagger.test(testing, best_lambdas)
print("Test Accuracy: {:.2%}".format(test_accuracy))

  0%|          | 0/2006 [00:00<?, ?it/s]

Test Accuracy: 91.77%
