In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk

import re

In [2]:
df = pd.read_csv("Dataset/dataset_fix.csv")

In [3]:
kamus_alay_1 = pd.read_csv(
    "https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv",
    usecols=["slang", "formal"])

kamus_alay_2 = pd.read_csv(
    "https://raw.githubusercontent.com/haryoa/indo-collex/main/dict/inforformal-formal-Indonesian-dictionary.tsv",
    sep="\t"
)
kamus_alay_2.columns=["slang", "formal"]

# generate kamus alay
dict_alay = dict()
for index, row in kamus_alay_1.iterrows():
    dict_alay[row['slang']] = row['formal']
for index, row in kamus_alay_2.iterrows():
    dict_alay[row['slang']] = row['formal']
    
# price normalization regex
prefix_harga = '(idr|rp)'
suffix_harga = '(k|rb|rupiah|ribu|rban|an)'
re_pattern = [
    f'{prefix_harga}[\d]+{suffix_harga}', # prefix + suffix
    f'{prefix_harga}[\d]+', # prefix
    f'[\d]+{suffix_harga}', # suffix
    '\d\d\d\d[\d]+' # digit minimal 5 / diatas 10,000
]

factory = StemmerFactory()
stemmer = factory.create_stemmer()
    
def pre_process_word(word, kamus_alay=dict_alay, regex_patterns=re_pattern, process_alay=False):
    # BASIC PREPROCESSING 
    word = word.lower()                         # case folding
    word = re.sub('[^\w\s]', '', word)         # remove punctuation
    word = re.sub(r'[^\x00-\x7F]+', '', word)  # remove emoji
    word = re.sub('[\s]+', '', word)           # remove extra whitespace
        
    # remove duplicate consecutive char (max=2)
#     word = re.sub(r'(.)(?=\1\1)', '', word)     
    
    # ubah kata berulang (minimal 3huruf diulang) cth : kemanamana -> kemana2
#     re_search = re.search(r'(\w+)(?=\1)', word)
#     if (re_search):
#         start_idx = re_search.end()
#         end_idx = re_search.end() + len(re_search.group())
#         if (end_idx - start_idx) >= 3:
#             word = word[:start_idx] + '2' + word[end_idx:]
        
    #  bahasa alay
    if not process_alay:
        if word in kamus_alay.keys():
            word = kamus_alay[word]
        
    # stemming
    
    word = stemmer.stem(word)
        
    # normalisasi harga
#     for pattern in regex_patterns:
#         word = re.sub(pattern, '100k', str(word))
        
    return word

# apply preprocess to alay dict
processed_kamus_alay = dict()
for key in dict_alay.keys():
    new_key = pre_process_word(key, process_alay=True)
    processed_kamus_alay[new_key] = dict_alay[key]

In [4]:
dic={}
for i, tag in enumerate(df.Tag.unique()):
    dic[tag] = i
dic

{'O': 0,
 'B-FOOD': 1,
 'I-FOOD': 2,
 'B-MISCELLANEOUS': 3,
 'I-MISCELLANEOUS': 4,
 'B-SERVICE': 5,
 'I-SERVICE': 6,
 'B-AMBIENCE': 7,
 'I-AMBIENCE': 8,
 'B-PRICE': 9,
 'I-PRICE': 10}

In [5]:
labels = df["Tag"].apply(lambda x:dic[x])

In [6]:
list_kalimat = []
for i in range(df["Kalimat #"].min(),df["Kalimat #"].max()+1):
    list_kata = ["<S>"]
    for kata in df[df["Kalimat #"] == i]["Word"]:
        list_kata.append(pre_process_word(str(kata), kamus_alay=processed_kamus_alay))
    list_kata.append("</S>")
    list_kalimat.append(list_kata)

In [7]:
list_kalimat_join = []
for i,kalimat in enumerate(list_kalimat):
    list_kalimat_join.append(" ".join(kalimat))

In [8]:
list_kalimat_join[808]

'<S> baru ke sini lagi telah beberapa lama absen kali ini saya coba matcha cheesecakenya dengan harga 315k nett paling mahal banding ukur slice di lain matchanya masuk yang manis bukan pahit begitu tapi lebih enak kok porsi lihat standard tapi nyata ngenyangin juga telah makan varian rasa ada chocolate caramel strawberry blueberry raspberry red velvet almond dan lain lupa hahaha cocok buat yang lagi ngidem cheesecake tapi tidak ken beli loyang </S>'

In [9]:
raw_review = list_kalimat_join[808]

In [10]:
trigram = []
for kalimat in (list_kalimat):
    for i in range(len(kalimat)):
        if i > 0 and i < len(kalimat)-1:
            trigram.append([kalimat[i-1], kalimat[i], kalimat[i+1]])

In [11]:
NUM_WORDS=10000
tokenizer = Tokenizer(num_words=10000,filters='!"#$%&()*+,-.:;=?@[\\]^_`{|}~\t\n\'',
                      lower=True)
tokenizer.fit_on_texts(trigram)
sequences_train = tokenizer.texts_to_sequences(trigram)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 5918 unique tokens.


In [12]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
toEncode = labels.values.reshape(-1, 1)
enc = enc.fit(toEncode)
Encoded = enc.transform(toEncode).toarray()

In [13]:
df_trigram = pd.DataFrame(columns=["Trigram", "Label", "Trigram Encoded", "Label Encoded"], data=zip(trigram, df["Tag"].values, sequences_train, Encoded))

In [14]:
df_trigram.head(3)

Unnamed: 0,Trigram,Label,Trigram Encoded,Label Encoded
0,"[<S>, di tiap, oleh sebab itu]",O,"[19, 561, 30]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[di tiap, oleh sebab itu, ke sini]",O,"[561, 30, 64]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[oleh sebab itu, ke sini, tidak]",O,"[30, 64, 6]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    df_trigram["Trigram Encoded"].values, 
    df_trigram["Label Encoded"].values, 
    test_size=0.2, 
    random_state=50, 
)

In [16]:
X_train = np.array([np.array(x).astype('float32') for x in X_train])
y_train = np.array([np.array(x).astype('float32') for x in y_train])

X_test = np.array([np.array(x).astype('float32') for x in X_test])
y_test = np.array([np.array(x).astype('float32') for x in y_test])

In [17]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

general_200 = Word2Vec.load("embedding/idwiki_word2vec_200_new_lower.model")
word_vectors_general = general_200.wv

known_words = 0
unknown_words = 0

GENERAL_EMBEDDING_DIM = 200
vocabulary_size = min(len(word_index)+1,NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, GENERAL_EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors_general[word]
        embedding_matrix[i] = embedding_vector
        known_words += 1
    except KeyError:
        embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),GENERAL_EMBEDDING_DIM)
        unknown_words += 1

del(word_vectors_general)

from tensorflow.keras.layers import Embedding
general_embedding_layer = Embedding(vocabulary_size,
                            GENERAL_EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True
                        )

In [18]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from tensorflow.keras.layers import Embedding
from gensim.models.keyedvectors import KeyedVectors

# pergikuliner_word2vec_100.model
# domain_embedding/domain_specific_skip_word2vec_100_10_5.model
# domain_embedding/domain_specific_skip_word2vec_100_15_10_basic_alay.model
# domain_embedding/domain_specific_skip_word2vec_100_15_10_basic_alay_normalize.model
# domain_embedding/domain_specific_skip_word2vec_100_15_10_basic_alay_stem.model
domain_100 = Word2Vec.load("domain_embedding/domain_specific_skip_word2vec_100_15_10_basic_alay_stem.model")
word_vectors_domain = domain_100.wv

known_words = []
unknown_words = []

DOMAIN_EMBEDDING_DIM = 100
vocabulary_size = min(len(word_index)+1,NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, DOMAIN_EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors_domain[word]
        embedding_matrix[i] = embedding_vector
        known_words.append(word)
    except KeyError:
        embedding_matrix[i] = np.random.normal(0,np.sqrt(0.25),DOMAIN_EMBEDDING_DIM)
        unknown_words.append(word)

domain_embedding_layer = Embedding(vocabulary_size,
                            DOMAIN_EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [19]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import SpatialDropout1D, Input, Dense, GRU, Embedding, Dropout, LSTM, concatenate,Bidirectional, AveragePooling1D, Reshape, Flatten
# from tensorflow.keras.layers.core import Reshape, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
sequence_length = X_train.shape[1]
num_filters_blstm = 128
num_filters_lstm = 128

inputs = Input(shape=(sequence_length,))

embedding = general_embedding_layer(inputs)
# embedding = domain_embedding_layer(inputs)

bilstm = Bidirectional(LSTM(num_filters_blstm, 
                           dropout=0.5, recurrent_dropout=0.3, 
                        return_sequences=True))(embedding)
lstm = LSTM(num_filters_lstm, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(embedding)

AvPool = AveragePooling1D(pool_size=(2), strides=None)(bilstm)
flatten = Flatten()(AvPool)

output = Dense(units=11, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(flatten)

model = Model(inputs, output)

adam = Adam(learning_rate=1e-3)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])
callbacks = [EarlyStopping(monitor='val_loss', patience=3, verbose=1,
                          restore_best_weights=True
                          )]
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 3)]               0         
                                                                 
 embedding (Embedding)       (None, 3, 200)            1183800   
                                                                 
 bidirectional (Bidirectiona  (None, 3, 256)           336896    
 l)                                                              
                                                                 
 average_pooling1d (AverageP  (None, 1, 256)           0         
 ooling1D)                                                       
                                                                 
 flatten (Flatten)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 11)                2827  

In [20]:
model.fit(X_train, y_train, 
          batch_size=128, 
          epochs=100, 
          verbose=1, 
          validation_data=(X_test, y_test),
          callbacks=callbacks
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 00024: early stopping


<keras.callbacks.History at 0x7fcb6b7e99a0>

In [21]:
y_pred = model.predict(X_test)

In [22]:
y_test_argmax = [np.argmax(i) for i in y_test]
y_pred_argmax = [np.argmax(i) for i in y_pred]

In [23]:
from sklearn.metrics import classification_report, make_scorer, f1_score

print(classification_report(
    y_test_argmax, y_pred_argmax,
    labels=[1,2,3,4,5,6,7,8,9,10],
    digits=3))
print("f1 score:", f1_score(y_test_argmax, y_pred_argmax, average='micro'))

              precision    recall  f1-score   support

           1      0.571     0.361     0.442       413
           2      0.656     0.587     0.619      1349
           3      0.324     0.101     0.154       109
           4      0.500     0.276     0.356       398
           5      0.676     0.373     0.481        67
           6      0.673     0.504     0.576       282
           7      0.750     0.411     0.531        73
           8      0.619     0.551     0.583       236
           9      0.929     0.289     0.441        45
          10      0.670     0.520     0.586       125

   micro avg      0.629     0.474     0.540      3097
   macro avg      0.637     0.397     0.477      3097
weighted avg      0.619     0.474     0.530      3097

f1 score: 0.7891768895924021
