# Text Classification Notebook

In [None]:
!pip install --upgrade pip
!pip install tensorflow
!pip install pandas
!pip install numpy
!pip install PySastrawi
!pip install sklearn
!pip install keras-tuner

In [None]:
# Machine Learning related imports
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input

In [None]:
# Dataset and generic imports
import pandas as pd
import numpy as np
import sklearn
import os
import string

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [None]:
# Load dataset
filepath = os.getcwd()
datasetpath = os.path.join(filepath, "cleaned_data", "laporanencoded.csv")

# Github URL for dataset when used in Google Colab
github_url = "https://raw.githubusercontent.com/Capstone-Project-B21-CAP0113/ml-tf/main/laporan/cleaned_data/laporanencoded.csv"

# Prefer the local copy; when it is not present (e.g. a fresh Colab runtime)
# read the same CSV straight from GitHub instead of failing.
dataset_source = datasetpath if os.path.exists(datasetpath) else github_url

laporan = pd.read_csv(dataset_source, encoding="ISO-8859-1")
# Print dataset shape
print(laporan.shape)
# Print dataset head
laporan.head()


In [None]:
# Separate the report text (model input) from the label columns (targets).
# The order of `label_list` fixes the order of the target columns in `y`.
label_list = [
    "perselisihan", "infrastruktur", "pemerintah", "kesehatan",
    "teknologi", "administrasi", "fasilitas", "lingkungan",
    "ketertiban", "listrik", "bahaya", "lainnya",
    "pungli", "ilegal", "lalulintas", "bencana",
    "air", "pendidikan", "kebersihan", "sosial",
    "wisata", "sara", "pencurian", "korupsi",
    "bbm", "keuangan",
]
x = laporan["text"]
y = laporan[label_list]

# Report how many rows carry each label. This is a multi-label problem, so a
# single row may be counted under several labels.
for label in label_list:
    label_count = laporan[label].eq(1).sum()
    print("{}: {}".format(label, label_count))

In [None]:
# Preview the first five report texts
x.head(n=5)

In [None]:
# Preview the first five rows of the label columns
y.head(n=5)

In [None]:
# Remove stopwords and stem words using Sastrawi
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

stopword_factory = StopWordRemoverFactory()
stopword = stopword_factory.create_stop_word_remover()


def _remove_stopwords_and_stem(text):
    """Drop stopwords from `text`, then stem what remains."""
    return stemmer.stem(stopword.remove(text))


# Build a fresh Series from the source column instead of assigning element by
# element into a column selected from `laporan`: this avoids chained-assignment
# warnings, does not rely on the index being 0..n-1, and makes the cell
# idempotent (re-running it does not stem already-stemmed text again).
x = laporan["text"].apply(_remove_stopwords_and_stem)

x.head()

In [None]:
# Turn the cleaned text into fixed-length integer sequences.
vocab_size = 2000      # passed to the Tokenizer as `num_words`
embedding_dim = 16     # NOTE(review): not read by the cells visible in this notebook
max_length = 300       # every sequence is padded / truncated to this length
trunc_type = "post"    # over-long sequences lose tokens from the end
oov_token = "<OOV>"    # placeholder token for out-of-vocabulary words

# Fit the vocabulary on the training text.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(x)

# Encode each text as a list of word indices, then pad to a rectangular array.
# The padding side is left at the pad_sequences default.
sequences = tokenizer.texts_to_sequences(x)
padded = pad_sequences(
    sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

In [None]:
# Shuffle, batch and separate data into train and test
BUFFER_SIZE = 1000
BATCH_SIZE = 32
DATASET_SIZE = len(x)
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42

tx = tf.convert_to_tensor(padded)
ty = tf.convert_to_tensor(y)

dataset = tf.data.Dataset.from_tensor_slices((tx, ty))
# Shuffle ONCE with a fixed seed and a buffer covering the whole dataset.
# `reshuffle_each_iteration=False` keeps the element order identical on every
# pass; otherwise the take/skip boundary below would cut through a different
# permutation each epoch and examples would drift between train and test.
dataset = dataset.shuffle(
    max(BUFFER_SIZE, DATASET_SIZE),
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
).batch(BATCH_SIZE)

num_train_batches = int(TRAIN_FRACTION * len(dataset))

# Test set = every batch AFTER the training batches. (Previously `test_set`
# was overwritten with `dataset.take(...)`, i.e. the first batches, which are
# also training batches.)
test_set = dataset.skip(num_train_batches)
# Re-order only the training batches between epochs; the split itself is fixed.
train_set = dataset.take(num_train_batches).shuffle(max(num_train_batches, 1))

print(len(train_set))
print(len(test_set))

In [None]:
# Baseline classifier: embedding -> two stacked bidirectional LSTMs -> dense
# head with one sigmoid output per label (multi-label classification).
model = tf.keras.Sequential([
    # Declaring the input lets Keras build the model immediately, so that
    # `model.summary()` below works before the first call to `fit`.
    # The sequence length is left open (None).
    tf.keras.Input(shape=(None,), dtype="int32"),
    tf.keras.layers.Embedding(vocab_size, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    # One output unit per entry of `label_list` (currently 26).
    tf.keras.layers.Dense(len(label_list), activation='sigmoid')
])

model.summary()

In [None]:
# Multi-label setup: independent sigmoid outputs trained with binary
# cross-entropy. The metric is given explicitly as BinaryAccuracy because the
# plain 'accuracy' string is resolved from the output shape and, for a
# multi-unit output, becomes categorical (argmax) accuracy, which is not
# meaningful when several labels can be active at once. The metric keeps the
# name 'accuracy' so the history keys ('accuracy', 'val_accuracy') stay the same.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Train the baseline model, evaluating on `test_set` after every epoch.
NUM_EPOCHS = 75
history = model.fit(
    train_set,
    validation_data=test_set,
    epochs=NUM_EPOCHS,
)

In [None]:
# Text cleaning function for inference
def treatinput(inp):
    """Normalise a raw report string before it is tokenized for prediction.

    Steps, in order: lowercase, delete every ASCII punctuation character,
    strip leading/trailing whitespace, remove stopwords with the module-level
    `stopword` remover, and stem with the module-level `stemmer`.

    Args:
        inp: Raw input text.

    Returns:
        The cleaned and stemmed text.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    normalised = inp.lower().translate(punctuation_table).strip()
    without_stopwords = stopword.remove(normalised)
    return stemmer.stem(without_stopwords)

In [None]:
# Sample inference: clean one report, encode it like the training data,
# and score it with the trained model.
sample_laporan = "Jalan sukamaju jaya macet sudah dari pagi, lampu merah mati dan tidak ada polisi lalu lintas"
treated_input = treatinput(sample_laporan)
print(treated_input)

sample_sequences = tokenizer.texts_to_sequences([treated_input])
pad = pad_sequences(sample_sequences, maxlen=max_length, truncating=trunc_type)
prediction = model.predict(pad)

# Pair every label name with the score predicted for it (single-row batch).
res = {label: score for label, score in zip(label_list, prediction[0])}

In [None]:
# Print the predicted score of every label (no threshold is applied)
for label, score in res.items():
    print("{} {:.5f}".format(label, score))

In [None]:
# Try a new model with Keras Tuner // CAREFUL THIS TAKES A LONG TIME
# Current keras-tuner releases expose the package as `keras_tuner`; the older
# `kerastuner` name is deprecated, so it is only used as a fallback.
try:
    import keras_tuner as kt
except ImportError:
    import kerastuner as kt

In [None]:
# Define Model
def model_builder(hp):
    """Build and compile one candidate model for the Keras Tuner search.

    The architecture mirrors the baseline model (embedding, two bidirectional
    LSTMs, two dense layers, sigmoid output) with the layer widths, dropout
    fractions and Adam learning rate drawn from `hp`.

    Args:
        hp: Hyperparameter container supplied by the tuner; `Int`, `Float`
            and `Choice` are called on it to register and sample values.

    Returns:
        A compiled `tf.keras.Sequential` model.
    """
    emb_units = hp.Int('emb_units', min_value=16, max_value=256, step=16)
    bid1_units = hp.Int('bid1_units', min_value=16, max_value=256, step=16)
    drop1_frac = hp.Float('drop1_frac', min_value=0.1, max_value=0.8, step=0.1)
    bid2_units = hp.Int('bid2_units', min_value=16, max_value=256, step=16)
    drop2_frac = hp.Float('drop2_frac', min_value=0.1, max_value=0.8, step=0.1)
    dense1_units = hp.Int('dense1_unit', min_value=16, max_value=256, step=16)
    drop3_frac = hp.Float('drop3_frac', min_value=0.1, max_value=0.8, step=0.1)
    dense2_units = hp.Int('dense2_unit', min_value=16, max_value=256, step=16)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, emb_units),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(bid1_units, return_sequences=True)),
        tf.keras.layers.Dropout(drop1_frac),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(bid2_units)),
        tf.keras.layers.Dropout(drop2_frac),
        tf.keras.layers.Dense(dense1_units, activation='relu'),
        tf.keras.layers.Dropout(drop3_frac),
        tf.keras.layers.Dense(dense2_units, activation='relu'),
        # One sigmoid unit per entry of `label_list` (currently 26).
        tf.keras.layers.Dense(len(label_list), activation='sigmoid')
    ])

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])

    # BinaryAccuracy is named 'accuracy' so the tuner objective 'val_accuracy'
    # still resolves. The plain 'accuracy' string would be resolved to
    # categorical (argmax) accuracy for this multi-unit output, which is not a
    # meaningful score for multi-label targets.
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    )

    return model

In [None]:
# Hyperband search over `model_builder`, ranking trials by validation accuracy.
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=100,
    factor=3,
    overwrite=True,
)

In [None]:
# Stop a trial once validation loss has not improved for 5 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='val_loss',
)

In [None]:
# Run the hyperparameter search (long-running), then keep the best trial's
# hyperparameters.
tuner.search(train_set, epochs=100, validation_data=test_set, callbacks=[stop_early])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
# Print the chosen values; printing the HyperParameters object itself only
# shows an opaque object repr.
print(best_hps.values)

In [None]:
# Trial 5 Complete [00h 03m 30s]
# val_accuracy: 0.2395833283662796

# Best val_accuracy So Far: 0.2534722089767456
# Total elapsed time: 00h 55m 20s

# Search: Running Trial #6

# Hyperparameter    |Value             |Best Value So Far 
# emb_units         |48                |32                
# bid1_units        |240               |112               
# drop1_frac        |0.6               |0.3               
# bid2_units        |176               |112               
# drop2_frac        |0.4               |0.4               
# dense1_unit       |176               |112               
# drop3_frac        |0.7               |0.4               
# dense2_unit       |144               |224               
# learning_rate     |1e-05             |0.01              
# tuner/epochs      |2                 |2                 
# tuner/initial_e...|0                 |0                 
# tuner/bracket     |2                 |2                 
# tuner/round       |0                 |0                 

# Epoch 1/2
# 72/72 [==============================] - 618s 8s/step - loss: 0.6918 - accuracy: 0.0573 - val_loss: 0.6889 - val_accuracy: 0.0625
# Epoch 2/2
# 72/72 [==============================] - 623s 9s/step - loss: 0.6846 - accuracy: 0.0760 - val_loss: 0.6749 - val_accuracy: 0.0521

In [None]:
# Train a model with the best hyperparameters to find the best epoch count.
model_t = tuner.hypermodel.build(best_hps)
# Fit on `train_set` only: `dataset` also contains the batches of `test_set`,
# so training on it would leak the validation data and make `best_epoch`
# meaningless.
history = model_t.fit(train_set, epochs=100, validation_data=test_set)

val_acc_per_epoch = history.history['val_accuracy']
# Epochs are 1-based, list indices are 0-based.
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

In [None]:
# Rebuild the best model from scratch and train it for the best epoch count.
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model on the training split only, so the validation metrics
# reported against `test_set` are computed on data the model has not seen.
hypermodel.fit(train_set, epochs=best_epoch, validation_data=test_set)