# Text Classification Notebook

In [1]:
!pip install --upgrade pip
!pip install tensorflow
!pip install pandas
!pip install numpy

Defaulting to user installation because normal site-packages is not writeable
^C


In [91]:
# Machine Learning related imports
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input

In [92]:
# Dataset and generic imports
import pandas as pd
import numpy as np
import os

In [93]:
# Load dataset
# The local copy under ./cleaned_data is preferred. When that file is missing
# (e.g. the notebook is run outside the repository checkout), the copy hosted at
# github_url is read instead of failing with FileNotFoundError.
filepath = os.getcwd()
datasetpath =  os.path.join(filepath, "cleaned_data", "laporanencoded.csv")

github_url = "https://raw.githubusercontent.com/Capstone-Project-B21-CAP0113/ml-tf/main/laporan/cleaned_data/laporanencoded.csv"

# pd.read_csv accepts either a filesystem path or a URL.
dataset_source = datasetpath if os.path.isfile(datasetpath) else github_url
laporan = pd.read_csv(dataset_source, encoding="ISO-8859-1")
# Print dataset shape
print(laporan.shape)
# Print dataset head
laporan.head()


(2852, 27)


Unnamed: 0,text,perselisihan,infrastruktur,pemerintah,kesehatan,teknologi,administrasi,fasilitas,lingkungan,ketertiban,...,air,pendidikan,kebersihan,sosial,wisata,sara,pencurian,korupsi,bbm,keuangan
0,mohon bantuannya untuk menormalkan sistem di l...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ass pak gubsaya perangkat desa karangsari kec ...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,selamat sore bapak gubernur atau yang mewakili...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,assalamualaikum pak gub saya pengurus paguyuba...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,lapor saya okada arle sandi email okadaarlegm...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Split text and labels
# Names of the label columns. The order matters: y's columns follow it, and the
# inference cell zips this list against the model's output vector.
label_list = [
    "perselisihan", "infrastruktur", "pemerintah", "kesehatan",
    "teknologi", "administrasi", "fasilitas", "lingkungan",
    "ketertiban", "listrik", "bahaya", "lainnya",
    "pungli", "ilegal", "lalulintas", "bencana",
    "air", "pendidikan", "kebersihan", "sosial",
    "wisata", "sara", "pencurian", "korupsi",
    "bbm", "keuangan",
]

# Features: the report text column. Targets: one column per label.
x = laporan.loc[:, "text"]
y = laporan.loc[:, label_list]

In [95]:
# Preview the first five report texts
x.head(n=5)

0    mohon bantuannya untuk menormalkan sistem di l...
1    ass pak gubsaya perangkat desa karangsari kec ...
2    selamat sore bapak gubernur atau yang mewakili...
3    assalamualaikum pak gub saya pengurus paguyuba...
4    lapor saya okada arle sandi email  okadaarlegm...
Name: text, dtype: object

In [96]:
# Preview the first five rows of the label columns
y.head(n=5)

Unnamed: 0,perselisihan,infrastruktur,pemerintah,kesehatan,teknologi,administrasi,fasilitas,lingkungan,ketertiban,listrik,...,air,pendidikan,kebersihan,sosial,wisata,sara,pencurian,korupsi,bbm,keuangan
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# Tokenize and pad text
# Text-vectorisation settings shared with the inference cell further down.
vocab_size = 2000
embedding_dim = 16  # NOTE(review): not referenced by the visible model cell, which uses 64
max_length = 300
trunc_type = "post"
oov_token = "<OOV>"

# Build the vocabulary from every report in x.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(x)

# Turn each report into a list of integer token ids ...
sequences = tokenizer.texts_to_sequences(x)

# ... and bring every list to exactly max_length entries.
padded = pad_sequences(
    sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

In [98]:
# Shuffle, batch and separate data into train and test
BATCH_SIZE = 32
DATASET_SIZE = len(x)
# A shuffle buffer that holds the whole dataset gives a uniform shuffle.
BUFFER_SIZE = DATASET_SIZE
# Fixed seed so the split is reproducible across kernel restarts.
SHUFFLE_SEED = 42

dataset = tf.data.Dataset.from_tensor_slices((padded, y))
# reshuffle_each_iteration=False freezes the shuffled order. With the default
# (True) the data is reshuffled on every pass, so take()/skip() would select
# different samples each epoch and test samples would leak into training.
dataset = dataset.shuffle(
    BUFFER_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=False
).batch(BATCH_SIZE)

# First 80% of the batches train the model; the remaining batches are held out.
# Using the same cut point for take() and skip() keeps the two sets disjoint.
num_train_batches = int(0.8 * len(dataset))
train_set = dataset.take(num_train_batches)
test_set = dataset.skip(num_train_batches)

print(len(train_set))
print(len(test_set))

72
18


In [99]:
# Text classifier: token embedding -> bidirectional LSTM -> three ReLU dense
# layers -> one sigmoid unit per label.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    # Output width is derived from label_list so it cannot drift out of sync
    # with the label columns the model is trained against.
    tf.keras.layers.Dense(len(label_list), activation='sigmoid')
])

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 64)          128000    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_26 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_27 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_28 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_29 (Dense)             (None, 26)                858       
Total params: 221,754
Trainable params: 221,754
Non-trainable params: 0
________________________________________________

In [100]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [101]:
# Train for a fixed number of epochs, evaluating on test_set after each one.
NUM_EPOCHS = 30
history = model.fit(
    train_set,
    validation_data=test_set,
    epochs=NUM_EPOCHS,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [122]:
# Sample inference
sample_laporan = 'jalan kolibri sudah rusak 5 bulan pak tolong dibenahi'

# Vectorise the sample with the same tokenizer and padding settings used for training.
sample_sequences = tokenizer.texts_to_sequences([sample_laporan])
pad = pad_sequences(sample_sequences, maxlen=max_length, truncating=trunc_type)
prediction = model.predict(pad)

# Pair every label name with the score at the same position of the first (only) prediction row.
res = {label: score for label, score in zip(label_list, prediction[0])}

<class 'numpy.float32'>


In [125]:
# Print one "<label> <score>" line per label, score shown with five decimals.
for label, score in res.items():
    print("{} {:.5f}".format(label, score))

perselisihan 0.00000
infrastruktur 0.77492
pemerintah 0.04302
kesehatan 0.00006
teknologi 0.03233
administrasi 0.00027
fasilitas 0.01265
lingkungan 0.00683
ketertiban 0.00007
listrik 0.00000
bahaya 0.00357
lainnya 0.00290
pungli 0.00003
ilegal 0.00000
lalulintas 0.00002
bencana 0.00027
air 0.00000
pendidikan 0.00787
kebersihan 0.00100
sosial 0.00000
wisata 0.00008
sara 0.00000
pencurian 0.00002
korupsi 0.00018
bbm 0.00000
keuangan 0.00606
