# **Eventure : Category Classifier**

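Classifies free-text user input into one of eight categories (wisata, edukasi, olahraga, hiburan, musik, bisnis, budaya, lainnya) using Natural Language Processing: Indonesian stemming with Sastrawi followed by an LSTM text classifier.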

In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
!python --version
print(tf.__version__)

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Python 3.10.12
2.15.0


In [2]:
# Load the training data from a CSV file.
# NOTE(review): the path points into /content/drive, so this presumably needs
# Google Drive to be mounted before the cell is run — confirm.
file_path = '/content/drive/MyDrive/Capstone Project/datasets/data training.csv'
df = pd.read_csv(file_path)

# The following cells read the 'text' and 'label' columns, so fail early and
# report exactly which of them is missing.
required_columns = ('text', 'label')
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise ValueError(
        f"File CSV harus memiliki kolom 'text' dan 'label' (tidak ditemukan: {missing_columns})"
    )

df.head(500)

Unnamed: 0,text,label,all_category,sum_category
0,ke tempat yang menyegarkan,wisata,wisata,19.0
1,berkeliling naik ATV,wisata,edukasi,23.0
2,mendaki gunung di alam terbuka,wisata,olahraga,22.0
3,melewati hutan yang sangat luas dan liar,wisata,hiburan,14.0
4,piknik di taman,wisata,bisnis,26.0
...,...,...,...,...
140,menonton kompetisi tari,budaya,,
141,membeli barang-barang khas daerah,budaya,,
142,budaya,budaya,,
143,mengunjungi pasar tradisional,budaya,,


In [3]:
# Build the Sastrawi stemmer once; stem_text() reuses this single instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stem_text(text):
    """Return `text` after passing it through the shared Sastrawi stemmer."""
    stemmed = stemmer.stem(text)
    return stemmed

# Replace every entry of the 'text' column with its stemmed form
df['text'] = df['text'].map(stem_text)

df['text'].head(500)

0                       ke tempat yang segar
1                          keliling naik atv
2                   daki gunung di alam buka
3      lewat hutan yang sangat luas dan liar
4                            piknik di taman
                       ...                  
140                    tonton kompetisi tari
141                  beli barang khas daerah
142                                   budaya
143                 unjung pasar tradisional
144                                     lain
Name: text, Length: 145, dtype: object

In [4]:
# Fit the tokenizer on the training text. num_words=500 caps the vocabulary used
# when converting text to sequences; words outside it map to the '<OOV>' token.
tokenizer = Tokenizer(num_words=500, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])

# Convert each text to a sequence of word indices, padded at the end ('post')
# so that all rows share the same length.
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, padding='post')

# Category name -> integer class id
label_map = {
    'wisata': 0,
    'edukasi': 1,
    'olahraga': 2,
    'hiburan': 3,
    'musik': 4,
    'bisnis': 5,
    'budaya': 6,
    'lainnya': 7
}

# Series.map() yields NaN for any label that is missing from label_map (typos,
# empty cells). Stop here with a clear message instead of letting NaN reach
# the one-hot encoding.
label_ids = df['label'].map(label_map)
unknown_mask = label_ids.isna()
if unknown_mask.any():
    unknown_labels = sorted(df.loc[unknown_mask, 'label'].astype(str).unique())
    raise ValueError(f"Label tidak dikenal pada kolom 'label': {unknown_labels}")
labels = label_ids.astype(int).values

# One-hot encode the labels. num_classes is fixed to the size of label_map so the
# width always matches the number of categories, even when the highest class id
# does not occur in the data.
labels_one_hot = to_categorical(labels, num_classes=len(label_map))

In [5]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and data shuffling start from the same state on every run.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Define the model: embedding -> LSTM -> softmax over the categories.
# The vocabulary size and the number of outputs are taken from the tokenizer and
# the label map rather than repeated as literals, so they cannot drift apart.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64, input_length=padded_sequences.shape[1]),
    LSTM(64),
    Dense(len(label_map), activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 11, 64)            32000     
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 8)                 520       
                                                                 
Total params: 65544 (256.03 KB)
Trainable params: 65544 (256.03 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [6]:
# Configure training: Adam optimiser, categorical cross-entropy against the
# one-hot labels, accuracy reported for each epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 200 epochs on the whole dataset. No validation data is passed, so
# the accuracy printed during training is training accuracy.
model.fit(
    x=padded_sequences,
    y=labels_one_hot,
    epochs=200,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x78c7008f2170>

In [7]:
# Save the trained model to the file 'model-nlp-with-stem.h5' in the working directory.
# NOTE(review): only the model is written here; the fitted tokenizer and label_map
# that predict() relies on are not saved by this cell — confirm they are exported
# elsewhere if the model is to be used outside this notebook.
model.save('model-nlp-with-stem.h5')

  saving_api.save_model(


In [8]:
def predict(text):
    """Classify one piece of text and print the predicted category.

    The input is stemmed with stem_text(), converted to a padded index sequence
    with the fitted tokenizer (same length as the training sequences) and passed
    to the trained model. The winning category and the probability of every
    category are printed.

    Args:
        text: Raw input string to classify.

    Returns:
        None. The result is written to standard output.
    """
    # Apply the same stemming that was applied to the training text
    stemmed_text = stem_text(text)

    # Tokenize and pad to the sequence length the model was built with
    new_text = [stemmed_text]
    new_seq = tokenizer.texts_to_sequences(new_text)
    new_padded = pad_sequences(new_seq, maxlen=padded_sequences.shape[1], padding='post')

    # A single input row goes in, so the first row holds its class probabilities
    prediction = model.predict(new_padded)
    probabilities = prediction[0]

    # Resolve category names through the class ids stored in label_map instead of
    # relying on the dict's insertion order happening to match those ids.
    index_to_label = {index: category for category, index in label_map.items()}
    predicted_label = index_to_label[int(probabilities.argmax())]

    # Probability of every category, listed in class-id order
    category_probabilities = {
        index_to_label[index]: probability
        for index, probability in enumerate(probabilities)
    }

    print(f'\nTeks: "{new_text[0]}" diklasifikasikan sebagai: {predicted_label}')
    print('\nProbabilitas kategori:')
    for category, probability in category_probabilities.items():
        print(f'{category}: {probability:.4f}')

In [9]:
# Try the classifier on a sample phrase
predict('kegiatan yang mengurangi kadar kalori')


Teks: "giat yang kurang kadar kalori" diklasifikasikan sebagai: olahraga

Probabilitas kategori:
wisata: 0.0007
edukasi: 0.0007
olahraga: 0.9975
hiburan: 0.0001
musik: 0.0001
bisnis: 0.0001
budaya: 0.0008
lainnya: 0.0000
