In [1]:
# binary text classification
import pandas as pd

# Read the tab-separated file; the two column names are supplied explicitly
# through `names` rather than taken from the file.
df = pd.read_csv(
    'yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
df.tail()  # preview the last 5 rows

Unnamed: 0,sentence,label
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [2]:
# split dataset
from sklearn.model_selection import train_test_split

# Features are the raw sentences, targets are the 0/1 labels.
kalimat = df['sentence'].values
y = df['label'].values

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# partition (and therefore every downstream metric) reproducible across
# kernel restarts; without it each run shuffles the data differently.
kalimat_latih, kalimat_test, y_latih, y_test = train_test_split(
    kalimat,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# implement tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the TRAINING sentences only. Fitting on the test
# sentences as well would leak held-out data into preprocessing and would mean
# the OOV token is never exercised during evaluation. Words not seen in
# training (or outside the num_words most frequent ones) are mapped to the
# oov_token 'x' when the texts are converted to sequences.
tokenizer = Tokenizer(num_words=250, oov_token='x')
tokenizer.fit_on_texts(kalimat_latih)

# Convert each sentence into a list of integer word indices.
sekuens_latih = tokenizer.texts_to_sequences(kalimat_latih)
sekuens_test = tokenizer.texts_to_sequences(kalimat_test)

# Pad/truncate every sequence to exactly 20 tokens so they can be batched.
padded_latih = pad_sequences(sekuens_latih, maxlen=20)
padded_test = pad_sequences(sekuens_test, maxlen=20)

In [4]:
# create model
import tensorflow as tf

keras_layers = tf.keras.layers

# Stack the layers one at a time: embedding -> average over the sequence
# axis -> hidden dense layer -> single sigmoid unit for the binary label.
model = tf.keras.Sequential()
# Embedding arguments:
#   250 -> number of words in the tokenizer vocabulary
#   16  -> embedding dimension
#   20  -> length of each input sequence
model.add(keras_layers.Embedding(250, 16, input_length=20))
model.add(keras_layers.GlobalAveragePooling1D())
model.add(keras_layers.Dense(24, activation='relu'))
model.add(keras_layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [5]:
# train model
# Fit on the padded training sequences for 30 epochs, evaluating on the
# held-out split after every epoch. verbose=2 logs one summary line per epoch.
model.fit(
    x=padded_latih,
    y=y_latih,
    validation_data=(padded_test, y_test),
    epochs=30,
    verbose=2,
)

Epoch 1/30
25/25 - 1s - loss: 0.6933 - accuracy: 0.4875 - val_loss: 0.6912 - val_accuracy: 0.5700 - 1s/epoch - 52ms/step
Epoch 2/30
25/25 - 0s - loss: 0.6908 - accuracy: 0.6050 - val_loss: 0.6893 - val_accuracy: 0.6500 - 74ms/epoch - 3ms/step
Epoch 3/30
25/25 - 0s - loss: 0.6874 - accuracy: 0.6750 - val_loss: 0.6854 - val_accuracy: 0.6550 - 88ms/epoch - 4ms/step
Epoch 4/30
25/25 - 0s - loss: 0.6827 - accuracy: 0.6775 - val_loss: 0.6791 - val_accuracy: 0.6950 - 100ms/epoch - 4ms/step
Epoch 5/30
25/25 - 0s - loss: 0.6732 - accuracy: 0.7050 - val_loss: 0.6699 - val_accuracy: 0.6950 - 86ms/epoch - 3ms/step
Epoch 6/30
25/25 - 0s - loss: 0.6591 - accuracy: 0.7325 - val_loss: 0.6561 - val_accuracy: 0.7050 - 74ms/epoch - 3ms/step
Epoch 7/30
25/25 - 0s - loss: 0.6398 - accuracy: 0.7575 - val_loss: 0.6373 - val_accuracy: 0.7150 - 90ms/epoch - 4ms/step
Epoch 8/30
25/25 - 0s - loss: 0.6154 - accuracy: 0.7500 - val_loss: 0.6144 - val_accuracy: 0.7650 - 85ms/epoch - 3ms/step
Epoch 9/30
25/25 - 0s - 

<keras.src.callbacks.History at 0x7f2f2e9a61d0>

In [13]:
# predict using model
teks = [
    'i was not impressed with this restaurant',
    'the food here is good',
    'this was not a waste of time',
]

# Apply the same tokenizer and padding length that were used for training.
sekuens_tebak = tokenizer.texts_to_sequences(teks)
padded_tebak = pad_sequences(sekuens_tebak, maxlen=20)

# One sigmoid probability per input text.
prediksi = model.predict(padded_tebak)

# Threshold the probabilities at 0.5 to obtain 0/1 class labels.
hasil = (prediksi > 0.5).astype(int)

# Report each text together with its predicted sentiment.
for ulasan, label in zip(teks, hasil):
    pesan = (
        f'Text: "{ulasan}" is a positive review'
        if label == 1
        else f'Text: "{ulasan}" is a negative review'
    )
    print(pesan)

print('Probability: ', prediksi)
print('Result: ', hasil)

Text: "i was not impressed with this restaurant" is a negative review
Text: "the food here is good" is a positive review
Text: "this was not a waste of time" is a negative review
Probability:  [[0.083626  ]
 [0.9519878 ]
 [0.01879993]]
Result:  [[0]
 [1]
 [0]]
