### Latihan Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Small toy corpus used to demonstrate how the Keras Tokenizer works.
teks = [
    'Saya suka programming',
    'Programming sangat menyenangkan!',
    'Machine Learning berbeda dengan pemrograman konvensional',
]

# Keep at most the 100 most frequent words; anything unknown maps to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(teks)

# Integer-encode the same sentences the vocabulary was built from.
sequences = tokenizer.texts_to_sequences(teks)

# Show the learned word -> index mapping.
print(tokenizer.word_index)

{'<OOV>': 1, 'programming': 2, 'saya': 3, 'suka': 4, 'sangat': 5, 'menyenangkan': 6, 'machine': 7, 'learning': 8, 'berbeda': 9, 'dengan': 10, 'pemrograman': 11, 'konvensional': 12}


In [None]:
# Encode two unseen sentences: the first only uses known words, the second
# contains words missing from the vocabulary, which are replaced by the
# '<OOV>' index.
for kalimat in ('Saya suka programming!',
                'Saya suka belajar programing sejak SMP'):
    print(tokenizer.texts_to_sequences([kalimat]))

[[3, 4, 2]]
[[3, 4, 1, 1, 1, 1]]


In [None]:
# Bring every sequence to exactly 5 tokens: shorter sequences are padded with
# zeros at the end, longer ones are cut at the end.
# (The earlier default-argument pad_sequences call was removed: its result was
# overwritten immediately and never used.)
sequences_samapanjang = pad_sequences(sequences,
                                      padding='post',
                                      maxlen=5,
                                      truncating='post')
print(sequences_samapanjang)

[[ 3  4  2  0  0]
 [ 2  5  6  0  0]
 [ 7  8  9 10 11]]


### Multiclass Text Classification

In [None]:
!pip install kaggle



In [None]:
# Copy the Kaggle API token (kaggle.json, expected in the working directory)
# into ~/.kaggle and restrict it to owner read/write only.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the Indonesian movie synopsis dataset archive into the working directory.
!kaggle datasets download -d antoniuscs/imdb-synopsis-indonesian-movies

Downloading imdb-synopsis-indonesian-movies.zip to /content
  0% 0.00/299k [00:00<?, ?B/s]
100% 299k/299k [00:00<00:00, 101MB/s]


In [None]:
!unzip /content/imdb-synopsis-indonesian-movies.zip

Archive:  /content/imdb-synopsis-indonesian-movies.zip
  inflating: imdb_indonesian_movies_2.csv  


In [None]:
import pandas as pd

# Load the extracted CSV and discard the movie title column in one chain;
# only the remaining columns are used downstream.
df = (
    pd.read_csv('imdb_indonesian_movies_2.csv')
      .drop(columns=['judul_film'])
)
df.head()

Unnamed: 0,ringkasan_sinopsis,genre
0,Raden Mas Said putra sulung Tumenggung Wilarik...,Drama
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,Drama
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,Drama
3,POL menceritakan kisah hidup yang luar biasa d...,Drama
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,Drama


In [None]:
# One-hot encode the genre column. The dtype is pinned to int so the label
# columns are numeric 0/1 on every pandas version (pandas >= 2.0 would
# otherwise produce boolean columns by default).
category = pd.get_dummies(df.genre, dtype=int)

# Append the one-hot columns and drop the original categorical column.
df_baru = pd.concat([df, category], axis=1)
df_baru = df_baru.drop(columns='genre')
df_baru

Unnamed: 0,ringkasan_sinopsis,Drama,Horor,Komedi,Laga,Romantis
0,Raden Mas Said putra sulung Tumenggung Wilarik...,1,0,0,0,0
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,1,0,0,0,0
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,1,0,0,0,0
3,POL menceritakan kisah hidup yang luar biasa d...,1,0,0,0,0
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,1,0,0,0,0
...,...,...,...,...,...,...
1000,Winter in Tokyo berpusat pada kehidupan Ishida...,0,0,0,0,1
1001,Markonah melarikan diri ke Jakarta karena akan...,0,0,0,0,1
1002,"Tempat aking lebih dari 36 jam, Last Night ada...",0,0,0,0,1
1003,Proyek baru ini adalah tentang seorang lelaki ...,0,0,0,0,1


In [None]:
# Names of the one-hot genre columns that together form the label vector.
kolom_genre = ['Drama', 'Horor', 'Komedi', 'Laga', 'Romantis']

# Model input: the synopsis text; target: the one-hot genre matrix.
label = df_baru[kolom_genre].values
sinopsis = df_baru['ringkasan_sinopsis'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. A fixed random_state makes the
# split (and therefore every downstream result) reproducible across runs.
sinopsis_latih, sinopsis_test, label_latih, label_test = train_test_split(
    sinopsis, label, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the TRAINING split only. Fitting on the test split
# as well would leak held-out vocabulary into the model's input encoding;
# unseen test words are mapped to the OOV token instead.
tokenizer = Tokenizer(num_words=5000, oov_token='x')
tokenizer.fit_on_texts(sinopsis_latih)

# Convert both splits to integer sequences using the training vocabulary.
sekuens_latih = tokenizer.texts_to_sequences(sinopsis_latih)
sekuens_test = tokenizer.texts_to_sequences(sinopsis_test)

# Pad both splits to the same length (the longest training sequence) so the
# train and test arrays share one shape; longer test sequences are truncated.
panjang_maks = max(len(sekuens) for sekuens in sekuens_latih)
padded_latih = pad_sequences(sekuens_latih, maxlen=panjang_maks)
padded_test = pad_sequences(sekuens_test, maxlen=panjang_maks)

In [None]:
import tensorflow as tf

# Assemble the classifier layer by layer.
model = tf.keras.Sequential()
# Token ids (vocabulary capped at 5000) -> 16-dimensional embeddings.
model.add(tf.keras.layers.Embedding(input_dim=5000, output_dim=16))
# Recurrent layer that summarises the embedded sequence.
model.add(tf.keras.layers.LSTM(64))
# Fully connected head.
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# One softmax output per genre (5 classes).
model.add(tf.keras.layers.Dense(5, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  """Stop training once train and validation accuracy both exceed 90%."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's metrics and request early stopping if both pass.

    Args:
      epoch: Index of the epoch that just finished (unused).
      logs: Dict of metric values for the epoch; may be None or lack keys.
    """
    # Avoid a shared mutable default argument and tolerate logs=None.
    logs = logs or {}
    akurasi = logs.get('accuracy')
    akurasi_val = logs.get('val_accuracy')
    # A metric can be absent (e.g. training without validation data);
    # comparing None with a float would raise TypeError, so skip the check.
    if akurasi is None or akurasi_val is None:
      return
    if akurasi > 0.9 and akurasi_val > 0.9:
      print("\nAkurasi train dan validasi didapat telah mencapai nilai > 90%!")
      self.model.stop_training = True
callbacks = myCallback()

In [None]:
# Upper bound on the number of epochs; the callback may stop training earlier.
num_epochs = 30

# Train on the padded training sequences and evaluate on the held-out split
# after every epoch, printing one summary line per epoch.
history = model.fit(
    x=padded_latih,
    y=label_latih,
    validation_data=(padded_test, label_test),
    epochs=num_epochs,
    callbacks=[callbacks],
    verbose=2,
)

Epoch 1/30
26/26 - 16s - loss: 1.6106 - accuracy: 0.1866 - val_loss: 1.6100 - val_accuracy: 0.1741 - 16s/epoch - 634ms/step
Epoch 2/30
26/26 - 13s - loss: 1.6077 - accuracy: 0.2177 - val_loss: 1.6105 - val_accuracy: 0.1692 - 13s/epoch - 508ms/step
Epoch 3/30
26/26 - 13s - loss: 1.5860 - accuracy: 0.3221 - val_loss: 1.6042 - val_accuracy: 0.2040 - 13s/epoch - 492ms/step
Epoch 4/30
26/26 - 13s - loss: 1.4811 - accuracy: 0.4440 - val_loss: 1.6052 - val_accuracy: 0.2488 - 13s/epoch - 497ms/step
Epoch 5/30
26/26 - 13s - loss: 1.0909 - accuracy: 0.5361 - val_loss: 1.9131 - val_accuracy: 0.2786 - 13s/epoch - 500ms/step
Epoch 6/30
26/26 - 13s - loss: 0.7971 - accuracy: 0.6269 - val_loss: 2.1827 - val_accuracy: 0.2935 - 13s/epoch - 489ms/step
Epoch 7/30
26/26 - 12s - loss: 0.5975 - accuracy: 0.7239 - val_loss: 2.1225 - val_accuracy: 0.2637 - 12s/epoch - 448ms/step
Epoch 8/30
26/26 - 11s - loss: 0.4727 - accuracy: 0.8371 - val_loss: 2.6719 - val_accuracy: 0.2935 - 11s/epoch - 434ms/step
Epoch 9/