# **Tambah libraries dan setup config untuk menjalankan Google Colab**

In [0]:
# Install the packages needed for google-drive-ocamlfuse, the FUSE driver used
# to expose Google Drive as a local folder.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE: `2>&1 > /dev/null` discards stdout only; stderr is still shown in the cell output.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate this Colab session and fetch the application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run the driver headless with no stdin and show only the line(s)
# containing "URL" (the authorization link to open in a browser).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code obtained from that URL (input is not echoed).
vcode = getpass.getpass()
# Second pass: pipe the verification code into the driver to finish the authorization.
# NOTE(review): the client secret and the code are interpolated into shell command
# lines here — confirm this is acceptable for the environment this runs in.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# gensim is imported later in the notebook to load a Word2Vec model.
!pip install gensim

# **Mount Google Cloud ke Google Drive**

In [0]:
# Create the mount point (no error if it already exists), then mount Google Drive on it.
!mkdir -p drive
!google-drive-ocamlfuse drive

# **Import frameworks dan libraries yang diperlukan**

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import resample
from keras.layers import SimpleRNN
from keras.models import load_model

import pickle
import json
import re
import pandas as pd
import numpy as np
import tensorflow as tf

import gensim
import seaborn as sns
import matplotlib.pyplot as plt

# Report which GPU (if any) TensorFlow can see. The bare call used previously
# was not the last expression of the cell, so its result was never displayed.
# An empty name means TensorFlow found no GPU.
print("GPU device:", tf.test.gpu_device_name() or "none found")

import os

# All data/model paths used below are relative to the project folder on the
# mounted Drive. Guard the chdir so that re-running this cell does not fail by
# looking for "drive/app" inside "drive/app".
APP_DIR = os.path.join("drive", "app")
if not os.getcwd().endswith(APP_DIR):
    os.chdir(APP_DIR)

Using TensorFlow backend.


# **Load dataset, lakukan preprocessing, dan sinkronisasi jumlah data.**

Contoh dataset seperti gambar di bawah:

![alt text](https://preview.ibb.co/f8zC59/Screenshot_from_2018_08_25_06_30_15.png)

In [4]:
# Load the stopword list (one word per line). The context manager closes the
# file handle, which the bare open() call never did. A set is used because the
# list is only ever consulted with `in` (once per token in clean_chars), and set
# membership is O(1) instead of a linear scan.
with open('data/stopwords.txt', 'r') as stopword_file:
    stopwords = set(stopword_file.read().splitlines())

def clean_chars(sent, stop_words=None):
    """Normalize a raw text into a space-separated string of content tokens.

    Steps: lowercase the text, replace URLs (``http...``) with a space, replace
    every character other than ASCII letters, digits, ``#`` and ``@`` with a
    space, then drop tokens that are shorter than 3 characters, start with
    ``#`` or ``@`` (hashtags / mentions), or are stopwords.

    Args:
        sent: Raw input text.
        stop_words: Optional collection of words to discard. Defaults to the
            module-level ``stopwords`` so existing one-argument calls behave
            exactly as before; passing it explicitly lets the function be
            reused (and tested) without that global.

    Returns:
        The cleaned text with tokens joined by single spaces; an empty string
        if no token survives the filters.
    """
    if stop_words is None:
        stop_words = stopwords
    without_urls = re.sub(r'http\S+', ' ', sent.lower())
    normalized = re.sub(r'[^a-zA-Z0-9#@]', ' ', without_urls)
    kept = [
        token for token in normalized.split()
        if len(token) > 2
        and not token.startswith(('#', '@'))
        and token not in stop_words
    ]
    return ' '.join(kept)

def sync_data(pos, neu, neg):
    """Downsample the three class lists to the size of the smallest one.

    Every list is subsampled without replacement with the same fixed seed, so
    the result is reproducible from run to run.

    Args:
        pos: Samples of the positive class.
        neu: Samples of the neutral class.
        neg: Samples of the negative class.

    Returns:
        Tuple ``(pos, neu, neg)`` of resampled collections, all of equal length.
    """
    # Previously the sizes were kept in a local named `dict` (shadowing the
    # builtin) only to look up the minimum; min() over the lengths is enough.
    target_size = min(len(pos), len(neu), len(neg))

    pos, neu, neg = (
        resample(samples, replace=False, n_samples=target_size, random_state=123)
        for samples in (pos, neu, neg)
    )

    return pos, neu, neg

# Per-class buckets for the cleaned texts and for their sentiment labels.
pos, neg, neu = [], [], []
pos_label, neg_label, neu_label = [], [], []

# Sentiment value -> (text bucket, label bucket). Records carrying any other
# sentiment value are skipped.
buckets = {
    'positive': (pos, pos_label),
    'negative': (neg, neg_label),
    'neutral': (neu, neu_label),
}

with open('data/news.json') as f:
    data = json.load(f)

for d in data:
    if d['sentiment'] in ('positive', 'negative', 'neutral'):
        texts_bucket, labels_bucket = buckets[d['sentiment']]
        content = clean_chars(d['texts'])
        labels_bucket.append(d['sentiment'])
        texts_bucket.append(content)

# Balance the classes: every class is cut down to the size of the smallest one.
pos, neu, neg = sync_data(pos, neu, neg)

print("Pos: %s, Neu: %s, Neg: %s" % (len(pos), len(neu), len(neg)))

statuses = pos + neu + neg

print("Total data: %s" % len(statuses))

# The label lists go through the same downsampling. All labels inside one
# class are identical, so the concatenation below lines up with `statuses`.
pos_label, neu_label, neg_label = sync_data(pos_label, neu_label, neg_label)

labels = pos_label + neu_label + neg_label

Pos: 22826, Neu: 22826, Neg: 22826
Total data: 68478


# **Ubah data menjadi embedding**

In [0]:
# Vocabulary cap passed to the tokenizer: only the most frequent words are kept
# when texts are converted to index sequences.
max_features = 3000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(statuses)

# Turn every text into a sequence of word indices, then pad the sequences to a
# common length (no maxlen is given, so the longest sequence sets the length).
X = tokenizer.texts_to_sequences(statuses)
X = pad_sequences(X)

# One-hot encode the sentiment labels: one column per class.
# NOTE(review): the prediction cells assume the column order is
# negative, neutral, positive (sorted label values) — confirm if labels change.
Y = pd.get_dummies(labels).values

# **Split data menjadi data training, testing, valid**



In [0]:
# Hold out 20% of the data as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Carve the validation set out of the remaining training data (20% of it),
# which gives roughly 64% train / 16% validation / 20% test overall.
# NOTE: this line overwrites X_train/y_train, so re-running only this line
# would split the already reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

# **Create Model**

In [0]:
max_features = 3000
embed_dim = 128
units = 196

# Embedding -> recurrent layer -> 3-way softmax (one output unit per sentiment class).
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(LSTM(units, dropout=0.2))
# model.add(SimpleRNN(units, dropout=0.2))
model.add(Dense(3,activation='softmax'))
# The targets are one-hot vectors over three mutually exclusive classes, so the
# loss that matches the softmax output is categorical cross-entropy.
# 'binary_crossentropy' treats every output unit as an independent yes/no
# problem, which optimises the wrong objective and can inflate the reported accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


batch_size = 1000
# Monitor the validation split during training and keep the test split unseen
# until the final evaluation (the two sets were swapped before).
history = model.fit(X_train, y_train, epochs=5, batch_size=batch_size, validation_data=(X_val, y_val), verbose=1)

score, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print("score: ", score)
print("accuracy: ", accuracy)

# **Visualisasi**

In [0]:
# df = pd.DataFrame({'epochs':history.epoch, 'accuracy': history.history['acc'], 'validation_accuracy': history.history['val_acc']})
# g = sns.pointplot(x="epochs", y="accuracy", data=df, fit_reg=False, color='green')
# g = sns.pointplot(x="epochs", y="validation_accuracy", data=df, fit_reg=False, color='red')

[rnn](https://image.ibb.co/cK9mVz/rnn.png)
[rnn-w2v](https://image.ibb.co/bTDDAz/rnn_w2v.png)

# **Save model**

In [0]:
# Save under the name that matches the architecture built above (LSTM is the
# active layer) and that the prediction cell loads. Saving to 'model-rnn.h5'
# left load_model('model/model-lstm.h5') reading a missing or stale file.
model.save('model/model-lstm.h5')
# model.save('model/model-rnn.h5')  # use this name when the SimpleRNN layer is enabled
print("Model has created!")

Model has created!


# **Predict data**

In [15]:
# Sample inputs, each tagged with its expected sentiment. Exactly one
# `input_text` assignment must be active: with all of them commented out this
# cell raised NameError on a fresh kernel. The active sample is the one whose
# output is recorded under this cell.

#Neg
# input_text = """Meski begitu, yang menjadi catatan Nirwono yakni reformasi birokrasi 
#                 itu belum diiringi dengan penyerapan anggaran maksimal, 
#                 hanya berkisar antara 45-65 persen selama lima tahun terakhir. 
#                 Penyerapan anggaran pun lebih banyak dihabiskan untuk operasional 
#                 kantor dan modal badan usaha milik daerah (BUMD)."""

#Pos
# input_text = """Di sektor transportasi, lanjut Nirwono, Jokowi berhasil melakukan 
#                 groundbreaking pembangunan mass rapid transit (MRT). Jokowi bisa 
#                 mengeksekusi pola makro transportasi terpadu yang sudah disiapkan 
#                 sejak zaman mantan Gubernur Sutiyoso."""

#Neg
input_text = """“Ketiga, Riza menyebut permasalahan hukum dan demokrasi yang perlu 
                 diperbaiki. Ia lalu membandingkan dengan era pemerintahan Susilo 
                 Bambang Yudyhono yang dinilainya lebih baik. “Sampai hari ini 
                 banyak masalah hukum. Kedua masalah demokrasi.  Di zaman ini 
                 reformasi dan dekmokrasi terasa tertinggal. Zaman Pak SBY kita 
                 mengapresiasi kami mengapresiasi, hukum yang baik,” sebutnya. """

#Neg
# input_text = """Pemerintahan Jokowi gagal memajukan Indonesia"""

# Apply the same cleaning, tokenization and padding length as the training data.
text = [clean_chars(input_text)]
predicted = tokenizer.texts_to_sequences(text)
guess = pad_sequences(predicted, maxlen=X.shape[1])

model = load_model('model/model-lstm.h5')
# model = load_model('model/model-rnn.h5')
prediction = model.predict(guess)
# Index of the most probable class for the single input.
polarity = np.argmax(prediction[0])

# Must follow the column order of the one-hot labels built with pd.get_dummies
# (sorted label values: negative, neutral, positive).
sentiment = ['negative', 'neutral', 'positive']

print("Text: ",input_text)
print("Sentiment: ",sentiment[polarity])

Text:  “Ketiga, Riza menyebut permasalahan hukum dan demokrasi yang perlu 
                 diperbaiki. Ia lalu membandingkan dengan era pemerintahan Susilo 
                 Bambang Yudyhono yang dinilainya lebih baik. “Sampai hari ini 
                 banyak masalah hukum. Kedua masalah demokrasi.  Di zaman ini 
                 reformasi dan dekmokrasi terasa tertinggal. Zaman Pak SBY kita 
                 mengapresiasi kami mengapresiasi, hukum yang baik,” sebutnya. 
Sentiment:  negative


# **Create Model (Word2vec)**

In [0]:
max_features = 3000
embed_dim = 128
lstm_units = 196

def create_embedding_matrix(model, word_index=None, num_words=None):
    """Build the weight matrix for an Embedding layer from a Word2Vec model.

    Args:
        model: Trained gensim ``Word2Vec`` model; its ``wv`` vectors are read.
        word_index: Optional ``{word: index}`` mapping of the tokenizer that
            encodes the network input. When given, row ``i`` holds the vector
            of the word the tokenizer encodes as ``i``, which is what the
            Embedding layer needs. When omitted, rows follow the Word2Vec
            vocabulary order (the previous behaviour).
        num_words: Number of rows to allocate when ``word_index`` is given;
            words with an index at or above it are skipped. Defaults to
            ``len(word_index) + 1``.

    Returns:
        2-D array with one row per index and one column per vector dimension.
        Rows for words missing from the Word2Vec vocabulary stay all-zero.
    """
    vector_size = model.wv.vector_size

    if word_index is None:
        # Legacy layout: row i is the vector of the i-th Word2Vec vocabulary entry.
        embedding_matrix = np.zeros((len(model.wv.vocab), vector_size))
        for i in range(len(model.wv.vocab)):
            embedding_matrix[i] = model.wv[model.wv.index2word[i]]
        return embedding_matrix

    if num_words is None:
        num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, vector_size))
    for word, i in word_index.items():
        if i < num_words and word in model.wv:
            embedding_matrix[i] = model.wv[word]
    return embedding_matrix

# Keep the Word2Vec model under its own name so that `model` always refers to
# the Keras network.
# w2v_model = gensim.models.Word2Vec.load("w2v/w2v-cbow.bin")
w2v_model = gensim.models.Word2Vec.load("w2v/w2v-skipgram.bin")
# X was encoded with `tokenizer`, so the embedding rows must be laid out by the
# tokenizer's indices. Laying them out in Word2Vec vocabulary order pairs each
# input index with the vector of an unrelated word.
embedding_matrix = create_embedding_matrix(w2v_model, tokenizer.word_index, tokenizer.num_words)

model = Sequential()
model.add(Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1], weights=[embedding_matrix]))
# The layer type matches the 'model-lstm-w2v.h5' file name used by the save and
# predict cells, and its size comes from this cell's own `lstm_units` rather
# than from `units` defined in an earlier cell.
model.add(LSTM(lstm_units, dropout=0.2))
# model.add(SimpleRNN(lstm_units, dropout=0.2))
model.add(Dense(3,activation='softmax'))
# One-hot targets over three exclusive classes with a softmax output call for
# categorical cross-entropy, not binary cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


batch_size = 1000
# Validate on the validation split while training; the test split is used only
# for the final evaluation (the two sets were swapped before).
history = model.fit(X_train, y_train, epochs=5, batch_size=batch_size, validation_data=(X_val, y_val), verbose=1)

score, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print("score: ", score)
print("accuracy: ", accuracy)

# **Visualisasi (Word2vec)**

In [0]:
# Keras versions name the accuracy entries either 'acc'/'val_acc' or
# 'accuracy'/'val_accuracy'; accept both.
train_acc = history.history.get('acc', history.history.get('accuracy'))
val_acc = history.history.get('val_acc', history.history.get('val_accuracy'))
df = pd.DataFrame({'epochs':history.epoch, 'accuracy': train_acc, 'validation_accuracy': val_acc})

# Both curves share one set of axes. `fit_reg` is an option of seaborn's
# regression plots, not of pointplot, so it is no longer passed.
fig, ax = plt.subplots()
g = sns.pointplot(x="epochs", y="accuracy", data=df, color='green', ax=ax)
g = sns.pointplot(x="epochs", y="validation_accuracy", data=df, color='red', ax=ax)
ax.set(title="Accuracy per epoch (green: training, red: validation)", xlabel="epochs", ylabel="accuracy");

[lstm](https://image.ibb.co/eQv4He/lstm.png)
[lstm-w2v](https://image.ibb.co/fGrdce/lstm_w2v.png)

# **Save model dan predict data (Word2vec)**

In [0]:
# Persist the Word2Vec-initialised model. The active file name is the one the
# prediction cell below loads.
# NOTE(review): the active name says "lstm" — confirm it matches the recurrent
# layer that is enabled in the model cell before saving.
# model.save('model/model-rnn-w2v.h5')
model.save('model/model-lstm-w2v.h5')
print("Model has created!")

Model has created!


In [0]:
# NOTE: load_model is already imported in the notebook's import cell; this
# repeated import is redundant but harmless.
from keras.models import load_model

# Sample inputs, each tagged with its expected sentiment; exactly one
# `input_text` assignment should be active at a time.

#neg
input_text = """Meski begitu, yang menjadi catatan Nirwono yakni reformasi birokrasi 
                itu belum diiringi dengan penyerapan anggaran maksimal, 
                hanya berkisar antara 45-65 persen selama lima tahun terakhir. 
                Penyerapan anggaran pun lebih banyak dihabiskan untuk operasional 
                kantor dan modal badan usaha milik daerah (BUMD)."""

#pos
# input_text = """Di sektor transportasi, lanjut Nirwono, Jokowi berhasil melakukan 
#                 groundbreaking pembangunan mass rapid transit (MRT). Jokowi bisa 
#                 mengeksekusi pola makro transportasi terpadu yang sudah disiapkan 
#                 sejak zaman mantan Gubernur Sutiyoso."""

#neg
# input_text = """“Ketiga, Riza menyebut permasalahan hukum dan demokrasi yang perlu 
#                  diperbaiki. Ia lalu membandingkan dengan era pemerintahan Susilo 
#                  Bambang Yudyhono yang dinilainya lebih baik. “Sampai hari ini 
#                  banyak masalah hukum. Kedua masalah demokrasi.  Di zaman ini 
#                  reformasi dan dekmokrasi terasa tertinggal. Zaman Pak SBY kita 
#                  mengapresiasi kami mengapresiasi, hukum yang baik,” sebutnya. """

#neg
# input_text = """Pemerintahan Jokowi gagal memajukan Indonesia"""

# Apply the same cleaning, tokenization and padding length as the training data.
text = [clean_chars(input_text)]
predicted = tokenizer.texts_to_sequences(text)
guess = pad_sequences(predicted, maxlen=X.shape[1])

# Reload the saved Word2Vec-based model from disk (this rebinds `model`).
# model = load_model('model/model-rnn-w2v.h5')
model = load_model('model/model-lstm-w2v.h5')
prediction = model.predict(guess)
# Index of the most probable class for the single input.
polarity = np.argmax(prediction[0])

# NOTE(review): this order must match the one-hot label columns built with
# pd.get_dummies (presumably sorted: negative, neutral, positive) — confirm.
sentiment = ['negative', 'neutral', 'positive']

print("Text: ",input_text)
print("Sentiment: ",sentiment[polarity])

# **Perbandingan hasil antara RNN (atas) dan RNN yang menggunakan word2vec (bawah)**
![rnn](https://image.ibb.co/cK9mVz/rnn.png)
![rnn-w2v](https://image.ibb.co/bTDDAz/rnn_w2v.png)

# **Perbandingan hasil antara LSTM (atas) dan LSTM yang menggunakan word2vec (bawah)**

![lstm](https://image.ibb.co/eQv4He/lstm.png)
![lstm-w2v](https://image.ibb.co/fGrdce/lstm_w2v.png)