In [2]:
import pandas as pd
import requests
from io import StringIO
import tensorflow
import keras
import sklearn
import pickle




In [3]:
# Load the tab-separated training set. header=None tells pandas the file has
# no header row, so the two column names are supplied explicitly.
df = pd.read_csv(
    'train_preprocess.tsv',
    sep='\t',
    names=['Text', 'Kategori'],
    header=None,
)
df.head()

Unnamed: 0,Text,Kategori
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [4]:
# Class balance: number of rows per sentiment label.
df['Kategori'].value_counts()

Kategori
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64

## Text Cleansing

In [5]:
import re 

def cleansing(sent):
    """Normalize one sentence for tokenization.

    The text is lower-cased, then every character that is not an ASCII
    letter or digit (punctuation, emoticons, whitespace, newlines, ...)
    is replaced by a single space. Runs of replaced characters therefore
    become runs of spaces; nothing is collapsed or stripped.

    Args:
        sent: The raw sentence (a str).

    Returns:
        The cleaned string, same length as the lower-cased input.
    """
    return re.sub(r'[^a-zA-Z0-9]', ' ', sent.lower())

In [8]:
# Add a `text_clean` column holding the cleansed version of every `Text` row.
df['text_clean'] = df['Text'].apply(cleansing)
df.head()

Unnamed: 0,Text,Kategori,text_clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung ini dimiliki oleh pengusaha pabrik tahu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus dan k212 mmbri hujjah partai...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis di jalan sumatera bandung t...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya diri ini saat unboxing pake...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,duh jadi mahasiswa jangan sombong dong kas...


In [11]:
# Boolean row masks, one per sentiment class.
is_negative = df['Kategori'] == 'negative'
is_neutral = df['Kategori'] == 'neutral'
is_positive = df['Kategori'] == 'positive'

# Cleaned texts of each class, as plain Python lists.
neg = df.loc[is_negative, 'text_clean'].tolist()
neu = df.loc[is_neutral, 'text_clean'].tolist()
pos = df.loc[is_positive, 'text_clean'].tolist()

# Matching label lists (each is the class name repeated once per row).
neg_sentiment = df.loc[is_negative, 'Kategori'].tolist()
neu_sentiment = df.loc[is_neutral, 'Kategori'].tolist()
pos_sentiment = df.loc[is_positive, 'Kategori'].tolist()

In [12]:
# Concatenate texts and labels in the same class order (positive, neutral,
# negative) so that total_data[i] and labels[i] stay aligned.
total_data = [*pos, *neu, *neg]
labels = [*pos_sentiment, *neu_sentiment, *neg_sentiment]

In [13]:
# Preview the first cleaned sentence (positive texts come first in total_data).
total_data[0]

'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung   tahu berkualitas   dipadu keahlian memasak   dipadu kretivitas   jadilah warung yang menyajikan menu utama berbahan tahu   ditambah menu umum lain seperti ayam   semuanya selera indonesia   harga cukup terjangkau   jangan lewatkan tahu bletoka nya   tidak kalah dengan yang asli dari tegal  '

In [14]:
#Tokenizing and Applying pad_sequences

import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer as `num_words`; the same value is
# reused later as the Embedding layer's input size.
max_features = 100_000

# Fit the word index on the full cleaned corpus.
tokenizer = Tokenizer(num_words=max_features, split=' ', lower=True)
tokenizer.fit_on_texts(total_data)

# Persist the fitted tokenizer so inference can reuse the same word index.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
    print('tokenizer.pickle has been created.')

# Convert every sentence to a sequence of integer ids, then pad all
# sequences to a common length (pad_sequences defaults are used).
X = pad_sequences(tokenizer.texts_to_sequences(total_data))

# Persist the padded feature matrix for the training step.
with open('x_pad_sequences.pickle', 'wb') as features_file:
    pickle.dump(X, features_file, protocol=pickle.HIGHEST_PROTOCOL)
    print('x_pad_sequences.pickle has been created.')

tokenizer.pickle has been created.
x_pad_sequences.pickle has been created.


In [15]:
# Sanity check: total number of labels (should equal the number of texts).
len(labels)

11000

In [17]:
# One-hot encode the string labels and keep only the underlying array
# (one row per sample, one column per class).
Y = pd.get_dummies(labels).values

print(Y)

[[False False  True]
 [False False  True]
 [False False  True]
 ...
 [ True False False]
 [ True False False]
 [ True False False]]


In [18]:
# Persist the one-hot label matrix alongside the padded features.
with open('y_labels.pickle', 'wb') as labels_file:
    pickle.dump(Y, labels_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("y_labels.pickle has been created")

y_labels.pickle has been created


In [19]:
# Shape of the one-hot label matrix: (samples, classes).
Y.shape

(11000, 3)

In [20]:
# First ten one-hot rows (these belong to the positive texts, which come first).
Y[0:10]

array([[False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True],
       [False, False,  True]])

In [21]:
# Wrap the one-hot matrix in a frame with placeholder column names a/b/c so
# the column-to-class mapping can be inspected in the following cells.
some_df = pd.DataFrame(Y, columns=list('abc'))
some_df.head()

Unnamed: 0,a,b,c
0,False,False,True
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True


In [22]:
# Attach the original string labels next to their one-hot encoding.
some_df['labels'] = labels
some_df.head()

Unnamed: 0,a,b,c,labels
0,False,False,True,positive
1,False,False,True,positive
2,False,False,True,positive
3,False,False,True,positive
4,False,False,True,positive


In [23]:
# First 'negative' row: shows which one-hot column encodes this class.
some_df.loc[some_df['labels'] == 'negative'].head(1)

Unnamed: 0,a,b,c,labels
7564,True,False,False,negative


In [24]:
# First 'neutral' row: shows which one-hot column encodes this class.
some_df.loc[some_df['labels'] == 'neutral'].head(1)

Unnamed: 0,a,b,c,labels
6416,False,True,False,neutral


In [25]:
# First 'positive' row: shows which one-hot column encodes this class.
some_df.loc[some_df['labels'] == 'positive'].head(1)

Unnamed: 0,a,b,c,labels
0,False,False,True,positive


In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Reload the padded features and one-hot labels written by the earlier cells.
# Context managers guarantee the files are closed even if unpickling fails.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open("x_pad_sequences.pickle", 'rb') as file:
    x = pickle.load(file)

with open("y_labels.pickle", 'rb') as file:
    y = pickle.load(file)

# Hold out 20% of the samples; random_state fixes the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [29]:
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping

In [30]:
# Model hyperparameters.
embed_dim = 100  # dimensionality of each token embedding
units = 64       # number of LSTM units

# Embedding -> LSTM -> softmax over the three one-hot label columns.
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(LSTM(units, dropout=0.2))
model.add(Dense(3, activation='softmax'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# Compile exactly once, with the loss that matches one-hot multi-class
# targets. The optimizer takes `learning_rate`; the legacy `lr` alias is
# deprecated and is ignored or rejected by newer Keras releases.
adam = optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# Stop early when the validation loss stops decreasing.
# NOTE(review): the test split doubles as validation data here, so the
# reported test metrics are not fully independent of training decisions.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history = model.fit(x_train, y_train, epochs=2, batch_size=12, validation_data=(x_test, y_test), verbose=1, callbacks=[es])



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 96, 100)           10000000  
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 3)                 195       
                                                                 
Total params: 10042435 (38.31 MB)
Trainable params: 10042435 (38.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




None
Epoch 1/2












Epoch 2/2


In [31]:
from sklearn import metrics

# Class probabilities for every held-out sample.
y_pred = predictions = model.predict(x_test)

# argmax over axis 1 turns one-hot targets and probability rows into class
# indices, which is what classification_report compares.
matrix_test = metrics.classification_report(
    y_test.argmax(axis=1),
    y_pred.argmax(axis=1),
)
print("Testing selesai")
print(matrix_test)

Testing selesai
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       685
           1       0.86      0.80      0.83       233
           2       0.91      0.93      0.92      1282

    accuracy                           0.88      2200
   macro avg       0.87      0.85      0.86      2200
weighted avg       0.88      0.88      0.88      2200



In [32]:
# Persist the trained model to disk; the inference cell reloads it from
# this same path.
model.save('model.h5')
print('Model has been created.')

  saving_api.save_model(


Model has been created.


In [33]:
import re
from keras.models import load_model

# Sample sentence to classify; edit this string to try other inputs.
input_text = """
Rasa syukur, cukup.
"""

def cleansing(sent):
    """Normalize one sentence exactly as the training text was normalized.

    This repeats the `cleansing` definition from the text-cleansing cell
    earlier in the notebook so the inference cell can run on its own.
    The text is lower-cased, then every character that is not an ASCII
    letter or digit is replaced by a single space.

    Args:
        sent: The raw sentence (a str).

    Returns:
        The cleaned string.
    """
    return re.sub(r'[^a-zA-Z0-9]', ' ', sent.lower())

# Class names indexed by one-hot column position (negative, neutral,
# positive), matching the column check done on some_df earlier.
sentiment = ['negative', 'neutral', 'positive']

# Reload the trained network from disk.
model = load_model('model.h5')

# Clean, tokenize and pad the input to the sequence length used in training.
text = [cleansing(input_text)]
predicted = tokenizer.texts_to_sequences(text)
guess = pad_sequences(predicted, maxlen=X.shape[1])

# Index of the highest-probability class for the single input sentence.
prediction = model.predict(guess)
polarity = prediction[0].argmax()

print("Text: ", text[0])
print("Sentiment: ", sentiment[polarity])

Text:   rasa syukur  cukup  
Sentiment:  positive
