In [1]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

In [2]:
# Load the raw reviews. Labels that cannot be parsed as numbers are coerced to
# NaN, replaced by 0 and finally cast to int in a single chained expression.
data = pd.read_csv('raw_data.csv')
data['Sentimen'] = (
    pd.to_numeric(data['Sentimen'], errors='coerce')
    .fillna(0)
    .astype(int)
)

In [3]:
# Rename the columns positionally to the names used by the rest of the notebook.
data.columns = ['nama', 'ulasan_', 'tanggal', 'Sentimen']

data.head()

Unnamed: 0,nama,ulasan_,tanggal,Sentimen
0,Ransi Lintin,Aplikasinya bagus,2022-02-20 00:07:51,1
1,HAHAHAHA HAHAHA,Ok mantap,2022-02-20 00:48:59,1
2,Yundi Hartono,Mantap semoga semakin maju,2022-02-20 01:33:10,1
3,Nunu Nugraha,Bisa hemat biaya tf mantap ini sih rekomen ban...,2022-02-20 04:15:04,1
4,bahtiar hamzah,Mantap,2022-02-20 04:22:43,1


In [4]:
data.Sentimen.unique()

array([1, 0])

In [5]:
data.shape

(2091, 4)

In [6]:
# One-hot encode the binary sentiment label: Sentimen == 1 -> Pos, Sentimen == 0 -> Neg.
# Vectorised comparisons emit exactly one entry per row, so both lists always have
# the same length as `data`. The previous explicit loop skipped any label other
# than 0/1, which shortened the lists and shifted every later label once they
# were assigned back as columns. An unexpected label now yields Pos = Neg = 0.
pos = (data.Sentimen == 1).astype(int).tolist()
neg = (data.Sentimen == 0).astype(int).tolist()

In [7]:
# Attach the one-hot label columns. A pd.Series built from a list gets a 0..n-1
# index and column assignment aligns on index labels.
# NOTE(review): this relies on `data` still having its default integer index — confirm.
data['Pos']= pd.Series(pos)
data['Neg']= pd.Series(neg)

data.head(20)

Unnamed: 0,nama,ulasan_,tanggal,Sentimen,Pos,Neg
0,Ransi Lintin,Aplikasinya bagus,2022-02-20 00:07:51,1,1,0
1,HAHAHAHA HAHAHA,Ok mantap,2022-02-20 00:48:59,1,1,0
2,Yundi Hartono,Mantap semoga semakin maju,2022-02-20 01:33:10,1,1,0
3,Nunu Nugraha,Bisa hemat biaya tf mantap ini sih rekomen ban...,2022-02-20 04:15:04,1,1,0
4,bahtiar hamzah,Mantap,2022-02-20 04:22:43,1,1,0
5,muaddi aja,Selama ini bagus transaksi lancar,2022-02-20 05:04:37,1,1,0
6,JANUARDI .,Transaksi cepat tanpa biaya antar bank,2022-02-20 05:05:24,1,1,0
7,Mudi Pangestu,Aplikasi ya bagus sangat membantu Makasi flip ...,2022-02-20 05:06:48,1,1,0
8,TRISNANDA HILMI,Alhamdulillah. Banyak ngebantu dg aplikasi ini,2022-02-20 05:23:13,1,1,0
9,Padry Alfath,aplikasi terbaik yang sangat membantu pekerjaa...,2022-02-20 05:43:46,1,1,0


In [8]:
def remove_punct(ulasan_):
    """Return the review text with every ASCII punctuation character removed.

    Args:
        ulasan_: the raw review; any non-string value is converted with str().
            Missing values (None or float NaN) are treated as an empty review.

    Returns:
        str: the text without any character from string.punctuation.
    """
    # A missing review must not turn into the literal token "nan" / "None".
    if ulasan_ is None or (isinstance(ulasan_, float) and ulasan_ != ulasan_):
        return ''
    # re.escape is required: string.punctuation contains `\`, `]`, `^` and `-`,
    # which are special inside a character class. Unescaped, `\]` is parsed as
    # an escaped bracket, so the backslash itself was never stripped.
    punct_pattern = '[' + re.escape(string.punctuation) + ']'
    return re.sub(punct_pattern, '', str(ulasan_))

# Strip punctuation from every review; the function is passed directly to apply.
data['Text_Clean'] = data['ulasan_'].apply(remove_punct)

data.head()

Unnamed: 0,nama,ulasan_,tanggal,Sentimen,Pos,Neg,Text_Clean
0,Ransi Lintin,Aplikasinya bagus,2022-02-20 00:07:51,1,1,0,Aplikasinya bagus
1,HAHAHAHA HAHAHA,Ok mantap,2022-02-20 00:48:59,1,1,0,Ok mantap
2,Yundi Hartono,Mantap semoga semakin maju,2022-02-20 01:33:10,1,1,0,Mantap semoga semakin maju
3,Nunu Nugraha,Bisa hemat biaya tf mantap ini sih rekomen ban...,2022-02-20 04:15:04,1,1,0,Bisa hemat biaya tf mantap ini sih rekomen ban...
4,bahtiar hamzah,Mantap,2022-02-20 04:22:43,1,1,0,Mantap


In [9]:
import nltk
from nltk import word_tokenize, WordNetLemmatizer
#nltk.download()

# Split every cleaned review into a list of word tokens.
tokens = list(map(word_tokenize, data.Text_Clean))

In [10]:
def lower_token(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered
    
# Lower-case the tokens of every review.
lower_tokens = list(map(lower_token, tokens))

In [11]:
# from nltk.corpus import stopwords
# stoplist = stopwords.words('english')

# print(stoplist)

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# Stop-word collection supplied by Sastrawi; used for membership tests in remove_stop_words.
stoplist = StopWordRemoverFactory().get_stop_words()

#print(stoplist)

In [12]:
def remove_stop_words(tokens):
    """Return the tokens that are not present in the module-level `stoplist`."""
    kept = []
    for word in tokens:
        if word in stoplist:
            continue
        kept.append(word)
    return kept

In [13]:
# Drop stop words from every lower-cased review.
filtered_words = list(map(remove_stop_words, lower_tokens))

In [14]:
# Re-assemble each filtered token list into a single space-separated string.
result = list(map(' '.join, filtered_words))

In [15]:
# Add the cleaned text and its token list, then keep only the modelling columns.
data = data.assign(Text_Final=result, tokens=filtered_words)[
    ['Text_Final', 'tokens', 'Sentimen', 'Pos', 'Neg']
]

data.head()

Unnamed: 0,Text_Final,tokens,Sentimen,Pos,Neg
0,aplikasinya bagus,"[aplikasinya, bagus]",1,1,0
1,mantap,[mantap],1,1,0
2,mantap semoga semakin maju,"[mantap, semoga, semakin, maju]",1,1,0
3,hemat biaya tf mantap sih rekomen banget makas...,"[hemat, biaya, tf, mantap, sih, rekomen, bange...",1,1,0
4,mantap,[mantap],1,1,0


In [211]:
# Hold out 10% of the rows as the test set; the fixed random_state keeps the split reproducible.
data_train, data_test = train_test_split(data, test_size=0.1, random_state=42)

In [212]:
# Collect every training token and the token count of each training review.
all_training_words = []
training_sentence_lengths = []
for sentence_tokens in data_train["tokens"]:
    all_training_words.extend(sentence_tokens)
    training_sentence_lengths.append(len(sentence_tokens))

# Sorted list of the distinct training words.
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

15573 words total, with a vocabulary size of 2966
Max sentence length is 67


In [213]:
# Same statistics as for the training split, computed on the held-out reviews.
test_sentence_lengths = list(map(len, data_test["tokens"]))
all_test_words = [word for sentence in data_test["tokens"] for word in sentence]
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

1627 words total, with a vocabulary size of 663
Max sentence length is 49


In [214]:
from gensim.models import Word2Vec

# Load the pre-trained Word2Vec model from a path relative to the notebook and
# keep only its keyed vectors (`.wv`) for lookups.
# NOTE(review): gensim model files are pickle-based — only load files from a trusted source.
word2vec_path = 'idwiki_word2vec_200_new_lower.model'
word2vec_mod = Word2Vec.load(word2vec_path)
word2vec = word2vec_mod.wv

In [215]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=200):
    """Average the word vectors of one tokenised sentence.

    Args:
        tokens_list: sequence of tokens for a single sentence.
        vector: lookup object supporting `word in vector` and `vector[word]`.
        generate_missing: when True, a token absent from `vector` contributes a
            fresh `np.random.rand(k)` vector; otherwise it contributes zeros.
        k: dimensionality used for the zero/random fallback vectors.

    Returns:
        The element-wise mean of the per-token vectors, or `np.zeros(k)` when
        `tokens_list` is empty.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Pick the fallback generator once instead of branching per token.
    make_fallback = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_fallback(k))

    total = np.sum(word_vectors, axis=0)
    return np.divide(total, len(word_vectors))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Return one averaged word vector per row of `clean_comments['tokens']`.

    Args:
        vectors: word-vector lookup forwarded to get_average_word2vec.
        clean_comments: frame-like object exposing a 'tokens' column with `.apply`.
        generate_missing: forwarded unchanged to get_average_word2vec.

    Returns:
        list: the per-row averaged vectors, in row order.
    """
    def _embed(sentence_tokens):
        return get_average_word2vec(sentence_tokens, vectors,
                                    generate_missing=generate_missing)

    return list(clean_comments['tokens'].apply(_embed))

In [216]:
# Sentence-level averaged embeddings for the training split.
# With generate_missing=True, out-of-vocabulary tokens get np.random.rand vectors and
# no seed is set in the visible cells, so these values differ between runs.
# NOTE(review): `training_embeddings` is not referenced by any later cell visible here.

training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [217]:
# Length every token-id sequence is padded or cut to before it reaches the CNN.
# The longest training review printed above has 67 tokens, so some reviews are truncated.
MAX_SEQUENCE_LENGTH = 50
# Width of each embedding row; presumably must match the loaded word2vec vector size — confirm.
EMBEDDING_DIM = 200

In [218]:
# Fit the tokenizer on the training text only.
# Keras' Tokenizer only emits word indices strictly below `num_words`, and word
# indices start at 1 (0 is reserved for padding). The +1 is therefore needed to
# keep the whole training vocabulary; with num_words == vocabulary size, the
# least frequent word was silently dropped from every sequence.
train_texts = data_train["Text_Final"].tolist()
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB) + 1, lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

# word -> integer index mapping learned from the training text.
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 2966 unique tokens.


In [219]:
# Pad/truncate every training sequence to MAX_SEQUENCE_LENGTH using pad_sequences'
# default padding and truncating modes (no explicit `padding=` argument is given).
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [220]:
# Embedding table with one row per tokenizer index plus row 0, which stays all-zero.
vocab_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((vocab_rows, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word2vec:
        row_vector = word2vec[word]
    else:
        # Words unknown to the pre-trained model get a random vector.
        row_vector = np.random.rand(EMBEDDING_DIM)
    train_embedding_weights[index, :] = row_vector
print(train_embedding_weights.shape)

(2967, 200)


In [221]:
# Encode the held-out reviews with the tokenizer fitted on the training text,
# then pad them exactly like the training matrix.
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [222]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a multi-kernel-width 1D CNN text classifier.

    Args:
        embeddings: weight matrix used to initialise the embedding layer.
        max_sequence_length: length of every padded input sequence.
        num_words: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        labels_index: number of output units (used as a count, despite the name).

    Returns:
        The compiled Keras Model; its summary is printed as a side effect.
    """
    # Embedding layer initialised from `embeddings` and kept frozen during training.
    frozen_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = frozen_embedding(sequence_input)

    def conv_branch(kernel_width):
        # One convolution of the given width followed by max-over-time pooling.
        features = Conv1D(filters=200, kernel_size=kernel_width, activation='relu')(embedded_sequences)
        return GlobalMaxPooling1D()(features)

    pooled_branches = [conv_branch(width) for width in (2, 3, 4, 5, 6)]
    merged = concatenate(pooled_branches, axis=1)

    # Classification head.
    hidden = Dropout(0.1)(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    preds = Dense(labels_index, activation='sigmoid')(hidden)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

In [223]:
# Target columns; their order fixes the order of the model's output units (0 = Pos, 1 = Neg).
label_names = ['Pos', 'Neg']

In [224]:
# Training targets as a 2-D array with one column per entry of label_names.
y_train = data_train[label_names].values

In [225]:
# Short aliases for the padded training inputs and their targets, used by model.fit.
x_train = train_cnn_data
y_tr = y_train

In [226]:
# Build the CNN: the embedding table has one row per tokenizer index plus row 0,
# and the network gets one output unit per label column.
model = ConvNet(
    train_embedding_weights,
    MAX_SEQUENCE_LENGTH,
    len(train_word_index) + 1,
    EMBEDDING_DIM,
    len(label_names),
)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 50, 200)      593400      ['input_4[0][0]']                
                                                                                                  
 conv1d_15 (Conv1D)             (None, 49, 200)      80200       ['embedding_3[0][0]']            
                                                                                                  
 conv1d_16 (Conv1D)             (None, 48, 200)      120200      ['embedding_3[0][0]']            
                                                                                            

In [227]:
# Training hyper-parameters passed to model.fit.
num_epochs = 3
batch_size = 34

In [228]:
# Train the network; validation_split=0.1 reserves a tenth of the training data for validation.
# `hist` keeps the object returned by fit.
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [229]:
# Score the held-out reviews; each row of `predictions` holds one value per label column.
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [230]:
# Sentimen value for each output position: index 0 is the 'Pos' unit (Sentimen 1),
# index 1 is the 'Neg' unit (Sentimen 0). Must stay in the same order as label_names.
labels = [1, 0]

In [231]:
# Map each prediction row to the Sentimen value of its highest-scoring output unit.
prediction_labels = [labels[np.argmax(p)] for p in predictions]

In [232]:
# Test accuracy: share of held-out reviews whose predicted label equals Sentimen.
n_correct = sum(data_test.Sentimen == prediction_labels)
n_correct / len(prediction_labels)

0.9095238095238095

In [233]:
data_test.Sentimen.value_counts()

1    169
0     41
Name: Sentimen, dtype: int64

In [234]:
#print(data_test)

data_test.head()

Unnamed: 0,Text_Final,tokens,Sentimen,Pos,Neg
1939,kasih 5 bintang,"[kasih, 5, bintang]",1,1,0
29,uang ngilang ntah kmna ga recomend sama sekali...,"[uang, ngilang, ntah, kmna, ga, recomend, sama...",0,0,1
210,bagus cepat mudah dn hemat tentunya trs tingka...,"[bagus, cepat, mudah, dn, hemat, tentunya, trs...",1,1,0
952,gue kasih bintang 1 sengaja gue ikut rules yg ...,"[gue, kasih, bintang, 1, sengaja, gue, ikut, r...",0,0,1
583,cepat prosesnya terpercaya lanjutkan lebih baik,"[cepat, prosesnya, terpercaya, lanjutkan, lebi...",1,1,0


In [244]:

def get_encode(text1):
    """Turn raw texts into the padded integer matrix the trained model expects.

    Args:
        text1: list of strings to encode with the module-level `tokenizer`.

    Returns:
        2-D array of token ids with MAX_SEQUENCE_LENGTH columns.
    """
    sequences = tokenizer.texts_to_sequences(text1)
    # Pad exactly like the training and test matrices: same length constant and
    # pad_sequences' default padding side. The previous padding='post' gave the
    # model inputs laid out differently from what it was trained on.
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

def get_predict(text1):
    """Run the module-level `model` on already-encoded input and return its raw output."""
    return model.predict(text1)


# Ad-hoc check: encode one hand-written review, score it and decode the label.
data_kalimat = ['lumayan bagus fiturnya, tapi kalo bisa ditambahin lagi']
encode = get_encode(data_kalimat)
result = get_predict(encode)

# Output position -> Sentimen value (0 = Pos unit -> 1, 1 = Neg unit -> 0).
labels_test = [1, 0]

prediction_labels_test = [labels_test[np.argmax(p)] for p in result]

print(prediction_labels_test)

[1]
