In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Training hyper-parameters; the first three are the default arguments of train_model().
BATCH_SIZE = 128  # samples per gradient update passed to model.fit
EPOCHS = 50  # upper bound on epochs; early stopping may end training sooner
VERBOSE = 1  # verbosity level forwarded to model.fit and to the callbacks
EMBEDDING_DIM = 100  # embedding size handed to init_model() by main()

# Dataset

In [3]:
# The three source files name their text/label columns differently; each frame is
# reduced to the two columns we need and renamed to the common ('tweet', 'hasil') schema.
# Rows with a missing text or label are dropped.
#data 1
df_1 = pd.read_csv('data_analisis_sentimen_1.csv', sep=';')[['tweet', 'hasil']].dropna()
#data 2
df_2 = (
    pd.read_csv('data_analisis_sentimen_2.csv', sep=';')[['Tweet', 'hasil']]
    .dropna()
    .rename(columns={'Tweet': 'tweet'})
)
#data 4
df_3 = (
    pd.read_csv('data_analisis_sentimen_4.csv', sep=';')[['content', 'Hasil']]
    .dropna()
    .rename(columns={'content': 'tweet', 'Hasil': 'hasil'})
)
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# per-file row labels repeat, which makes label-based lookups ambiguous.
df = pd.concat([df_1, df_2, df_3], ignore_index=True)

In [4]:
# Drop rows whose raw label is greater than 1, then shift the remaining labels by +1
# so that the classes become: 0 = negative, 1 = neutral, 2 = positive.
# .assign() builds a new DataFrame from the filtered rows, which avoids the
# SettingWithCopyWarning raised when a column is written into a boolean-mask slice.
# NOTE: this cell is not idempotent - re-run the loading cell before re-running it,
# otherwise the labels are filtered and shifted a second time.
df = df[df['hasil'] <= 1].assign(hasil=lambda frame: frame['hasil'] + 1)
df['hasil'].value_counts()

1.0    10792
2.0     8535
0.0     7610
Name: hasil, dtype: int64

In [5]:
# Labels are stored as floats (0.0 / 1.0 / 2.0); tf.one_hot is documented for integer
# indices, so cast explicitly instead of relying on an implicit conversion.
x, y = df['tweet'], df['hasil'].astype('int64')
# Fixed random_state keeps the 80/20 split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# One-hot encode the three classes to match the softmax / categorical_crossentropy head.
y_train = tf.one_hot(y_train, 3)
y_test = tf.one_hot(y_test, 3)

In [6]:
# Display the shapes of the four splits as a sanity check on the train/test partition.
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((21549,), (5388,), TensorShape([21549, 3]), TensorShape([5388, 3]))

# Tokenize

In [20]:
def tokenize(x_tr, x_val, load=False, tokenizer_file=None, num_words=50000, maxlen=100):
    """Convert raw texts into fixed-length integer sequences.

    Parameters
    ----------
    x_tr : iterable of str
        Training texts. They are also used to fit the tokenizer when ``load`` is false.
    x_val : iterable of str
        Validation/test texts, encoded with the same tokenizer as ``x_tr``.
    load : bool, default False
        If true, unpickle an existing tokenizer from ``tokenizer_file`` instead of
        fitting a new one.
    tokenizer_file : str or None, default None
        Path of the pickled tokenizer. Only read when ``load`` is true.
    num_words : int, default 50000
        Vocabulary cap passed to ``Tokenizer`` when a new tokenizer is fitted.
    maxlen : int, default 100
        Length every sequence is padded or truncated to.

    Returns
    -------
    tuple
        ``(x_tr_seq, x_val_seq, tokenizer)``: the two padded arrays and the tokenizer
        that produced them.

    Raises
    ------
    ValueError
        If ``load`` is true but ``tokenizer_file`` is not given.
    """
    if load:
        if not tokenizer_file:
            raise ValueError("tokenizer_file is required when load=True")
        # SECURITY: unpickling executes arbitrary code - only load tokenizer files
        # that were produced by this project.
        with open(tokenizer_file, 'rb') as handle:
            tokenizer = pickle.load(handle)
    else:
        # Fit on the training texts only, so the vocabulary never sees validation data.
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(list(x_tr))

    # Convert text into integer sequences.
    x_tr_seq = tokenizer.texts_to_sequences(x_tr)
    x_val_seq = tokenizer.texts_to_sequences(x_val)

    # Pad/truncate so every sequence has the same length.
    x_tr_seq = pad_sequences(x_tr_seq, maxlen=maxlen)
    x_val_seq = pad_sequences(x_val_seq, maxlen=maxlen)
    return x_tr_seq, x_val_seq, tokenizer

In [8]:
def save_tokenizer(tokenizer, tokenizer_file):
    """Pickle ``tokenizer`` to ``tokenizer_file``.

    Parameters
    ----------
    tokenizer : object
        The fitted tokenizer (any picklable object).
    tokenizer_file : str
        Destination path. Missing parent directories are created first.
    """
    parent_dir = os.path.dirname(tokenizer_file)
    if parent_dir:
        # open(..., 'wb') does not create directories, so make sure the target exists.
        os.makedirs(parent_dir, exist_ok=True)
    with open(tokenizer_file, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Standalone tokenization check: fit a fresh tokenizer on the training texts.
# tokenize() needs the load/tokenizer_file arguments, and the results are bound to
# new names so the raw x_test texts stay available for the main() call further down.
x_tr_seq, x_test_seq, tokenizer = tokenize(x_train, x_test, load=False, tokenizer_file=None)

# Model

In [9]:
def init_model(vocab_size, embedding_dim, architecture):
    """
    Build and compile a baseline classifier (for baseline model comparison only).

    The network is Embedding -> (Bi)LSTM(128) -> GlobalMaxPooling1D -> Dense(64, relu)
    -> Dense(3, softmax), compiled with Adam and categorical cross-entropy.

    Parameters
    ----------
    vocab_size : int
        Input dimension of the embedding layer.
    embedding_dim : int
        Output dimension of the embedding layer.
    architecture : str
        ``'LSTM'`` for a unidirectional recurrent layer or ``'BI-LSTM'`` for a
        bidirectional one.

    Returns
    -------
    tf.keras.Sequential
        The compiled model.

    Raises
    ------
    ValueError
        If ``architecture`` is not one of the supported names. Without a recurrent and
        pooling stage the dense head would receive the 3-D embedding output.
    """
    supported = ('LSTM', 'BI-LSTM')
    if architecture not in supported:
        raise ValueError(
            "Unsupported architecture {!r}; expected one of {}".format(architecture, supported)
        )

    model = tf.keras.Sequential()
    # Embedding layer
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))

    # Recurrent layer: return_sequences=True keeps every timestep for the pooling below.
    recurrent = tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2)
    if architecture == 'BI-LSTM':
        recurrent = tf.keras.layers.Bidirectional(recurrent)
    model.add(recurrent)
    # Global max pooling over the time axis
    model.add(tf.keras.layers.GlobalMaxPooling1D())

    # Dense classification head
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(3, activation='softmax'))

    # The metric is named "acc" so that train_model's checkpoint can monitor 'val_acc'.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["acc"])
    return model

In [13]:
def train_model(model, x_tr, y_tr, x_val, y_val, model_file, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=VERBOSE):
    """Fit ``model`` with early stopping and leave the best checkpoint in ``model_file``.

    Parameters
    ----------
    model : compiled Keras model
        Must report a metric named ``acc`` for the checkpoint to track ``val_acc``.
    x_tr, y_tr : array-like
        Training inputs and targets (converted with ``np.array``).
    x_val, y_val : array-like
        Validation inputs and targets used by both callbacks.
    model_file : str
        Path the model is saved to.
    batch_size, epochs, verbose : int
        Forwarded to ``model.fit``; ``verbose`` is also passed to the callbacks.

    Returns
    -------
    History
        The object returned by ``model.fit``.

    Notes
    -----
    The in-memory ``model`` keeps the weights of the last epoch that ran, whereas
    ``model_file`` holds the epoch with the highest ``val_acc``.
    """
    # Stop once val_loss has not improved for 5 consecutive epochs.
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=verbose, patience=5)
    # Write the model to disk every time val_acc reaches a new maximum.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(model_file, monitor='val_acc', mode='max', save_best_only=True, verbose=verbose)
    history = model.fit(
        np.array(x_tr),
        np.array(y_tr),
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(np.array(x_val), np.array(y_val)),
        verbose=verbose,
        callbacks=[early_stop, checkpoint],
    )
    # Saving unconditionally here would overwrite the best-epoch checkpoint with the
    # last-epoch weights. Only save when the checkpoint callback had nothing to track
    # (no epochs ran, or the model does not report 'val_acc').
    if not history.history.get('val_acc'):
        model.save(model_file)
    return history

In [11]:
def eval_model(x_val, y_val, model):
    """Print a per-class classification report for ``model`` on ``(x_val, y_val)``.

    ``y_val`` holds one row of class scores per sample (e.g. one-hot targets); both it
    and the model predictions are reduced to class indices with an arg-max over axis 1.
    """
    class_scores = model.predict(x_val)
    # Collapse per-class scores to the index of the highest-scoring class.
    predicted_labels, true_labels = (
        np.argmax(scores, axis=1) for scores in (class_scores, y_val)
    )
    print(classification_report(true_labels, predicted_labels))

# Pipeline

In [21]:
def main(x_tr, x_test, y_tr, y_test, **args):
    """Run the whole pipeline: tokenize, build the model, train it and report metrics.

    Parameters
    ----------
    x_tr, x_test : iterable of str
        Raw training and test texts.
    y_tr, y_test : array-like
        One-hot encoded targets for the two sets.
    **args
        Required keyword options:

        - ``load_tokenizer`` (bool): load a pickled tokenizer instead of fitting one.
        - ``tokenizer_file`` (str): path of the pickled tokenizer.
        - ``architecture`` (str): forwarded to ``init_model``.
        - ``model_file`` (str): path the trained model is saved to.

    Raises
    ------
    KeyError
        If one of the required keyword options is missing.

    Notes
    -----
    The previous docstring referred to a ``reduced``/autoencoder option; this function
    has no such option.
    """
    #tokenize
    print("------------Now tokenizing data------------")
    x_tr, x_test, tokenizer = tokenize(x_tr, x_test, args['load_tokenizer'], args['tokenizer_file'])
    if not args['load_tokenizer'] and args['tokenizer_file']:
        # Persist a freshly fitted tokenizer: the saved model can only be used later
        # together with the exact vocabulary it was trained with.
        save_tokenizer(tokenizer, args['tokenizer_file'])
    print("------------Tokenizing data done------------")
    #create model
    print("------------Now creating and training model------------")
    # +1 because index 0 is reserved for padding and never assigned to a word.
    vocab_size = len(tokenizer.word_index) + 1
    model = init_model(vocab_size, EMBEDDING_DIM, args['architecture'])
    #train model
    # NOTE(review): the test set doubles as the validation set for early stopping and
    # checkpoint selection, so the scores printed below are optimistic - consider
    # holding out a separate validation split.
    train_model(model, x_tr, y_tr, x_test, y_test, args['model_file'])
    print("------------Training model done------------")
    #evaluate model
    print("------------Model evaluation------------")
    eval_model(x_test, y_test, model)

In [22]:
MODEL_FILE = 'model/test_lstm.h5'
TOKENIZER_FILE = 'model/tokenizer.pickle'
# Make sure the output directory exists, and only load a pickled tokenizer when one is
# actually present (otherwise a new one is fitted) so this cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_FILE), exist_ok=True)
main(
    x_train, x_test, y_train, y_test,
    architecture='LSTM',
    model_file=MODEL_FILE,
    tokenizer_file=TOKENIZER_FILE,
    load_tokenizer=os.path.exists(TOKENIZER_FILE),
)

------------Now tokenizing data------------
------------Tokenizing data done------------
------------Now creating and training model------------
Epoch 1/50
Epoch 1: val_acc improved from -inf to 0.66722, saving model to model\test_lstm.h5
Epoch 2/50
Epoch 2: val_acc improved from 0.66722 to 0.68114, saving model to model\test_lstm.h5
Epoch 3/50
Epoch 3: val_acc did not improve from 0.68114
Epoch 4/50
Epoch 4: val_acc did not improve from 0.68114
Epoch 5/50
Epoch 5: val_acc did not improve from 0.68114
Epoch 6/50
Epoch 6: val_acc did not improve from 0.68114
Epoch 7/50
Epoch 7: val_acc did not improve from 0.68114
Epoch 7: early stopping
------------Training model done------------
------------Model evaluation------------
              precision    recall  f1-score   support

           0       0.64      0.60      0.62      1539
           1       0.61      0.66      0.64      2143
           2       0.67      0.64      0.65      1706

    accuracy                           0.64      538