In [1]:
import pandas as pd
import os
import numpy
import random
import time
from datetime import timedelta

import trax
from trax import fastmath
from trax import layers as tl
from trax.supervised import training

import EngineFiles.TweetFormat as tf

from EngineFiles.DeepLearning import NeuralNetworkDataPrepro as NND
from EngineFiles.DeepLearning import NeuralNetworkObject as NNO
from EngineFiles.DeepLearning import NeuralNetworkModel as NNM

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


In [2]:
# Load the cleaned tweets and discard every row whose 'Tweet' value is missing.
# The filter is chained so re-running the cell always starts from the CSV.
df = pd.read_csv('Data/indonesia_Tweet/clean_tweets.csv').dropna(subset=['Tweet'])

# Slang resource from the project's TweetFormat module (presumably a
# slang -> formal word lookup — confirm in EngineFiles.TweetFormat).
alay_lang = tf.bahasa_slang()

In [3]:
# Read the idwiki_clean.txt corpus as UTF-8 and keep it as a list of lines
# (split on '\n' only, so other line separators stay inside a line).
with open('EngineFiles/Word2Vec/idwiki_clean.txt', 'r', encoding='utf-8') as corpus_file:
    idwiki = corpus_file.read().split('\n')

In [4]:
# SPLIT TRAIN TEST SET
# 0.2 matches the ~20% test share visible in the split summary printed by the
# following cell; 42 is presumably the random seed — confirm in
# NND.splitDataset.
(
    df_train, df_test,
    x_train, x_train_pos, x_train_neg,
    x_test, x_test_pos, x_test_neg,
    y_train, y_test,
    index_train, index_test,
) = NND.splitDataset(df, 0.2, 42)

In [5]:
# Sanity-check the split: tag every row of the full frame as 'train' or 'test'
# (rows found in neither index keep 'not_set') and count tweets per
# (label, type) pair.
#
# A dedicated name is used on purpose: the original cell reused `df_test`,
# which silently replaced the test frame returned by NND.splitDataset with a
# frame that also contains the training rows.
split_overview = df.copy()
split_overview['type'] = 'not_set'  # scalar broadcasts to every row
split_overview.loc[index_train, 'type'] = 'train'
split_overview.loc[index_test, 'type'] = 'test'
split_overview.groupby(['label', 'type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Tweet
label,type,Unnamed: 2_level_1
negative,test,1527
negative,train,6106
positive,test,1312
positive,train,5248


In [6]:
# BUILD VOCABULARY
# Built from the training tweets together with the idwiki corpus lines.
vocab = NND.createVocab(x_train, idwiki)
print('Total words in vocab :', len(vocab))

Total words in vocab : 1388642


In [7]:
# CONVERT TWEETS TO TENSORS (TEST FUNCTION)
# Smoke test: encode the first positive test tweet with the vocabulary and
# show the tweet next to its encoded form.
sample_tweet = x_test_pos[0]
sample_tensor = NND.tweet2tensor(sample_tweet, vocabulary=vocab)

print('Actual tweet :', sample_tweet)
print('Tensor of tweet :', sample_tensor)

Actual tweet : tanggal pas pilih kepala daerah pilih ah
Tensor of tweet : [1070, 267, 25, 26, 27, 25, 1028]


In [8]:
# CREATE BATCH GENERATOR & DATA GENERATOR (TEST FUNCTION)
# Fixed seed so the sampled batch is repeatable (presumably the generator
# draws from Python's `random` — confirm in NND.train_generator).
random.seed(30)

# Pull a single batch of 4 examples and unpack its three components.
tmp = NND.train_generator(x_train_pos, x_train_neg, vocab, batch_size=4)
tmp_inputs, tmp_targets, tmp_exm_weights = next(tmp)

print(f'inputs shape : {tmp_inputs.shape}')
print(f'targets shape : {tmp_targets.shape}')
print(f'example weights shape : {tmp_exm_weights.shape}')
print()

# Walk the batch example by example: encoded tweet, its label, its weight.
for row, target, weight in zip(tmp_inputs, tmp_targets, tmp_exm_weights):
    print(f'input tensor : {row}, target {target}, exm_weights {weight}')

inputs shape : (4, 21)
targets shape : (4,)
example weights shape : (4,)

input tensor : [ 3  4  5  6  7  8  9 10 11 12 13 14 14 15 16 12 17 18 18 17 19], target 1, exm_weights 1
input tensor : [20 21 22 23  4 20 24 25 26 27 28 29 30 31 32 33 34 35 36 37  0], target 1, exm_weights 1
input tensor : [38 39 40  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], target 0, exm_weights 1
input tensor : [65 66 67  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], target 0, exm_weights 1


In [9]:
# TEST RELU CLASS
# Feed a small matrix with negative, zero and positive entries through the
# project's ReLU layer and print input and output side by side.
relu_layer = NNO.ReLU()
x = numpy.array([[-2.0, -1.0, 0.0], [0.0, 1.0, 2.0]], dtype=float)

print('Testing data :', x)
print('Output of ReLU :', relu_layer(x))

Testing data : [[-2. -1.  0.]
 [ 0.  1.  2.]]
Output of ReLU : [[0. 0. 0.]
 [0. 1. 2.]]


In [10]:
# TEST DENSE CLASS
# Initialise a 10-unit Dense layer against a 1x3 input using a fixed PRNG
# key (seed 0), then print the weights and the forward pass.
z = fastmath.numpy.array([[2.0, 7.0, 25.0]])
random_key = fastmath.random.get_prng(seed=0)
dense_layer = NNO.Dense(n_units=10)

dense_layer.init(z, random_key)

print('Weights :', dense_layer.weights)
print('Forward output :', dense_layer(z))

Weights : [[-0.02837108  0.09368162 -0.10050076  0.14165013  0.10543301  0.09108126
  -0.04265672  0.0986188  -0.05575325  0.00153249]
 [-0.20785688  0.0554837   0.09142365  0.05744595  0.07227863  0.01210617
  -0.03237354  0.16234995  0.02450038 -0.13809784]
 [-0.06111237  0.01403724  0.08410042 -0.1094358  -0.10775021 -0.11396459
  -0.05933381 -0.01557652 -0.03832145 -0.11144515]]
Forward output : [[-3.0395496   0.9266802   2.5414743  -2.050473   -1.9769388  -2.582209
  -1.7952735   0.94427425 -0.8980402  -3.7497487 ]]


In [11]:
# TEST MODEL
# Build a throwaway classifier sized to the vocabulary, then show its Python
# type followed by its layer structure.
tmp = NNM.classifier(vocab_size=len(vocab))

print(type(tmp))
display(tmp)

<class 'trax.layers.combinators.Serial'>


Serial[
  Embedding_1388642_256
  Mean
  Dense_2
  LogSoftmax
]

In [12]:
# CREATE OUTPUT DIRECTORY FOR THE MODEL
# Only the path is resolved here ('~' expanded to the user's home directory);
# no directory is created by this cell.
output_dir = '~/Project/Purwadhika_Final_Project/EngineFiles/DeepLearning/model/'
output_dir_expand = os.path.expanduser(output_dir)

print(output_dir_expand)

/Users/blackdisk/Project/Purwadhika_Final_Project/EngineFiles/DeepLearning/model/


In [13]:
# TRAIN THE MODEL
# Seed Python's RNG before any generator is built (presumably the generators
# draw from `random` — confirm in NND).
batch_size = 16
random.seed(271)

# Training task: cross-entropy loss, Adam optimizer constructed with 0.01,
# and n_steps_per_checkpoint set to 10.
# The trailing True passed to the generator is an opaque flag here
# (presumably shuffle/loop — confirm in NND.train_generator).
train_task = training.TrainTask(
    labeled_data=NND.train_generator(x_train_pos, x_train_neg, vocab, batch_size, True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10,
)

# Evaluation task: reports cross-entropy loss and accuracy on batches drawn
# from the held-out positive/negative tweets.
eval_task = training.EvalTask(
    labeled_data=NND.val_generator(x_test_pos, x_test_neg, vocab, batch_size, True),
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

# Fresh classifier sized to the vocabulary; this is the model that gets trained.
model = NNM.classifier(vocab_size=len(vocab))

In [14]:
# Machine used for the timings recorded in this cell's output:
#   OS : macOS Catalina
#   Processor : i5 @ 2.30GHz
#   RAM : 8GB @ 2133MHz
#   Storage : SSD 256GB

# Wall-clock timestamps bracket the training call; the following cell turns
# their difference into an elapsed-time report.
# 1000 is presumably the number of training steps — confirm in NNM.train_model.
start_time = time.time()
training_loop = NNM.train_model(
    model,
    train_task,
    eval_task,
    1000,
    output_dir_expand,
)
finish_time = time.time()


Step      1: Ran 1 train steps in 83.89 secs
Step      1: train CrossEntropyLoss |  0.69781309
Step      1: eval  CrossEntropyLoss |  0.75031227
Step      1: eval          Accuracy |  0.43750000

Step     10: Ran 9 train steps in 153.14 secs
Step     10: train CrossEntropyLoss |  0.72035336
Step     10: eval  CrossEntropyLoss |  0.79513711
Step     10: eval          Accuracy |  0.43750000

Step     20: Ran 10 train steps in 159.24 secs
Step     20: train CrossEntropyLoss |  0.69483221
Step     20: eval  CrossEntropyLoss |  0.67076182
Step     20: eval          Accuracy |  0.56250000

Step     30: Ran 10 train steps in 146.35 secs
Step     30: train CrossEntropyLoss |  0.66534519
Step     30: eval  CrossEntropyLoss |  0.55639279
Step     30: eval          Accuracy |  0.81250000

Step     40: Ran 10 train steps in 144.52 secs
Step     40: train CrossEntropyLoss |  0.63103765
Step     40: eval  CrossEntropyLoss |  0.58525956
Step     40: eval          Accuracy |  0.62500000

Step     50:

In [15]:
# Report how long the training cell took, formatted as H:MM:SS.ffffff.
elapsed = timedelta(seconds=finish_time - start_time)
print(f'Train model process elapsed time: {elapsed}')

Train model process elapsed time: 3:57:59.788814


In [17]:
# EVALUATE THE MODEL
# Take one batch of 64 examples from the validation generator and split it
# into inputs, targets and example weights.
eval_val_gen = NND.val_generator(x_test_pos, x_test_neg, vocab, 64, False)
eval_inp, eval_targ, eval_ew = eval_batch = next(eval_val_gen)

# Score that single batch with the trained loop's evaluation model.
eval_pred = training_loop.eval_model(eval_inp)
eval_acc, eval_num_correct, eval_num_pred = NNM.compute_accuracy(
    eval_pred, eval_targ, eval_ew
)

print(f"Model's prediction accuracy on a single batch is : {100*eval_acc}%")
print(
    f'Weighted number of correct prediction {eval_num_correct}, '
    f'weighted number of total observations predicted {eval_num_pred}'
)

Model's prediction accuracy on a single batch is : 78.125%
Weighted number of correct prediction 50.0, weighted number of total observations predicted 64


In [19]:
# TESTING THE ACCURACY OF MODEL
# `mdl` is the evaluation model held by the training loop; later cells reuse it.
mdl = training_loop.eval_model

# Score the model over batches of 16 from the test generator.
test_batches = NND.test_generator(x_test_pos, x_test_neg, vocab, 16, False)
accuracy = NNM.test_model(test_batches, mdl)

print('The accuracy of the model on the validation set is {:.4f}'.format(accuracy))

The accuracy of the model on the validation set is 0.8537


In [6]:
# Per-class precision / recall / f1 for the held-out tweets, as returned by
# the project's confusionMatrix helper.
classfctnReport = NNM.confusionMatrix(
    x_test,
    y_test,
    mdl,
    alay_lang,
    vocab,
)
print(classfctnReport)

precision    recall  f1-score   support

           0       0.85      0.87      0.86      1527
           1       0.85      0.82      0.84      1312

    accuracy                           0.85      2839
   macro avg       0.85      0.85      0.85      2839
weighted avg       0.85      0.85      0.85      2839



In [7]:
# TEST PREDICT USING MANUAL INPUT 1
# Hand-written tweet to spot-check the trained model's prediction.
sentence = "emang dasar bangsat aja lo nya"
user_pred, user_sentiment = NNM.predictUserInput(sentence, mdl, alay_lang, vocab)

print('The sentiment of the tweet "{}" is {}'.format(sentence, user_sentiment))

The sentiment of the tweet "emang dasar bangsat aja lo nya" is negative


In [8]:
# TEST PREDICT USING MANUAL INPUT 2
# Second hand-written tweet to spot-check the trained model's prediction.
sentence = "kekuatan cinta tidak akan terkalahkan"
user_pred, user_sentiment = NNM.predictUserInput(sentence, mdl, alay_lang, vocab)

print('The sentiment of the tweet "{}" is {}'.format(sentence, user_sentiment))

The sentiment of the tweet "kekuatan cinta tidak akan terkalahkan" is positive
