In [1]:
import pandas as pd 
import numpy as np 
from data_processor import Data_Processor
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [15]:
vn_stopwords = r'vietnamese-stopwords.txt'

train_processor = Data_Processor(r'data\train\sentiments.txt', r'data\train\sents.txt', vn_stopwords)
test_processor = Data_Processor(r'data\test\sentiments.txt', r'data\test\sents.txt', vn_stopwords)

train_df = train_processor.create_dataframe()
test_df = test_processor.create_dataframe()

In [16]:
X_train = train_df['Text'].values
y_train = train_df[['Negative', 'Neutral', 'Positive']].values
X_test = test_df['Text'].values
y_test = test_df[['Negative', 'Neutral', 'Positive']].values

In [17]:
# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = max(len(x) for x in X_train_seq + X_test_seq)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

LSTM with Bidirectional

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

In [24]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
lstm_units = 64
dropout_rate = 0.5

# Build model
LSTM_Model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    Bidirectional(LSTM(lstm_units, return_sequences=True, activation='tanh', recurrent_activation='sigmoid')),
    Dropout(dropout_rate),
    Bidirectional(LSTM(lstm_units, activation='tanh', recurrent_activation='sigmoid')),
    Dropout(dropout_rate),
    Dense(3, activation='softmax')
])

LSTM_Model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
LSTM_Model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 130, 128)          180736    
                                                                 
 bidirectional_9 (Bidirecti  (None, 130, 128)          98816     
 onal)                                                           
                                                                 
 dropout_13 (Dropout)        (None, 130, 128)          0         
                                                                 
 bidirectional_10 (Bidirect  (None, 128)               98816     
 ional)                                                          
                                                                 
 dropout_14 (Dropout)        (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 3)                

In [25]:
from tensorflow.keras.callbacks import EarlyStopping


early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
batch_size = 128
epochs = 50

history = LSTM_Model.fit(
    X_train_pad, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


In [25]:
loss, accuracy = LSTM_Model.evaluate(X_test_pad, y_test, verbose=1)
print(f'Test Accuracy: {accuracy:.4f}')

LSTM_Model.save('LSTM.h5')

Test Accuracy: 0.8752


In [28]:
import string
import unicodedata
from pyvi import ViTokenizer
import numpy as np
import os
def predict_sentiment(text):

    with open('vietnamese-stopwords.txt', 'r', encoding='utf-8') as sf:
        stopwords = [word.strip() for word in sf.readlines()]
    text = text.lower()

    # Remove diacritics (accents)
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8')

    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Tokenize using Vietnamese tokenizer (pyvi)
    text = ViTokenizer.tokenize(text)

    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords])

    # Preprocess input text
    text_seq = tokenizer.texts_to_sequences([text])
    text_pad = pad_sequences(text_seq, maxlen=max_len, padding='post')
    # Predict sentiment
    prediction = LSTM_Model.predict(text_pad)
    
    # Interpret the prediction
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    sentiment_score = {sentiment_labels[i]: prediction[0][i] for i in range(3)}
    
    return sentiment_score

# Example usage
user_input = input("Enter a Vietnamese text: ")
sentiment_score = predict_sentiment(user_input)
print("Sentiment scores:", sentiment_score)

Sentiment scores: {'Negative': 2.265906e-05, 'Neutral': 4.2134852e-05, 'Positive': 0.99993527}
