<a href="https://colab.research.google.com/github/DataStrategisthjk/NLP/blob/main/Using_LSTM_model_for_sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

# Load the NAVER movie review training set: tab-separated, header in the first row.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside reviews stay plain text.
train = pd.read_csv(
    "/content/ratings_train.txt",
    header=0,
    delimiter="\t",
    quoting=3,
)
train

#### I will use the NAVER movie review dataset for sentiment analysis.
#### Before the reviews can be fed to a deep learning model, they need some simple preprocessing.

In [None]:
!pip install konlpy # KoNLPy: Korean NLP toolkit; provides the Okt morphological analyzer imported below (it does not translate)

In [None]:
import re
from konlpy.tag import Okt

# Create the Okt analyzer once; later cells reuse this same instance.
okt = Okt()

text = "안녕하세요." # "Hello" in Korean

# Split the sentence into morphemes; stem=True requests stemmed forms.
okt.morphs(text, stem=True)

In [None]:
# Same sentence with stem=False, to compare against the stemmed output above.
okt.morphs(text, stem=False)

1. Extract the text as a list of strings.
2. Filter it with a regular expression (e.g., to remove special characters and emoticons).
3. Remove stopwords and build the final word list.

In [None]:
stop_word = ['은', '는','이', '가','이다'] # common Korean particles and the copula, dropped after stemming.

def preprocessing(content, okt):
    """Clean one review and return its stemmed words without stopwords.

    Steps: remove every character that is not Hangul or a space, split the
    result into stemmed morphemes with ``okt.morphs``, then drop any word
    listed in ``stop_word``.

    Args:
        content: Raw review text. Non-string values (for example ``None`` or
            the NaN that a missing CSV field becomes) are treated as empty.
        okt: Analyzer object exposing ``morphs(text, stem=True)``.

    Returns:
        list[str]: The remaining words, in their original order.
    """
    # re.sub raises TypeError on non-strings, so a single missing review would
    # otherwise abort the whole preprocessing loop.
    if not isinstance(content, str):
        return []

    content_re = re.sub("[^가-힣 ]", "", content)
    content_word = okt.morphs(content_re, stem=True)

    return [word for word in content_word if word not in stop_word]


In [None]:
# Sanity check on one sentence. The regex keeps only Hangul and spaces, so the
# Latin "HJK" and the periods are removed before the text is tokenized.
preprocessing("안녕하세요 HJK입니다. 감성분류를 하고 있습니다.", okt) # "Hello, this is HJK (my initials). I am doing sentiment classification."


In [None]:
# Data preprocessing

# Only the first 500 reviews are processed; running the morphological analysis
# over the whole training set is not feasible here.
# Each review becomes a list of words, so train_review is a list of lists.
train_review = [preprocessing(review, okt) for review in train['document'][:500]]


In [None]:
# Preview a few preprocessed reviews instead of printing the entire list.
train_review[:5]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer() # maps each word to an integer index

# Build the word -> index dictionary from the preprocessed reviews.
tokenizer.fit_on_texts(train_review)

# Replace every word of every review with its integer index.
train_sequence = tokenizer.texts_to_sequences(train_review)
train_sequence[:5] # preview only; printing every sequence floods the output

In [None]:
# The model needs a fixed input length, but each review has a different
# number of tokens, so every sequence is brought to the same length.

# maxlen=8: every sequence ends up exactly 8 tokens long.
# padding="post": shorter sequences are filled with 0 at the end.
# NOTE(review): sequences longer than 8 are cut down to 8; Keras documents the
# default as truncating="pre" (leading tokens dropped) — confirm this is intended.

train_input = pad_sequences(train_sequence, maxlen=8, padding="post")

train_input

In [None]:
# Target values: the 'label' column as a NumPy array.
# This covers every row of `train`; it is sliced down at split time to match
# the subset of reviews that was actually preprocessed.

train_label = np.array(train['label'])
train_label


In [None]:
# Build and train a baseline classifier: Embedding -> Flatten -> Dense.

# Function to split the data in an 8(training):2(evaluation) ratio
from sklearn.model_selection import train_test_split

# Labels are sliced to the number of preprocessed reviews (not a hard-coded 500)
# so features and labels stay aligned if the sample size is changed.
# random_state fixes the shuffle so the split is reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    train_input, train_label[:len(train_input)], test_size=0.2, random_state=42
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Dense, Embedding

model = Sequential() # Define model object
word_size = len(tokenizer.word_index)+1 # +1 because index 0 is the padding value
model.add(Embedding(word_size, 128, input_length = 8)) # vocabulary size, 128-dim vectors, 8 tokens per review
model.add(Flatten()) # (8, 128) embedding output -> one 1D vector per review
# binary_crossentropy expects a probability in [0, 1]. ReLU is unbounded and can
# output exactly 0, which gives an invalid loss, so the output unit uses sigmoid.
model.add(Dense(1,activation='sigmoid'))
# adam optimizer, binary cross-entropy loss, accuracy reported as the metric.
model.compile(optimizer="adam",loss="binary_crossentropy", metrics =['accuracy'])

model.fit(x_train,y_train, epochs=5, batch_size = 32)

In [None]:
# Score the trained model on the held-out validation split.
# The model was compiled with metrics=['accuracy'], so this reports loss and accuracy.
model.evaluate(x_val,y_val)


In [None]:
# Try the trained model on one hand-written review.
# Translation: "This movie is so much fun, I will watch it again."
text = "이 영화 너무 다시볼거야 너무 재밌다"

# Same cleaning as the training data: regex filter, stemming, stopword removal.
re_text = preprocessing(text, okt)

# The tokenizer works on a batch (a list of word lists), so the single review is
# wrapped as [[word, ...]]; two reviews would be [[...], [...]].
text_data = [re_text]

# Words -> integer indices, then pad to the 8-token input length.
text_seq = pad_sequences(tokenizer.texts_to_sequences(text_data), maxlen = 8, padding = "post")

# Outputs nearer 0 are read as negative, nearer 1 as positive.
# Only 500 reviews were used for training, so the accuracy is low.
model.predict(text_seq)


#### Using an LSTM model

In [None]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping

# LSTM classifier: Embedding -> LSTM -> Dense, trained with early stopping.
model = Sequential()
model.add(Embedding(word_size, 128, input_length = 8)) # vocabulary size, 128-dim vectors, 8 tokens per review
model.add(LSTM(units=128)) # units = number of output features of the LSTM layer
# binary_crossentropy expects a probability in [0, 1]. ReLU is unbounded and can
# output exactly 0, which gives an invalid loss, so the output unit uses sigmoid.
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer="adam",loss="binary_crossentropy", metrics =['accuracy'])
# Stop once val_loss has not improved for 5 epochs. restore_best_weights rolls the
# model back to the best epoch instead of keeping the weights from 5 epochs later.
early = EarlyStopping(monitor = "val_loss" , mode = "min", verbose = 1, patience = 5,
                      restore_best_weights = True)
# Up to 100 epochs, batch size 32; validation_split holds out 20% of the training
# data to compute the val_loss that early stopping monitors.
model.fit(x_train,y_train, epochs=100, batch_size = 32, callbacks = [early],
          validation_split = 0.2)

In [None]:
# Score the LSTM model on the held-out validation split (x_val / y_val).
# The model was compiled with metrics=['accuracy'], so this reports loss and accuracy.
model.evaluate(x_val,y_val)

In [None]:
# Run the same hand-written review through the LSTM model.
text = "이 영화 너무 다시볼거야 너무 재밌다"

# Preprocessing: regular expression filter, stemming, stopword removal.
re_text = preprocessing(text, okt)

# A batch is a list of word lists. One review goes in as [[word, ...]];
# two reviews would go in as [[...], [...]].
text_data = [re_text]

# Convert the words to integer indices and pad to the 8-token input length.
text_seq = pad_sequences(tokenizer.texts_to_sequences(text_data), maxlen = 8, padding = "post")

# The closer the output is to 0 the more negative, the closer to 1 the more positive.
# Only 500 reviews were used for training, so the accuracy is low.
model.predict(text_seq)
