# <center>Sentiment Analysis</center>

### Import Libraries

In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import keras
from keras import Model
from tensorflow.keras.layers import Flatten,LSTM, Dense, Flatten, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from keras_preprocessing.text import Tokenizer
from keras.initializers import glorot_uniform
from sklearn import model_selection

### Load Data

In [0]:
# Read the raw training file into a list with one string per line.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (e.g. cp1252 on Windows).
with open('train.csv', 'r', encoding='utf-8') as file:
    text = file.readlines()

In [0]:
# Create an empty dataframe; its columns are assigned once the raw lines have been parsed
x_train = pd.DataFrame()

In [0]:
# Parse every raw line into a (label, review) pair and store both as columns.
# The first whitespace-separated token is the label: "__label__2" maps to 1,
# anything else maps to 0. The remaining tokens are re-joined with single
# spaces to form the review text.
word = []
label = []
for n in text:
    n = n.split()
    if not n:
        # Blank line: there is no label token, so n[0] would raise IndexError.
        continue
    label.append(1 if n[0] == "__label__2" else 0)
    word.append(" ".join(n[1:]))
x_train['consumer_review'] = word
x_train['polarity_label'] = label

In [0]:
# View the dataframe: as the last expression of a notebook cell this renders x_train as the cell output
x_train

### Data Preparation

In [0]:
# Keep only a small random subset of the data to avoid overloading your system:
# test_size=0.02 retains 2% of the rows in x_set / y_set (the other 98% go to the `_` placeholders).
# Reduce or increase test_size according to your convenience.
_, x_set,_, y_set = \
    model_selection.train_test_split(x_train['consumer_review'], 
                                     x_train['polarity_label'], test_size=0.02)

In [0]:
#data cleaning function
def data_prep(in_tex):
    """Normalise a raw review string before tokenisation.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space,
      2. the text is lower-cased,
      3. every run of stand-alone single letters (together with the
         whitespace around it) is collapsed into one space.

    Args:
        in_tex: the raw review text.

    Returns:
        The cleaned text, containing only lower-case ASCII letters and spaces.
        Whitespace that is not adjacent to a removed letter is left untouched.
    """
    # Remove punctuations and numbers
    out_tex = re.sub('[^a-zA-Z]', ' ', in_tex)
    # Convert upper case to lower case
    out_tex = out_tex.lower()
    # Remove single characters. The whole run of single letters is matched at
    # once: matching one letter at a time would consume the whitespace that
    # the next letter needs in front of it, leaving e.g. the "s" of "u s a"
    # behind. `^` and `$` also catch single letters at the string boundaries.
    out_tex = re.sub(r"(?:^|\s+)(?:[a-z](?:\s+|$))+", ' ', out_tex)
    return out_tex

In [0]:
# Run every sampled review through data_prep and collect the cleaned strings
text_set = [data_prep(review) for review in x_set]


In [0]:
# Rebuild the dataframe from the cleaned reviews and the labels sampled with them
x_train = pd.DataFrame({
    'consumer_review': text_set,
    'polarity_label': list(y_set),
})

In [0]:
# Randomly hold out 30% of the rows for testing; the remaining 70% are used for training
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_train['consumer_review'],
    x_train['polarity_label'],
    test_size=0.30,
)

In [0]:
# Turn each pandas Series into a NumPy array (via a plain Python list).
# The tuple of Series is evaluated before any of the names is rebound.
x_train, x_test, y_train, y_test = (
    np.array(series.values.tolist())
    for series in (x_train, x_test, y_train, y_test)
)

In [0]:
# Build the word -> integer index from the training reviews only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index
# One more than the number of indexed words; used as the Embedding input dimension
total_size = 1 + len(word_index)

In [0]:
# Vocabulary size (number of indexed words + 1); this is the Embedding layer's input dimension
print(total_size)

22259


In [0]:
# Encode each review as a list of integer word indices using the fitted tokenizer
x_train, x_test = (
    tokenizer.texts_to_sequences(reviews)
    for reviews in (x_train, x_test)
)

In [0]:
# Bring every sequence to exactly max_length entries; padding is appended after the tokens ('post')
max_length = 100
x_train, x_test = (
    pad_sequences(sequences, padding='post', maxlen=max_length)
    for sequences in (x_train, x_test)
)

### Structure Model

In [0]:
#Create Model
# Embedding (20-dim vectors) -> LSTM (32 units) -> single sigmoid unit for binary polarity.
model = Sequential()
# mask_zero=True: the inputs are post-padded with index 0, so without a mask
# the LSTM would process the trailing padding as if it were real timesteps.
model.add(Embedding(total_size, 20, input_length=max_length, mask_zero=True))
model.add(LSTM(32,dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [0]:
#compile
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

### Train Model

In [0]:
# Train for 5 epochs in batches of 128; the held-out split is evaluated as validation data
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(x_test, y_test),
)


### Save Model (Optional)

In [0]:
# Write the trained model to "model.h5" in the current working directory
model.save("model.h5")

### Load Model

In [0]:
# Reload the saved model through tf.keras, the same API it was built and
# saved with, rather than through the standalone `keras` package.
model = tf.keras.models.load_model("model.h5")