In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

import string
import re
from unicodedata import normalize

In [2]:
# Make every tf.function run eagerly (op by op) instead of as a traced graph.
# NOTE(review): this is normally a debugging aid and presumably slows down model.fit below;
# confirm it is still required before keeping it.
tf.config.run_functions_eagerly(True)

In [3]:
# Load the dataset (a `text` column and a 0/1 `label` column) from a path relative to
# the notebook's working directory.
data = pd.read_csv("data/Sentiment.csv")

# Preview the first rows; as the last expression it is rendered as the cell output.
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
def clean(inp):
    """Normalise raw texts into lower-case, purely alphabetic, space-separated words.

    For every text: accents and other non-ASCII characters are stripped, the text
    is lower-cased, punctuation is replaced by spaces, the result is split on
    whitespace, non-printable characters are removed from each word, and only
    purely alphabetic words are kept.

    Punctuation is replaced *before* splitting. When it was replaced after
    splitting, a word touching punctuation ("great.", "movie,", "don't") became a
    token containing a space, failed ``isalpha()`` and was silently dropped.

    Parameters
    ----------
    inp : iterable of str
        The raw texts to clean.

    Returns
    -------
    numpy.ndarray
        One cleaned string per input text, in the same order.
    """
    # Matches any character that is not in string.printable.
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # Maps every punctuation character to a single space.
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    cleaned = []
    for line in inp:
        # NFD splits accented letters into base letter + combining mark; the ASCII
        # round-trip then drops the marks (and any other non-ASCII character).
        line = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')
        # Replace punctuation first so the split separates the surrounding words.
        words = line.lower().translate(table).split()
        words = [re_print.sub('', w) for w in words]
        # Drops numbers, mixed alphanumerics and empty leftovers.
        cleaned.append(' '.join(w for w in words if w.isalpha()))

    return np.array(cleaned)

In [5]:
# Labels are kept as a single-column 2-D array (one row per sample).
Y = np.array(data[['label']])

# Texts are cleaned in one step, straight from the raw `text` column.
X = clean(list(data['text']))

In [6]:
# Class balance check: number of samples carrying each label.
print('0s :' , np.count_nonzero(Y == 0))
print('1s :' , np.count_nonzero(Y == 1))

0s : 20019
1s : 19981


In [7]:
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# shuffle reproducible, so the fitted tokenizer and the reported metrics no longer
# change on every Restart & Run All.
X_train , X_test , Y_train , Y_test = train_test_split(
    X , Y , test_size = 0.2 , shuffle = True , random_state = 42
)

In [8]:
# Build the word index from the training texts only. num_words caps how many of
# the most frequent words are used when texts are later converted to sequences.
t = Tokenizer(num_words = 2000)
t.fit_on_texts(X_train)

In [9]:
import pickle

# Serialise the fitted tokenizer to the file 'token' in the working directory.
with open('token' , mode = "wb") as token_file:
    pickle.dump(t , token_file)

In [10]:
# Convert each text to a sequence of word indices, then pad/truncate to 128 tokens.
# NOTE: X_train / X_test are overwritten here, so this cell is not safe to re-run
# on its own; re-run from the train/test split cell instead.
train_sequences = t.texts_to_sequences(X_train)
test_sequences = t.texts_to_sequences(X_test)

X_train = pad_sequences(train_sequences , maxlen = 128)
X_test = pad_sequences(test_sequences , maxlen = 128)

In [11]:
# Sanity check: shape of the padded training matrix (rendered as the cell output).
X_train.shape

(32000, 128)

In [14]:
# Binary sentiment classifier: embedding -> LSTM -> dense head with a sigmoid output.
# The vocabulary size (2000) and sequence length (128) match the tokenizer and
# padding settings used in the earlier cells.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(2000 , 32 , input_length = 128),
    # Only the final LSTM state is passed on (return_sequences = False).
    tf.keras.layers.LSTM(32 , return_sequences = False),
    tf.keras.layers.Dense(32),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(1 , activation = 'sigmoid'),
])

model.compile(
    loss = 'binary_crossentropy' ,
    optimizer = 'adam' ,
    metrics = ['accuracy'] ,
)

In [15]:
# Train for 5 epochs; the held-out split is used as validation data after each epoch.
his = model.fit(
    x = X_train ,
    y = Y_train ,
    epochs = 5 ,
    validation_data = (X_test , Y_test) ,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Classify one example sentence with the trained model.
text = "that is great"

# Encode and pad the sentence the same way as the training data (128 tokens).
word = pad_sequences(t.texts_to_sequences([text]) , maxlen = 128)

# The sigmoid output is read as "positive" when above 0.5.
sentiment = 'Positive' if model.predict(word)[0][0] > 0.5 else 'Negative'
print('Input :' , text , '\nPrediction : ' + sentiment)

Input : that is great 
Prediction : Positive


In [21]:
# Save the trained layer weights to an HDF5 file in the working directory.
# NOTE(review): newer Keras releases appear to require a `.weights.h5` suffix for
# save_weights; confirm against the installed version before upgrading.
model.save_weights('sentiment_final.h5')