In [371]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from collections import Counter
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [372]:
cd /content/drive/MyDrive/nlp_project

/content/drive/MyDrive/nlp_project


In [373]:
df = pd.read_csv('data.csv').dropna().drop(['Country','Restaurant Name','Review Title','Review Date'],axis = 1)

labels = ['Negative','Positive']

def change_labels(cell):
    return int(list(labels).index(cell))

df.Sentiment = df['Sentiment'].apply(change_labels)

def remove_punct(cell):
    translator = str.maketrans("","",string.punctuation)
    return cell.translate(translator)
df.Review = df.Review.map(remove_punct)

nltk.download('stopwords')
stop = set(stopwords.words('english'))
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)
df.Review = df.Review.map(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [374]:
def count(data_column):
    count = Counter()
    for text in data_column:
        for word in text.split():
            count[word] += 1
    return count

counter = count(df.Review)
nr_unique_words = len(counter)

In [375]:
train_size = int(df.shape[0]*0.8)
df = df.sample(frac=1).reset_index(drop=True)
train_df = df[:train_size]
test_df = df[train_size:]

train_sentences = train_df.Review.to_numpy()
train_labels = train_df.Sentiment.to_numpy()
test_sentences = test_df.Review.to_numpy()
test_labels = test_df.Sentiment.to_numpy()

In [376]:
tokenizer = Tokenizer(num_words = nr_unique_words)
tokenizer.fit_on_texts(train_sentences)

word_dict = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

max_length = 200

train_padded = pad_sequences(train_sequences, maxlen = max_length, padding = 'post',truncating = 'post')
test_padded = pad_sequences(test_sequences, maxlen = max_length, padding = 'post',truncating = 'post')
reverse_word_index = dict([(idx,word) for (word,idx) in word_dict.items()])
def decode(sequence):
    return " ".join([reverse_word_index.get(idx,'?') for idx in sequence])

In [377]:
model = keras.models.Sequential(
    [
        layers.Embedding(nr_unique_words, 32, input_length = max_length),
        layers.Bidirectional(layers.LSTM(64,dropout = 0.1)),
        layers.Dense(32,activation = 'relu'),
        layers.Dense(16,activation = 'relu'),
        layers.Dense(8,activation = 'relu'),
        layers.Dense(2,activation = 'softmax')
    ]
)

model.summary()

model.compile(
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits = False),
    optimizer = keras.optimizers.Adam(learning_rate = 0.001),
    metrics = ['accuracy']
)

model.fit(train_padded, train_labels, epochs = 10,validation_data = (test_padded, test_labels), verbose = True)
model.evaluate(test_padded, test_labels)

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 200, 32)           214304    
                                                                 
 bidirectional_14 (Bidirect  (None, 128)               49664     
 ional)                                                          
                                                                 
 dense_39 (Dense)            (None, 32)                4128      
                                                                 
 dense_40 (Dense)            (None, 16)                528       
                                                                 
 dense_41 (Dense)            (None, 8)                 136       
                                                                 
 dense_42 (Dense)            (None, 2)                 18        
                                                     

[0.17887690663337708, 0.9534883499145508]

In [389]:
def encode(text):
    arr = list()
    for i in text.split():
        arr.append(word_dict.get(i,0))
    t = list()
    t.append(arr)
    arr = pad_sequences(t, maxlen = max_length, padding = 'post',truncating = 'post')
    return np.array(arr)

pred = model.predict(encode('The food was good and the staff was friendly'))
print(pred)
print(labels[np.argmax(pred)])

[[1.9147257e-04 9.9980861e-01]]
Positive
