In [1]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
filepath_dict = {'train':   'resturant train.xlsx',
                 'test': 'resturant test.xlsx'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_excel(filepath, names=['text', 'category', 'polarity2', 'sentiment'])
    df['source'] = source
    df_list.append(df)

df = pd.concat(df_list)

In [3]:
df

Unnamed: 0,text,category,polarity2,sentiment,source
0,But the staff was so horrible to us.,service,negative,-1,train
1,"To be completely fair, the only redeeming fact...",food,positive,1,train
2,"To be completely fair, the only redeeming fact...",anecdotes/miscellaneous,negative,-1,train
3,"The food is uniformly exceptional, with a very...",food,positive,1,train
4,Where Gabriela personaly greets you and recomm...,service,positive,1,train
...,...,...,...,...,...
968,"I have never in my life sent back food before,...",food,negative,-1,test
969,"I have never in my life sent back food before,...",service,negative,-1,test
970,"Although the restaurant itself is nice, I pref...",ambience,positive,1,test
971,"Although the restaurant itself is nice, I pref...",food,negative,-1,test


In [4]:
df_train = df[df['source'] == 'train']

In [5]:
sentences = df_train['text'].values
y = df_train['sentiment'].values

In [6]:
sentences_test, sentences_train, y_train, y_test = train_test_split(sentences, y, test_size=0.1, random_state=1000)

In [28]:
len(sentences_train)

3155

In [21]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

In [22]:
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

In [23]:
vocab_size = len(tokenizer.word_index) + 1

In [24]:
embedding_dim = 100
maxlen = 100

In [25]:
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [26]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          416700    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 482,129
Trainable params: 482,129
Non-trainable params: 0
_________________________________________________________________


In [27]:
history = model.fit(X_train, y_train,
                   epochs=10,
                   verbose=False,
                   validation_data=(X_test, y_test),
                   batch_size=10)

In [29]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))

Training Accuracy: 0.6177
Testing Accuracy: 0.4615
