In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Load the labelled tweets, keeping only the CSV columns at positions 1 and 3.
# File rows 8835 and 535881 are skipped -- presumably malformed lines; TODO confirm.
data = pd.read_csv(
    "Sentiment Analysis Dataset.csv",
    usecols=[1, 3],
    skiprows=[8835, 535881],
)
data

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...
5,0,or i just worry too much?
6,1,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,0,Sunny Again Work Tomorrow :-| ...
8,1,handed in my uniform today . i miss you ...
9,1,hmmmm.... i wonder how she my number @-)


In [3]:
# Separate the tweet texts (inputs) from their sentiment labels (targets).
x = data['SentimentText'].tolist()
y = data['Sentiment'].tolist()

# Hold out a validation subset; the fixed random_state keeps the split repeatable.
train_x, val_x, train_y, val_y = train_test_split(
    x,
    y,
    random_state=0,
)

In [4]:
# Create a Tokenizer capped at 3000 words and build its vocabulary from the
# training tweets only (the validation tweets are not seen here).
# NOTE(review): the cap appears to apply when sequences are turned into matrices;
# tokenizer.word_index itself still holds every word seen -- confirm against Keras docs.
tokenizer = Tokenizer(num_words = 3000)
tokenizer.fit_on_texts(train_x)

In [5]:
# Keep a handle on the word -> index mapping and write it to dictionary.json.
dictionary = tokenizer.word_index
with open('dictionary.json', 'w') as dictionary_file:
    dictionary_file.write(json.dumps(dictionary))

In [7]:
# Encode every training tweet as the list of vocabulary indices of its words.
# The tokenizer was fitted on train_x itself, so each word produced by
# text_to_word_sequence is expected to be present in `dictionary`.
train_WordIndices = [
    [dictionary[word] for word in text_to_word_sequence(text)]
    for text in train_x
]

# Tweets have different lengths, so this is a ragged list of lists. Ask for an
# object array explicitly: without dtype=object, NumPy >= 1.24 raises ValueError
# for ragged input (older releases only emitted a deprecation warning).
train_WordIndices_arr = np.asarray(train_WordIndices, dtype=object)

In [8]:
# Inspect the encoded training tweets: one list of word indices per tweet.
train_WordIndices_arr

array([list([2289, 6, 5374]),
       list([33685, 139, 24, 118, 11, 4, 174, 203, 1, 114, 58, 145, 596, 10, 4, 13248, 133]),
       list([113674, 69, 898, 3344, 306, 1, 33, 82, 16, 5497, 47, 38, 3300, 38, 6093]),
       ...,
       list([3804, 1419, 1419, 569812, 8, 20, 61, 2, 270, 31, 3, 77, 3786, 229, 2148, 244, 53, 59, 78, 9, 544, 270, 267, 33, 53, 20049, 123, 72, 58, 16, 50]),
       list([20, 361, 28, 51, 4, 683, 4142]),
       list([13377, 60, 142, 4335, 6, 477, 187])], dtype=object)

In [10]:
# NOTE: this cell rebinds train_x and train_y in place (raw texts / integer labels
# are replaced by matrices), so it must not be re-run without re-running the split.

# One-hot encode the sentiment labels into two columns.
train_y = keras.utils.to_categorical(train_y, num_classes=2)

# Build the binary bag-of-words matrix from the indexed tweets:
# one row per entry of train_WordIndices_arr, 3000 columns (the tokenizer's num_words).
train_x = tokenizer.sequences_to_matrix(train_WordIndices_arr, mode='binary')

In [11]:
# Feed-forward classifier over the 3000-wide bag-of-words vector:
# two hidden layers, each followed by 50% dropout, then a 2-unit softmax output.
model = Sequential([
    Dense(512, input_shape=(3000,), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='sigmoid'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [12]:
# Categorical cross-entropy matches the one-hot labels and the softmax output layer.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Train for 5 epochs, holding back 10% of the training matrix for per-epoch validation.
model.fit(
    train_x,
    train_y,
    batch_size=32,
    epochs=5,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)

Train on 1065563 samples, validate on 118396 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f049ae3b630>

In [14]:
# Validation
def sentence_to_words(text):
    """Convert a sentence into the vocabulary indices of its known words.

    The text is split with text_to_word_sequence; every word found in the
    module-level `dictionary` (the tokenizer's word index) is replaced by its
    index, and words missing from it are skipped instead of raising KeyError.

    Args:
        text: raw sentence / tweet string.

    Returns:
        List of word indices, in the order the known words occur in `text`.
    """
    return [
        dictionary[word]
        for word in text_to_word_sequence(text)
        if word in dictionary
    ]


# Encode every validation tweet as a list of word indices.
# NOTE: a later cell rebinds val_x to the feature matrix, so this cell must be
# run while val_x still holds the raw texts.
val_WordIndices = [sentence_to_words(text) for text in val_x]

# The per-tweet lists have different lengths; request an object array explicitly,
# because NumPy >= 1.24 raises ValueError for ragged input without dtype=object.
val_WordIndices_arr = np.asarray(val_WordIndices, dtype=object)
val_WordIndices_arr

array([list([294937, 504]),
       list([397, 545, 247, 13, 382, 132, 4, 851, 85, 8985, 9, 190025, 182, 21, 794, 18, 382, 96, 20, 4680, 742, 8985]),
       list([431, 808, 744, 51560, 9956, 6, 96, 1, 300, 16, 4, 3044]), ...,
       list([184, 39, 57, 5085, 5085, 8680, 10275, 5085, 13123, 65, 16111, 13123, 103, 65, 5014, 1273, 65, 1898, 160, 65, 1024, 16111, 13123, 479167]),
       list([46787, 413, 47, 5, 1717, 318, 4, 30, 105, 277, 194, 8, 381, 15, 58, 67, 719, 31, 413, 244, 7, 210, 59, 871, 85, 17, 8]),
       list([825, 25, 4, 1804, 12, 50])], dtype=object)

In [15]:
# Binarize the encoded validation tweets with the tokenizer fitted on the training data.
# NOTE: val_x is rebound here from the raw texts to the feature matrix.
val_x = tokenizer.sequences_to_matrix(val_WordIndices_arr, mode='binary')

In [16]:
# Run the trained model on the validation matrix; the model's last layer is a
# 2-unit softmax, so each row holds one score per class.
pred_y = model.predict(val_x)

In [39]:
# Inspect the raw predictions (one row per validation tweet, one column per class).
pred_y

array([[ 0.0518201 ,  0.94817984],
       [ 0.63616741,  0.36383253],
       [ 0.13913073,  0.86086929],
       ..., 
       [ 0.08002418,  0.91997582],
       [ 0.0986807 ,  0.90131932],
       [ 0.57361299,  0.42638701]], dtype=float32)

In [44]:
# One-hot encode the validation labels under a new name, leaving val_y untouched.
val_y1 = keras.utils.to_categorical(val_y, num_classes=2)
val_y1

array([[ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.]])

In [50]:
# evaluation metrics
from sklearn.metrics import accuracy_score

# Compare class ids directly: argmax picks the highest-scoring class of every row.
# Rounding the softmax output and comparing one-hot rows instead turns a 0.5/0.5
# tie into [0, 0], which matches no label and is always counted as an error.
accuracy_score(val_y, pred_y.argmax(axis=1))

0.80600172810038184

In [53]:
# Persist the trained model to 'my_model.h5' (relative path).
model.save('my_model.h5')

In [58]:
# Interactive demo: classify sentences typed by the user until an empty line is entered.
# tokenizer2 is never fitted; it is only used to binarize the index list into a
# 3000-wide vector, which must match the model's input width.
tokenizer2 = Tokenizer(num_words=3000)
labels = ['negative', 'positive']
while True:
    text_input = input('Evaluate this:')
    if not text_input:
        break
    words_input = sentence_to_words(text_input)
    input1 = tokenizer2.sequences_to_matrix([words_input], mode='binary')
    if not input1.any():
        # No word of the input maps to a feature column, so the model would be fed
        # an all-zero vector and report a "confidence" unrelated to the text.
        print("no known words in input; cannot evaluate sentiment")
        continue
    pred = model.predict(input1)
    # pred has a single row; pick its highest-scoring class once and reuse the index.
    scores = pred[0]
    best = int(np.argmax(scores))
    print("%s sentiment; %f%% confidence" % (labels[best], scores[best] * 100))

Evaluate this:if you dont give up, you still have a chance. Giving up is the greatest failure
positive sentiment; 74.754125% confidence
Evaluate this:Chase interesting, work on things that matter
positive sentiment; 95.394254% confidence
Evaluate this:Today is hard. tomorrow will be worse, but the day after tomorrow will be sunshine.
negative sentiment; 63.721722% confidence
Evaluate this:When I am myself, I am happy and have a good result.
positive sentiment; 97.670412% confidence
Evaluate this:You are bad
negative sentiment; 57.280171% confidence
Evaluate this:Your attitude is everything
positive sentiment; 67.119724% confidence
Evaluate this:The very important thing you should have is patience
positive sentiment; 96.641892% confidence
Evaluate this:I love neural network
positive sentiment; 93.712616% confidence
Evaluate this:
