In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import tensorflow as tf
from keras.models import load_model


from sklearn.preprocessing import LabelEncoder

In [2]:
# Standard script to load GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUS Available: ", len(physical_devices))
tf.config.experimental.set_memory_growth(physical_devices[0], True)

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Num GPUS Available:  1
Found GPU at: /device:GPU:0


In [3]:
data = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]
print(data)
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

print(data)

                                                    text sentiment
0      RT @NancyLeeGrahn: How did everyone feel about...   Neutral
1      RT @ScottWalker: Didn't catch the full #GOPdeb...  Positive
2      RT @TJMShow: No mention of Tamir Rice and the ...   Neutral
3      RT @RobGeorge: That Carly Fiorina is trending ...  Positive
4      RT @DanScavino: #GOPDebate w/ @realDonaldTrump...  Positive
...                                                  ...       ...
13866  RT @cappy_yarbrough: Love to see men who will ...  Negative
13867  RT @georgehenryw: Who thought Huckabee exceede...  Positive
13868  RT @Lrihendry: #TedCruz As President, I will a...  Positive
13869  RT @JRehling: #GOPDebate Donald Trump says tha...  Negative
13870  RT @Lrihendry: #TedCruz headed into the Presid...  Positive

[13871 rows x 2 columns]
                                                    text sentiment
0      rt nancyleegrahn how did everyone feel about t...   Neutral
1      rt scottwalker didnt catch th

In [4]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')
print(data)
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

X = pad_sequences(X)

print(X[0])

                                                    text sentiment
0        nancyleegrahn how did everyone feel about th...   Neutral
1        scottwalker didnt catch the full gopdebate l...  Positive
2        tjmshow no mention of tamir rice and the gop...   Neutral
3        robgeorge that carly fiorina is trending  ho...  Positive
4        danscavino gopdebate w realdonaldtrump deliv...  Positive
...                                                  ...       ...
13866    cappy_yarbrough love to see men who will nev...  Negative
13867    georgehenryw who thought huckabee exceeded t...  Positive
13868    lrihendry tedcruz as president i will always...  Positive
13869    jrehling gopdebate donald trump says that he...  Negative
13870    lrihendry tedcruz headed into the presidenti...  Positive

[13871 rows x 2 columns]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   52   78  341  456   22    2  420  365   95   29   51 1039    1]


In [5]:
embed_dim = 128
lstm_out = 196
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model
# print(model.summary())

In [6]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
# Find how Hot encoding 
print(y[1], " Is positive")
print(y[0], " Is neutral")
print(y[13866], " Is negative")

X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

[0. 0. 1.]  Is positive
[0. 1. 0.]  Is neutral
[1. 0. 0.]  Is negative


In [7]:
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=createmodel, verbose=0)
batch_size= [10, 20, 40]
epochs = [1, 2, 3]
param_grid= dict(batch_size=batch_size, epochs=epochs)

In [8]:
with tf.device('/gpu:0'):
  from sklearn.model_selection import GridSearchCV
  grid= GridSearchCV(estimator=model, param_grid=param_grid)
  grid_result= grid.fit(X_train, Y_train)
  # summarize results
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.680189 using {'batch_size': 40, 'epochs': 2}


In [9]:
  batch_size = 40
  epoch = 2
  with tf.device('/gpu:0'):
    model = createmodel()
    model.fit(X_train, Y_train, epochs = epoch, batch_size=batch_size, verbose = 2)
    score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size)
    print(score)
    print(acc)
    print(model.metrics_names)


Epoch 1/2
233/233 - 25s - loss: 0.8280 - accuracy: 0.6437
Epoch 2/2
233/233 - 23s - loss: 0.6847 - accuracy: 0.7096
115/115 - 1s - loss: 0.7408 - accuracy: 0.6861
0.7407616376876831
0.6861074566841125
['loss', 'accuracy']


In [112]:
model: model.save('model.h5')

In [132]:
# Import the saved model
saved = load_model('model.h5')
# Numpy for manipulatation
import numpy as np

# create the sample tweet and subject it to the same preprocessing
tweet = ["@realDonaldTrump: A lot of good things are happening. We are respected again throughout the world, and that's a great thing"]
tweet[0] = tweet[0].lower()
tweet[0] = re.sub('[^a-zA-z0-9\s]', '', tweet[0])

# Check if string is prepped
print(tweet, "\n")

# Tokenize string
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(tweet)
X = tokenizer.texts_to_sequences(tweet)

X = pad_sequences(X, maxlen=28)

# Show the output neurons
print(saved(X), "\n")

# Display the prediciton
result = np.argmax(saved(X))

if(result == 0):
  print("Sentence is Negative")
elif(result == 1):
  print("Sentence is Neutral")
elif(result == 2):
  print("Sentence is Positive")



['realdonaldtrump a lot of good things are happening we are respected again throughout the world and thats a great thing'] 

tf.Tensor([[0.88424104 0.07245608 0.04330288]], shape=(1, 3), dtype=float32) 

Sentence is Negative
