In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split 


In [2]:
twits = pd.read_csv("twitter.csv")
twits

Unnamed: 0,ID,Topic,Sentiment,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [3]:
def clean_text(text): 
    text = str(text)
    text = text.lower()
    text = re.sub("\d", " ", text)
    text = re.sub("@\S+", " ", text)
    text = re.sub("https*\S+", " ", text)
    text = re.sub("#\S+", " ", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\s{2,}',' ', text)
    stop_words = stopwords.words("english")
    text = ' '.join([word for word in text.split(' ') if word not in stop_words])
    return text

In [4]:
twits['Text'] = [clean_text(twit) for twit in twits['Text']]
twits["Sentiment"] = twits["Sentiment"].replace({'Positive':0, 'Neutral':0, 'Irrelevant':1, 'Negative':1})

In [5]:
vocab_size = 1000
max_length = 20

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(twits['Text'])

sequences = tokenizer.texts_to_sequences(twits['Text'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

x = np.array(padded_sequences)
y = np.array(twits["Sentiment"])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [6]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=16, input_length=max_length),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_train, 
          y_train, 
          epochs=10,
          validation_data=(x_test, y_test),
          verbose=2
          )


Epoch 1/10
1634/1634 - 17s - loss: 0.5833 - accuracy: 0.6883 - val_loss: 0.5582 - val_accuracy: 0.7056 - 17s/epoch - 10ms/step
Epoch 2/10
1634/1634 - 14s - loss: 0.5482 - accuracy: 0.7188 - val_loss: 0.5426 - val_accuracy: 0.7169 - 14s/epoch - 8ms/step
Epoch 3/10
1634/1634 - 15s - loss: 0.5305 - accuracy: 0.7260 - val_loss: 0.5381 - val_accuracy: 0.7187 - 15s/epoch - 9ms/step
Epoch 4/10
1634/1634 - 16s - loss: 0.5123 - accuracy: 0.7367 - val_loss: 0.5266 - val_accuracy: 0.7204 - 16s/epoch - 10ms/step
Epoch 5/10
1634/1634 - 15s - loss: 0.4938 - accuracy: 0.7471 - val_loss: 0.5154 - val_accuracy: 0.7357 - 15s/epoch - 9ms/step
Epoch 6/10
1634/1634 - 15s - loss: 0.4737 - accuracy: 0.7602 - val_loss: 0.5039 - val_accuracy: 0.7434 - 15s/epoch - 9ms/step
Epoch 7/10
1634/1634 - 15s - loss: 0.4542 - accuracy: 0.7720 - val_loss: 0.5207 - val_accuracy: 0.7360 - 15s/epoch - 9ms/step
Epoch 8/10
1634/1634 - 15s - loss: 0.4356 - accuracy: 0.7839 - val_loss: 0.4957 - val_accuracy: 0.7529 - 15s/epoch -

<keras.callbacks.History at 0x2148579ae90>