In [42]:
import json
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import numpy as np
import random
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import tqdm
from keras.layers import Embedding, LSTM, Dropout, Dense,GRU,Input
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import LeakyReLU

from tensorflow.contrib.layers import fully_connected
import keras_metrics

In [43]:
with open("dataset_tweets.json",'r') as f:
    data = json.load(f)

In [44]:
SEQUENCE_LENGTH = 30
EMBEDDING_SIZE = 300  
TEST_SIZE = 0.25 

BATCH_SIZE = 64
EPOCHS = 10 

In [45]:
def preprocess_tweet(text):

    # Check characters to see if they are in punctuation
    nopunc = [char for char in text if char not in string.punctuation]
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    # convert text to lower-case
    nopunc = nopunc.lower()
    # remove URLs
    nopunc = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', nopunc)
    nopunc = re.sub(r'http\S+', '', nopunc)
    # remove usernames
    nopunc = re.sub('@[^\s]+', '', nopunc)
    # remove the # in #hashtag
    nopunc = re.sub(r'#([^\s]+)', r'\1', nopunc)
    # remove repeated characters
    nopunc = word_tokenize(nopunc)
    # remove stopwords from final word list
    return [word for word in nopunc if word not in stopwords.words('english')]

In [46]:
data[0]

{'tweet': 'No comparison #superbowl http://t.co/DV91J3zA', 'label': 0}

In [47]:
random.shuffle(data)

In [48]:
len(data)

12000

In [49]:
pre_processed = []
for i in range(0,len(data)):
    pre_processed.append(preprocess_tweet(data[i]['tweet']))

In [50]:
len(pre_processed)

12000

In [51]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pre_processed)
process = tokenizer.texts_to_sequences(pre_processed)

In [52]:
y = []
for l in range(0,len(data)):
    y.append(data[l]['label'])
y = to_categorical(y)

In [53]:
SEQUENCE_LENGTH = 30
# convert to numpy arrays
process = np.array(process)
y = np.array(y)
process = pad_sequences(process, maxlen=30)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(process, y, test_size=TEST_SIZE, random_state=7)

In [55]:
def get_embedding_vectors(tokenizer, dim=300):
    embedding_index = {}
    with open("numberbatch-en.txt", encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading Numberbatch"):
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = vectors

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index)+1, dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # words not found will be 0s
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [56]:
def get_model(tokenizer, lstm_units):
    """
    Constructs the model,
    Embedding vectors => LSTM => 2 output Fully-Connected neurons with softmax activation
    """
    # get the GloVe embedding vectors
    embedding_matrix = get_embedding_vectors(tokenizer)
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1,
              EMBEDDING_SIZE,
              weights=[embedding_matrix],
              trainable=False,
              input_length=SEQUENCE_LENGTH))

    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
#     model.add(Dropout(0.3))
    model.add(Dense(3, activation="softmax"))
    # compile as rmsprop optimizer
    # aswell as with recall metric
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics=["accuracy", keras_metrics.precision(), keras_metrics.recall()])
    model.summary()
    return model

In [57]:
model = get_model(tokenizer=tokenizer, lstm_units=128)

Reading Numberbatch: 516783it [00:43, 11962.17it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 300)           5849100   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 387       
Total params: 6,069,135
Trainable params: 220,035
Non-trainable params: 5,849,100
_________________________________________________________________


In [58]:
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)
print("y_train.shape:", y_train.shape)
print("y_test.shape:", y_test.shape)
# train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test),batch_size=BATCH_SIZE, epochs=EPOCHS)

X_train.shape: (9000, 30)
X_test.shape: (3000, 30)
y_train.shape: (9000, 3)
y_test.shape: (3000, 3)
Train on 9000 samples, validate on 3000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5401f1a9b0>

In [59]:
# get the loss and metrics
result = model.evaluate(X_test, y_test)
# extract those
loss = result[0]
accuracy = result[1]
precision = result[2]
recall = result[3]

print("[+] Accuracy: " , accuracy*100)
print("[+] Precision: " , precision*100)
print("[+] Recall: " , recall*100)

[+] Accuracy:  97.46666666666667
[+] Precision:  99.90205679726719
[+] Recall:  99.31840310620073
