In [1]:
import os
import numpy as np
import keras
from keras.datasets import reuters, imdb
from keras.models import Sequential
from keras.layers import LSTM, SimpleRNN, GRU, Dense, Dropout, Activation, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pandas as pd

Using TensorFlow backend.


In [2]:
EMBEDDING_DIM = 50
# load in training/test set
data = pd.read_csv('tweets.160k.random.csv', encoding='utf-8')
data.head()

data['label'].value_counts()

vocab_size = 20000
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
word_index = tokenizer.word_index
tweets = sequence.pad_sequences(sequences, padding='post', maxlen=50)

In [3]:
labels = data['label']
labels = labels.replace(4,1) # replace label '4' with '1' to facilitate one-hot encoding
x_train, x_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2)

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

y_train = keras.utils.to_categorical(y_train) # 2 classes
y_test = keras.utils.to_categorical(y_test)

128000 train sequences
32000 test sequences


In [4]:
embeddings_index = {}
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [6]:
model = Sequential()
model.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model.add(SimpleRNN(128))
model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build()
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 128)               22912     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 6,933,720
Trainable params: 23,170
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [7]:
history = model.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_split=0.2)

score = model.evaluate(x_test, y_test, batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6956788473129273
Test accuracy: 0.49918749928474426


In [8]:
model1 = Sequential()
model1.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model1.add(LSTM(128))
model1.add(Dense(2))
model1.add(Activation('softmax'))
model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.build()
print(model1.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_3 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [9]:
history = model1.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_split=0.2)

score = model1.evaluate(x_test, y_test, batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5147442498207092
Test accuracy: 0.7477187514305115


In [11]:
drop_out_rate = 0.1
model2 = Sequential()
model2.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model2.add(Dropout(drop_out_rate))
model2.add(LSTM(128))
model2.add(Dense(2))
model2.add(Activation('softmax'))
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.build()
print(model2.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [12]:
history = model2.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_split=0.2)

score = model2.evaluate(x_test, y_test, batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5176777384281158
Test accuracy: 0.7455312609672546


In [13]:
drop_out_rate = 0.2
model3 = Sequential()
model3.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model3.add(Dropout(drop_out_rate))
model3.add(LSTM(128))
model3.add(Dense(2))
model3.add(Activation('softmax'))
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.build()
print(model3.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_5 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [14]:
history = model3.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_split=0.2)

score = model3.evaluate(x_test, y_test, batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5299306291341782
Test accuracy: 0.738031268119812


In [15]:
drop_out_rate = 0.5
model4 = Sequential()
model4.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model4.add(Dropout(drop_out_rate))
model4.add(LSTM(128))
model4.add(Dense(2))
model4.add(Activation('softmax'))
model4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model4.build()
print(model4.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_6 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [16]:
history = model4.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_split=0.2)

score = model4.evaluate(x_test, y_test, batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.5344355206489563
Test accuracy: 0.7308750152587891


In [17]:
drop_out_rate = 0.8
model5 = Sequential()
model5.add(Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False))
model5.add(LSTM(128))
model5.add(Dropout(drop_out_rate))
model5.add(Dense(2))
model5.add(Activation('softmax'))
model5.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model5.build()
print(model5.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 50)          6910550   
_________________________________________________________________
dropout_4 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_7 (Activation)    (None, 2)                 0         
Total params: 7,002,456
Trainable params: 91,906
Non-trainable params: 6,910,550
_________________________________________________________________
None


In [18]:
history = model5.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_split=0.2)

score = model5.evaluate(x_test, y_test, batch_size=128, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 102400 samples, validate on 25600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.6173382894992828
Test accuracy: 0.6788750290870667
