In [4]:
from numpy import asarray
import numpy as np
import pickle
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense , Bidirectional
from keras.layers import Flatten , Dropout
from keras.layers import Embedding
from tensorflow.python.keras.layers import LSTM, CuDNNLSTM
import preprocessing as pre
from keras.callbacks import EarlyStopping
import LSTM_models as mod
import os
import wget
root = 'data/'
os.makedirs(root, exist_ok=True)

#  import data after removing duplicated tweets

In [5]:
# Download negative full
neg_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQ0eDZMdDI5WXBlVXYyZGc_ZT1ZZDJn/root/content'
neg_filename = root + 'train_neg_full_u.txt'
wget.download(neg_url, neg_filename)
neg_tweets = mod.txt_to_list(neg_filename)

# Download positive full
pos_url = 'https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcDQzcTc3QmNPbUdIWHQ3TXc_ZT01ejdG/root/content'
pos_filename = root + 'train_pos_full_u.txt'
wget.download(pos_url, pos_filename)
pos_tweets = mod.txt_to_list(pos_filename)

# Merge positive and negative tweets
all_tweets=np.concatenate([pos_tweets,neg_tweets])


# Import pre-trained Glove embeddings

In [8]:
#Download embeddings
emb_url="https://api.onedrive.com/v1.0/shares/u!aHR0cHM6Ly8xZHJ2Lm1zL3QvcyFBclREZ3U5ejdJT1ZqcU00WDhTRmwzcWpCcVhKdXc_ZT1nMjRX/root/content"
emb_filename= root +'embeddings.txt'
wget.download(emb_url, emb_filename)


'data/embeddings (1).txt'

#  Pre-process the data

In [9]:
#Preprocess the tweets with the standard_pipeline in preprocessing
processed_tweets=[pre.process_sentence(tweet.split(' '),pre.standard_pipeline) for tweet in all_tweets]
docs=[' '.join(tweet) for tweet in processed_tweets]

#  Prepare inputs for LSTM

In [11]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
# pad documents to a max length of 40 words
max_length = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[[    32      1    302 ...      0      0      0]
 [     1   7687     26 ...      0      0      0]
 [     1      2   1496 ...      0      0      0]
 ...
 [ 29812      1     12 ...      0      0      0]
 [ 29812      1      5 ...      0      0      0]
 [423219    180  73400 ...      0      0      0]]


# Create dictionnary with Glove embeddings

In [12]:
word_emb = dict()
with open(emb_filename) as f:
    for idx, line in enumerate(f):
        K=line.split()
        word_emb[K[0]]=np.float_(K[1:])


# Create embedding_matrix

In [13]:
embedding_matrix = zeros((vocab_size, 200))
for word, i in t.word_index.items():
	embedding_vector = word_emb.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector



# Create labels vector

In [None]:
labels=np.zeros(len(all_tweets))
labels[:len(pos_tweets)]=0
labels[len(pos_tweets):]=1

# Shuffle and split the data into training and validation

In [16]:
random_idxs = np.random.permutation(len(labels))
padded_docs = padded_docs[random_idxs]
labels = labels[random_idxs]

N_train = int(0.9*len(labels))

X_train, X_val = padded_docs[:N_train], padded_docs[N_train:]
y_train, y_val = labels[:N_train], labels[N_train:]

# Create the LSTM model

In [18]:
# define model
model = Sequential()
e = Embedding(vocab_size, 200, weights=[embedding_matrix],input_length=50, trainable=False)
model.add(e)
model.add(Bidirectional(CuDNNLSTM(50)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 200)           84644000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100)               100800    
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
flatten (Flatten)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 84,744,901
Trainable params: 100,901
Non-trainable params: 84,644,000
_________________________________________________________________
None


# Train the model and evaluate it

In [21]:
# fit the model
es = EarlyStopping(monitor='accuracy', mode='max', min_delta=0, patience=1)
history = model.fit(X_train, y_train, epochs=10, verbose=1, callbacks=[es])
# evaluate the model
loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 84.676433
