In [30]:
from utils import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [24]:
X,y = load_tweets('../data/clean_train.txt', True)

In [69]:
y = [1 if l == 1 else 0 for l in y] 
y_cat = to_categorical(y, dtype='int32')

In [131]:
all_words = set()
for tweet in X:
    for word in tweet.split():
        all_words.add(word)
vocab_length = len(all_words)
print('Vocabulary size: %d' % (vocab_length))

# create tokenizer with an extra token for unkown words
tokenizer = Tokenizer(num_words=vocab_length, oov_token=1, filters='')
tokenizer.fit_on_texts(X)

# tokenize data
X_tokenized = tokenizer.texts_to_sequences(X)

# get longest tweet and pad others with 0s to obtain same length
max_length = max([len(x) for x in X])
X_train_padded = pad_sequences(X_tokenized, max_length, padding='post')
print('Longest tweet: %d' % (max_length))

Vocabulary size: 79443
Longest tweet: 286


In [132]:
word_index = tokenizer.word_index
word_index

{1: 1,
 '<user>': 2,
 'i': 3,
 'be': 4,
 'the': 5,
 'to': 6,
 '<number>': 7,
 'you': 8,
 'a': 9,
 '<url>': 10,
 'and': 11,
 'it': 12,
 'my': 13,
 'me': 14,
 'of': 15,
 'for': 16,
 'in': 17,
 'have': 18,
 '<elong>': 19,
 'im': 20,
 'so': 21,
 'this': 22,
 'get': 23,
 'on': 24,
 'that': 25,
 'with': 26,
 'go': 27,
 'do': 28,
 'but': 29,
 'just': 30,
 'love': 31,
 'your': 32,
 'rt': 33,
 'not': 34,
 'u': 35,
 'like': 36,
 'at': 37,
 'all': 38,
 'know': 39,
 'dont': 40,
 'lol': 41,
 'up': 42,
 'day': 43,
 'follow': 44,
 'good': 45,
 'one': 46,
 'no': 47,
 'thank': 48,
 'now': 49,
 'want': 50,
 'make': 51,
 'we': 52,
 'when': 53,
 'see': 54,
 'frame': 55,
 'can': 56,
 'if': 57,
 'x': 58,
 'out': 59,
 'what': 60,
 'think': 61,
 'will': 62,
 'cant': 63,
 'too': 64,
 'please': 65,
 'time': 66,
 'miss': 67,
 'from': 68,
 'he': 69,
 'come': 70,
 'back': 71,
 'say': 72,
 'about': 73,
 'today': 74,
 'really': 75,
 'how': 76,
 'there': 77,
 'need': 78,
 'haha': 79,
 'feel': 80,
 '<heart>': 81,
 'we

In [82]:
embeddings_index = {}
with open('../data/glove.twitter.27B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [133]:
embedding_matrix = np.zeros((len(word_index) + 1, 200))
not_found = []
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        not_found.append(word)

In [134]:
np.save('../out/embedding_matrix', embedding_matrix)

In [None]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [10]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [14]:
len(y_test)

18133

In [19]:
from utils import *
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers import Flatten, Dense

# load data
X, y = load_tweets('../data/clean_train.txt', True)

# split to test and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# get all unique words
all_words = set()
for tweet in X:
    for word in tweet.split():
        all_words.add(word)
vocab_length = len(all_words)
print('Vocabulary size: %d' % (vocab_length))

# create tokenizer with an extra token for unkown words
tokenizer = Tokenizer(num_words=vocab_length, oov_token=1)
tokenizer.fit_on_texts(X_train)

Using TensorFlow backend.


Vocabulary size: 92486


In [29]:
from keras.models import Sequential

# tokenize data
X_train_tokenized = tokenizer.texts_to_sequences(X_train)

# get longest tweet and pad others with 0s to obtain same length
max_length = max([len(x) for x in X_train])
X_train_padded = pad_sequences(X_train_tokenized, max_length, padding='post')
print('Longest tweet: %d' % (max_length))

# create model
model = Sequential()
embedding_layer = Embedding(vocab_length + 2, 200, input_length=max_length)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Longest tweet: 137





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 137, 200)          18497600  
_________________________________________________________________
flatten_1 (Flatten)          (None, 27400)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 27401     
Total params: 18,525,001
Trainable params: 18,525,001
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
# fit and evaluate the model
model.fit(X_train_padded, y_train, batch_size=32, epochs=3, verbose=1, validation_split=0.2)
loss, accuracy = model.evaluate(X_train_padded, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Train on 130550 samples, validate on 32638 samples
Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 