In [1]:
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading the data

Training data

In [0]:
f = open('/content/drive/My Drive/Projects/Premoji/Data/Train/tweets.txt.text', 'r')
tweets_train = f.readlines()[:50000] # using 10% of the actual data for fast training
f.close()

In [4]:
f = open('/content/drive/My Drive/Projects/Premoji/Data/Train/tweets.txt.labels', 'r')
labels_train = f.readlines()[:50000] # using 10% of the actual data for fast training
f.close

<function TextIOWrapper.close>

In [0]:
y_train = np_utils.to_categorical([int(l.strip()) for l in labels_train], 20)

Test data

In [0]:
f = open('/content/drive/My Drive/Projects/Premoji/Data/Test/us_test.text', 'r')
tweets_test = f.readlines()
f.close()

In [7]:
f = open('/content/drive/My Drive/Projects/Premoji/Data/Test/us_test.labels', 'r')
labels_test = f.readlines()
f.close

<function TextIOWrapper.close>

In [0]:
y_test = np_utils.to_categorical([int(l.strip()) for l in labels_test], 20)

In [9]:
len(y_test)

50000

# Preprocessing the text

Tokenizing the data

In [0]:
vocab_size = 50000
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n…', num_words=vocab_size, oov_token="UNK")
tokenizer.fit_on_texts(texts=tweets_train)

In [11]:
tokenizer.get_config()

{'char_level': False,
 'document_count': 50000,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n…',
 'index_docs': '{"132": 478, "4470": 8, "221": 313, "64": 950, "8": 6877, "1953": 26, "60": 1038, "291": 229, "17": 3686, "683": 97, "2390": 20, "9516": 3, "2": 12811, "65": 944, "3": 9768, "354": 194, "9": 5951, "1954": 25, "5965": 6, "224": 307, "38": 1412, "2003": 25, "4880": 8, "85": 729, "2071": 23, "4": 9942, "139": 454, "18721": 1, "6": 8110, "984": 64, "1678": 31, "150": 401, "1166": 53, "77": 799, "4881": 8, "30": 1622, "9517": 3, "18722": 1, "18723": 1, "3038": 13, "944": 66, "54": 1141, "18725": 1, "84": 686, "18724": 1, "396": 175, "1591": 35, "1475": 39, "1250": 47, "1167": 53, "18726": 1, "2211": 22, "355": 188, "269": 253, "4471": 9, "3869": 11, "45": 1323, "61": 1048, "2072": 21, "22": 2561, "1364": 43, "332": 206, "645": 105, "345": 196, "12": 5385, "13": 5393, "231": 293, "1148": 54, "18727": 1, "14": 4902, "652": 99, "18728": 1, "1913": 27, "286": 227, "15": 4785, "20

Encoding and adding a padding to the training data

In [0]:
tokenized_train = tokenizer.texts_to_sequences(tweets_train)
x_train = pad_sequences(tokenized_train, padding='post')

Retrieving unified vector length after padding

In [0]:
maxlen = len(x_train[0])

Encoding and adding a padding to the test
data

In [0]:
tokenized_test = tokenizer.texts_to_sequences(tweets_test)
x_test = pad_sequences(tokenized_test, padding='post', maxlen=maxlen)

# Training the model

Building the model

In [0]:
model = Sequential()

In [0]:
model.add(Embedding(vocab_size, 200))
model.add(LSTM(100))
model.add(Dense(20, activation='softmax'))

In [0]:
model.compile(loss='categorical_crossentropy',
             optimizer='rmsprop',
             metrics=['accuracy'])

Training the model

In [65]:
model.fit(x_train, y_train,
         validation_split=0.3, epochs=1, batch_size=16)

Train on 35000 samples, validate on 15000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f98e048ecf8>

# Evaluation

In [66]:
loss, accuracy = model.evaluate(x_test, y_test)



Results :

In [67]:
print('Loss :', loss)
print('Accuracy :', accuracy)

Loss : 2.2343658641815187
Accuracy : 0.32514


# Prediction

In [21]:
tweet = "RuPaul's Drag Race bingo fun. Drag Queens be SEXY! #rupaulsdragrace @user abwyman #la…"
label = 9

tokens = tokenizer.texts_to_sequences([tweet])
x = pad_sequences(tokens, padding='post', maxlen=maxlen)

y = np_utils.to_categorical([label], 20)

print(x[0])
print(y)

[18722  3038  1678  4881    77  3038  1166    30   944  9517     4 18723
   150     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [22]:
prediction = model.predict(x)

print(sum(prediction[0]))

print(prediction)

print(np.argmax(prediction[0]))

print(range(1,21)[np.argmax(prediction[0])])

0.999999976484105
[[0.06322975 0.12749825 0.12998687 0.06055667 0.07100341 0.05539239
  0.0555721  0.03783628 0.04071476 0.04605405 0.03772033 0.03650237
  0.00384136 0.03920964 0.03389506 0.03732823 0.02606802 0.03205599
  0.03246753 0.03306692]]
2
3


# Saving the model

In [0]:
model.save('/content/drive/My Drive/Projects/Premoji/lstm.h5')