In [64]:
import pandas
import collections

from nltk.tokenize import TweetTokenizer
from keras.utils import to_categorical

training_df = pandas.read_csv('trainingdata.txt', sep='\t', encoding='latin1')
STANCE = ['FAVOR', 'AGAINST', 'NONE']
all_words = []

for tweet in training_df['Tweet']:
    all_words.extend(TweetTokenizer().tokenize(tweet))
    

def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def words_to_numbers(dictionary, word_arr):
    for i, word in enumerate(word_arr):
        word_arr[i] = dictionary[word] if word in dictionary else 0
    return word_arr


def get_stance_to_num(stance):
    return STANCE.index(stance)


def create_training_data(num_top_words):
    data, count, dictionary, reversed_dictionary = build_dataset(all_words, num_top_words)
    X_data = []
    Y_data = []
    for _, row in training_df.iterrows():
        X_data.append(words_to_numbers(dictionary, TweetTokenizer().tokenize(row['Tweet'])))
        Y_data.append([get_stance_to_num(row['Stance'])])
    return X_data, to_categorical(Y_data, 3)

### Create the training set

Here we create the training set and pad it out to get everything prepped for the model

In [70]:
from keras.preprocessing import sequence

num_top_words = 5000

X_train, y_train = create_training_data(num_top_words)

# Magic number that I computed so I can pad out the input arrays
longest_row = 37
# Lets pad those suckers out
X_train = sequence.pad_sequences(X_train, maxlen=longest_row, padding='post')

print(X_train[0], y_train[0])

[1709 1189  616  131   14   37    7  337    0 2026   26 1531 1189  194   38
 1565   10  877   14   32  795  113 1649 2799 1261    1    0    0    0    0
    0    0    0    0    0    0    0] [ 0.  1.  0.]


In [72]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# create the model
model = Sequential()
model.add(Embedding(num_top_words, 32, input_length=longest_row))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(3, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, validation_split=0.33, epochs=4, batch_size=128, verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 37, 32)            160000    
_________________________________________________________________
flatten_6 (Flatten)          (None, 1184)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 250)               296250    
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 753       
Total params: 457,003
Trainable params: 457,003
Non-trainable params: 0
_________________________________________________________________
None


Train on 1885 samples, validate on 929 samples
Epoch 1/4


0s - loss: 0.6507 - acc: 0.6566 - val_loss: 0.6094 - val_acc: 0.6667
Epoch 2/4
0s - loss: 0.6164 - acc: 0.6683 - val_loss: 0.6000 - val_acc: 0.6685
Epoch 3/4
0s - loss: 0.5828 - acc: 0.6918 - val_loss: 0.5880 - val_acc: 0.6889
Epoch 4/4


0s - loss: 0.5176 - acc: 0.7521 - val_loss: 0.5896 - val_acc: 0.6921


<keras.callbacks.History at 0x124a0e7b8>