In [1]:
import pandas
import collections

from nltk.tokenize import TweetTokenizer
from keras.utils import to_categorical

training_df = pandas.read_csv('trainingdata.txt', sep='\t', encoding='latin1')
STANCE = ['FAVOR', 'AGAINST', 'NONE']
all_words = []

for tweet in training_df['Tweet']:
    all_words.extend(TweetTokenizer().tokenize(tweet))
    

def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def words_to_numbers(dictionary, word_arr):
    for i, word in enumerate(word_arr):
        word_arr[i] = dictionary[word] if word in dictionary else 0
    return word_arr


def get_stance_to_num(stance):
    return STANCE.index(stance)


def create_training_data(training_data, num_top_words):
    data, count, dictionary, reversed_dictionary = build_dataset(all_words, num_top_words)
    X_data = []
    Y_data = []
    for _, row in training_data.iterrows():
        X_data.append(words_to_numbers(dictionary, TweetTokenizer().tokenize(row['Tweet'])))
        Y_data.append([get_stance_to_num(row['Stance'])])
    return X_data, to_categorical(Y_data, 3)

Using TensorFlow backend.


### Create the training set

Here we create the training set and pad it out to get everything prepped for the model

In [20]:
from keras.preprocessing import sequence

num_top_words = 5000

X_train, y_train = create_training_data(training_df, num_top_words)

# Magic number that I computed so I can pad out the input arrays
longest_row = 37
# Lets pad those suckers out
X_train = sequence.pad_sequences(X_train, maxlen=longest_row, padding='post')

In [None]:
import numpy
import keras
import matplotlib.pyplot as plt
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# create the model
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(num_top_words, embed_dim, input_length=longest_row, dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

history = model.fit(X_train, y_train, epochs=25, validation_split=0.33, batch_size=16, verbose=1)

# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# # summarize history for loss
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.savefig("thing.png")



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 37, 128)           640000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_55 (Dense)             (None, 3)                 591       
Total params: 895,391
Trainable params: 895,391
Non-trainable params: 0
_________________________________________________________________
None
Train on 1885 samples, validate on 929 samples
Epoch 1/25


In [32]:
# Evaluate F-score
def listOfTopics(dataframe):
    listOfTopics = []
    for row in traindf['Target']:
        if row not in listOfTopics:
            listOfTopics.append(row)
    return listOfTopics


traindf = pandas.read_csv('./trainingData.txt', sep='\t', encoding='latin1')
print(traindf.head())
testdf = pandas.read_csv('./subtaskA-testdata-gold.txt', sep='\t', encoding='latin1')

X_gold, _ = create_training_data(testdf, num_top_words)
X_gold = sequence.pad_sequences(X_gold, maxlen=longest_row, padding='post')

predictions = model.predict(X_gold)
gold_predictions = [0] * len(predictions)

for idx, prediction in enumerate(predictions):
    gold_predictions[idx] = STANCE[prediction.tolist().index(max(prediction))]

outdf = pandas.DataFrame(columns=['ID','Target','Tweet','Stance'])

for topic in listOfTopics(traindf):
    testExtract = testdf.loc[testdf['Target'] == topic]
    testCorpus = list(map(lambda x: x[:-6], list(testExtract['Tweet'])))

    ID = list(map(lambda x: str(x), list(testExtract['ID'])))
    s = zip(ID, list(testExtract['Target']), testCorpus, list(gold_predictions))

    for x in s:
        outdf.loc[len(outdf)] = list(x)

outdf.set_index('ID')
outdf.to_csv('output.txt', sep='\t', index=False)

    ID   Target                                              Tweet   Stance
0  101  Atheism  dear lord thank u for all of ur blessings forg...  AGAINST
1  102  Atheism  Blessed are the peacemakers, for they shall be...  AGAINST
2  103  Atheism  I am not conformed to this world. I am transfo...  AGAINST
3  104  Atheism  Salah should be prayed with #focus and #unders...  AGAINST
4  105  Atheism  And stay in your houses and do not display you...  AGAINST
