In [38]:
import pandas
import nltk
import keras
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation
from numpy import array as np_array
from nltk.data import load
from nltk.tokenize import TweetTokenizer


# Feature switch: when True, only the tensed-verb tag counts are used as
# features; when False, counts over the whole TAG_SET are used.
TENSED = True

# Penn Treebank verb tags treated as "tensed" by this notebook.
TENSED_POS_TAGS = ['VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
# Order defines the position of each stance in the one-hot label vector.
STANCES = ['FAVOR', 'NONE', 'AGAINST']

# Tag inventory loaded from NLTK's bundled tagset help data; '#' is added on top
# (presumably because the tagger can emit it for hashtags -- TODO confirm).
tagdict = load('help/tagsets/upenn_tagset.pickle')
TAG_SET = list(tagdict.keys()) + ['#']


def listOfTopics(dataframe):
    """Return the distinct values of the 'Target' column, in first-seen order.

    Args:
        dataframe: any mapping-like object (typically a pandas DataFrame)
            whose ``'Target'`` entry can be iterated.

    Returns:
        A list of the unique targets, ordered by first appearance.
    """
    # Read from the argument rather than a notebook-level global, so the
    # function works on whichever frame the caller passes in.
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(dataframe['Target']))


def pos_tag(s):
    """Tokenize the string `s` with NLTK and return its (token, tag) pairs."""
    tokens = nltk.word_tokenize(s)
    return nltk.pos_tag(tokens)


def add_to_tag_set(s):
    """Register the POS tags found in one row's tweet in the global TAG_SET.

    Args:
        s: a row-like object whose ``'Tweet'`` entry is the text to tag.

    TAG_SET is a list in this notebook, so indexing it with a tag string
    (as the earlier dict-based version did) raises TypeError. Both shapes are
    handled here:
      * dict  -> per-tag occurrence counts are accumulated;
      * list  -> tags not yet present are appended.

    NOTE: growing TAG_SET changes the length of the feature vectors built by
    tag_counts, so call this before any features or models are created.
    """
    tags = nltk.pos_tag(nltk.word_tokenize(s['Tweet']))
    for tag in tags:
        pos = tag[1]
        if isinstance(TAG_SET, dict):
            TAG_SET[pos] = TAG_SET.get(pos, 0) + 1
        elif pos not in TAG_SET:
            TAG_SET.append(pos)


def get_tag_set(rows):
    """Feed every row in `rows` through add_to_tag_set; returns nothing."""
    register = add_to_tag_set
    for row in rows:
        register(row)


def tag_is_tensed(tag):
    """Tell whether a (token, pos) pair carries one of the TENSED_POS_TAGS."""
    pos_label = tag[1]
    return pos_label in TENSED_POS_TAGS


def get_target_rows_in_frame(d_frame, target):
    """Return the rows of `d_frame` as a list, optionally filtered by target.

    Args:
        d_frame: object exposing ``iterrows()`` (e.g. a pandas DataFrame).
        target: value to match against each row's ``'Target'`` entry, or the
            sentinel ``'ALL'`` to keep every row.

    Returns:
        A list with one row object per kept row, in frame order.
    """
    every_row = [row for _, row in d_frame.iterrows()]
    if target == 'ALL':
        return every_row
    return [row for row in every_row if row['Target'] == target]


def get_target_rows(filename, target):
    """Read a tab-separated, latin1-encoded file and return its rows for `target`.

    The row selection (including the 'ALL' sentinel) is delegated to
    get_target_rows_in_frame.
    """
    frame = pandas.read_csv(filename, sep='\t', encoding='latin1')
    selected = get_target_rows_in_frame(frame, target)
    return selected


def get_tagged_words(rows):
    """POS-tag the 'Tweet' text of each row; returns one tag list per row."""
    return [pos_tag(row['Tweet']) for row in rows]


def tag_counts(tags):
    """Count occurrences of every TAG_SET entry among the given (token, pos) pairs.

    Returns a list of counts aligned with TAG_SET, one entry per known tag.
    """
    seen = [pair[1] for pair in tags]
    return [seen.count(known) for known in TAG_SET]

def tensed_tag_counts(tags):
    """Count occurrences of each TENSED_POS_TAGS entry among (token, pos) pairs.

    Returns a list of counts aligned with TENSED_POS_TAGS.
    """
    seen = [pair[1] for pair in tags]
    return [seen.count(verb_tag) for verb_tag in TENSED_POS_TAGS]

def get_x_train(tags_for_tweets, tag_count_func):
    """Apply `tag_count_func` to each tweet's tag list and collect the results."""
    return [tag_count_func(tweet_tags) for tweet_tags in tags_for_tweets]


def onehot_for_stance(stance):
    """Encode `stance` as a 3-element one-hot list indexed by its STANCES position.

    Raises ValueError (from list.index) when `stance` is not in STANCES.
    """
    encoded = [0] * 3
    position = STANCES.index(stance)
    encoded[position] = 1
    return encoded


def get_y_train(rows):
    """Build the one-hot label list from each row's 'Stance' entry."""
    encode = onehot_for_stance
    return [encode(row['Stance']) for row in rows]


def get_xy_data(filename, target, tensed=False):
    """Load one data file and turn it into (features, labels) for a target.

    Args:
        filename: path of the tab-separated data file.
        target: topic to keep, or 'ALL' for every row.
        tensed: when true, features are counts over TENSED_POS_TAGS only;
            otherwise counts over the full TAG_SET.

    Returns:
        A tuple ``(x, y)`` of per-tweet tag-count vectors and one-hot stances.
    """
    rows = get_target_rows(filename, target)
    counter = tensed_tag_counts if tensed else tag_counts

    tagged = get_tagged_words(rows)
    features = get_x_train(tagged, counter)
    labels = get_y_train(rows)
    return (features, labels)



# Training the model

In [39]:
def _plot_training_history(label, history):
    """Show per-epoch accuracy and loss curves for a finished ``fit`` run."""
    recorded = history.history
    # Older Keras releases record the accuracy metric under 'acc', newer ones
    # under 'accuracy'; use whichever key is present.
    acc_key = 'acc' if 'acc' in recorded else 'accuracy'
    for key, name in ((acc_key, 'accuracy'), ('loss', 'loss')):
        plt.plot(recorded[key])
        plt.plot(recorded['val_' + key])
        plt.title(label + ' model ' + name)
        plt.ylabel(name)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()


def train_model(label, x_train, y_train, tensed=False, plot_history=False):
    """Fit a small dense network mapping tag-count vectors to stance one-hots.

    Args:
        label: name of the topic being modelled; used only in plot titles.
        x_train: sequence of feature vectors (length ``len(TENSED_POS_TAGS)``
            when `tensed` is true, else ``len(TAG_SET)``).
        y_train: sequence of 3-element one-hot stance vectors.
        tensed: must match the flag used to build `x_train`; it selects the
            input width of the first layer.
        plot_history: when true, plot training/validation accuracy and loss
            after fitting. Off by default, so existing callers are unaffected.

    Returns:
        The fitted ``Sequential`` model.
    """
    input_dimen = len(TENSED_POS_TAGS) if tensed else len(TAG_SET)

    model = Sequential()
    model.add(Dense(units=32, input_dim=input_dimen))
    model.add(Activation('relu'))
    model.add(Dense(units=64))
    model.add(Activation('relu'))
    # Three outputs: one per one-hot stance slot.
    model.add(Dense(units=3))
    model.add(Activation('softmax'))

    # The learning rate is passed positionally: the keyword was `lr` in older
    # Keras and is `learning_rate` in newer releases, which reject `lr`.
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(0.001),
                  metrics=['accuracy'])

    # The last 33% of the samples are held out by Keras as validation data.
    history = model.fit(np_array(x_train), np_array(y_train),
                        validation_split=0.33, epochs=100, batch_size=32,
                        verbose=0)
    if plot_history:
        _plot_training_history(label, history)
    return model

In [40]:
# Topics for which a separate stance model is trained.
labels = [
    'Hillary Clinton',
    'Climate Change is a Real Concern',
    'Feminist Movement',
    'Legalization of Abortion',
    'Atheism'
]

# Maps each topic label to its trained model; read later by get_predictions.
models = {}

# NOTE(review): the training file is read here as 'trainingdata.txt', while a
# later cell reads './Dans/trainingData.txt' -- confirm both paths are intended.
# The file is re-read and re-tagged once per label.
for label in labels:
    (x_train, y_train) = get_xy_data('trainingdata.txt', label, TENSED)
    models[label] = train_model(label, x_train, y_train, TENSED)
    

# Feature/label sets over all topics. x_test/y_test are not used anywhere in
# the visible cells; x_gold feeds get_predictions in a later cell.
(x_test, y_test) = get_xy_data('trialdata.txt', 'ALL', TENSED)
(x_gold, y_gold) = get_xy_data('./Dans/subtaskA-testdata-gold.txt', 'ALL', TENSED)

In [41]:
def get_predictions(rows, vec_data):
    """Predict a stance label for every row using its topic's model.

    Args:
        rows: row objects with a ``'Target'`` entry naming the topic.
        vec_data: feature vectors, positionally aligned with `rows`.

    Returns:
        A list of STANCES entries, one per (row, vector) pair. The model is
        looked up in the global `models` dict by the row's target.
    """
    row_targets = [row['Target'] for row in rows]
    stances = []
    for topic, features in zip(row_targets, vec_data):
        # Predict on a batch of one and take its single output vector.
        scores = models[topic].predict(np_array([features]))[0]
        # Position of the first maximal score selects the stance.
        winner = scores.tolist().index(max(scores))
        stances.append(STANCES[winner])
    return stances

# Re-read the gold file so each feature vector in x_gold can be paired with its
# row's target; `predictions` therefore follows the file's row order.
all_rows = get_target_rows('./Dans/subtaskA-testdata-gold.txt', 'ALL')
predictions = get_predictions(all_rows, x_gold)

In [42]:
traindf = pandas.read_csv('./Dans/trainingData.txt', sep='\t', encoding='latin1')
print(traindf.head())
testdf = pandas.read_csv('./Dans/subtaskA-testdata-gold.txt', sep='\t', encoding='latin1')

OUTPUT_COLUMNS = ['ID', 'Target', 'Tweet', 'Stance']

# `predictions` holds one stance per row of the gold file, in file order.
# Attach it to the frame before slicing by topic so every tweet keeps its own
# prediction; zipping the full list against each per-topic slice would restart
# at predictions[0] for every topic and mislabel the rows.
if len(predictions) != len(testdf):
    raise ValueError(
        'expected one prediction per test row: got %d predictions for %d rows'
        % (len(predictions), len(testdf)))
scored = testdf.assign(_predicted=list(predictions))

# Emit rows grouped by topic, in the order topics first appear in the training data.
topic_frames = []
for topic in listOfTopics(traindf):
    testExtract = scored.loc[scored['Target'] == topic]
    if testExtract.empty:
        continue
    topic_frames.append(pandas.DataFrame({
        'ID': testExtract['ID'].map(str),
        'Target': testExtract['Target'],
        # Drop the trailing 6 characters of each tweet, as before.
        'Tweet': testExtract['Tweet'].map(lambda x: x[:-6]),
        'Stance': testExtract['_predicted'],
    }, columns=OUTPUT_COLUMNS))

# Build the output frame once instead of appending row by row.
if topic_frames:
    outdf = pandas.concat(topic_frames, ignore_index=True)
else:
    outdf = pandas.DataFrame(columns=OUTPUT_COLUMNS)

outdf.to_csv('output.txt', sep='\t', index=False)

    ID   Target                                              Tweet   Stance
0  101  Atheism  dear lord thank u for all of ur blessings forg...  AGAINST
1  102  Atheism  Blessed are the peacemakers, for they shall be...  AGAINST
2  103  Atheism  I am not conformed to this world. I am transfo...  AGAINST
3  104  Atheism  Salah should be prayed with #focus and #unders...  AGAINST
4  105  Atheism  And stay in your houses and do not display you...  AGAINST
