In [None]:
import regex as re
import pandas as pd
import numpy as np
import glob
import nltk
import random as rd
import tensorflow as tf

from nltk.tokenize import TweetTokenizer
from collections import Counter
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from textblob import TextBlob

from keras.layers import Dense, Embedding, Flatten, Input, concatenate
from keras.models import Model
from keras.optimizers import Adagrad

#Import another jupyter notebook
import import_ipynb
from Productive_CrossValidation_inclTest import *
from Productive_Data_GetTrainTest import samEval_hash

In [None]:
# For reproducibility of the results: seed every random number generator that
# is imported in this notebook with the same fixed value.
seed = 7
# NumPy's global generator.
np.random.seed(seed)
# NOTE(review): tf.set_random_seed is the TensorFlow 1.x spelling; TensorFlow 2.x
# exposes this as tf.random.set_seed -- confirm which version this notebook targets.
tf.set_random_seed(seed)
# Python's built-in `random` module (imported as rd).
rd.seed(seed)

In [None]:
# Directory prefix that is prepended to the data set and embedding file names below.
path = 'data/'
# NOTE(review): `output` is not referenced in the cells visible here -- confirm it is still needed.
output= 'output_trainTestData/'
# Passed to cross_val_three_inputs / train_evaluate_three_inputs further down.
outputpath= 'outputs/'

# Read SemEval-2018 data

##### Has to be adapted for new data sets

In [None]:
# Files handed to read_data_samEval below: the training split and the
# gold-labelled test split of the task.
TRAIN_SAMEVAL = path + 'SemEval2018-T3-train-taskA_emoji_ironyHashtags.txt'
GOLD_TEST_SAMEVAL  = path + 'SemEval2018-T3_gold_test_taskA_emoji.txt'

In [None]:
#Preprocessing
def text_to_wordlist(tweet, vocab):
    """Clean one raw tweet, tokenise it and count its tokens in ``vocab``.

    Args:
        tweet: The raw tweet text.
        vocab: A Counter-like object; it is updated in place with the tokens
            of this tweet.

    Returns:
        The list of tokens produced by ``TweetTokenizer`` for the cleaned text.
    """
    # Hashtags are kept on purpose; only mentions and URLs are rewritten.
    # The mention pattern removes "@name" together with any word characters
    # that directly precede the "@".
    without_mentions = re.sub(r'(\w+|^|)@\w+', '', tweet)

    # Every http:// or https:// link collapses into the single token "URL".
    with_url_token = re.sub(r'((http|https)://)(\w|[.]|/)+', 'URL', without_mentions)

    tokens = TweetTokenizer().tokenize(with_url_token)
    vocab.update(tokens)
    return tokens

def process_tweets(list_sentences, vocab):
    """Tokenise every tweet in ``list_sentences`` with ``text_to_wordlist``.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Counter-like object that ``text_to_wordlist`` updates in place.

    Returns:
        A list with one token list per input tweet, in input order.
    """
    return [text_to_wordlist(sentence, vocab) for sentence in list_sentences]

In [None]:
#Create train data, test data and word_index
def create_train_test_wordindex(vocab, tweets, corpustrain):
    """Encode tokenised tweets as padded id sequences and build the word index.

    Args:
        vocab: Counter of all tokens; defines the word ids.
        tweets: Token lists of all tweets, training tweets first.
        corpustrain: The training corpus; only its length is used, to find
            where the training tweets end inside ``tweets``.

    Returns:
        A tuple ``(train_data, test_data, word_index, max_len)`` where the two
        data items are the results of ``pad_sequences``, ``word_index`` maps
        token -> id and ``max_len`` is the length of the longest tweet.
    """
    longest = max((len(tokens) for tokens in tweets), default=0)
    print('Maximal Sequence Length: '+str(longest))

    # Ids follow the frequency ranking and start at 1; id 0 is only produced
    # by the .get() fallback for tokens missing from the index.
    ranked = vocab.most_common(len(vocab))
    word_index = {word: rank for rank, (word, _count) in enumerate(ranked, start=1)}

    def encode(token_lists):
        # Map tokens to ids, then pad in front / cut at the end to `longest`.
        ids = [[word_index.get(tok, 0) for tok in tokens] for tokens in token_lists]
        return pad_sequences(ids, maxlen=longest, padding="pre", truncating="post")

    n_train = len(corpustrain)
    train_data = encode(tweets[:n_train])
    test_data = encode(tweets[n_train:])

    return train_data, test_data, word_index, longest

In [None]:
def read_data_samEval(FILENAME):
    """Read a tab-separated data file into parallel label and tweet lists.

    Every data line is split on tabs; field 1 is parsed as the integer label
    and field 2 is taken as the tweet text (field 0 is not used). A line that
    starts with "tweet index" (case-insensitive) is treated as the header and
    skipped. Blank lines are skipped as well, so a trailing empty line no
    longer aborts the read.

    Args:
        FILENAME: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A two-element list ``[labels, corpus]``: the integer labels and the
        tweet texts, in file order.

    Raises:
        IndexError: If a non-blank data line has fewer than three fields.
        ValueError: If a label field is not an integer.
    """
    labels = []
    corpus = []
    # Decode explicitly as UTF-8; the original author notes that reading
    # without it raised an exception.
    with open(FILENAME, 'r', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue  # blank line: nothing to parse
            if line.lower().startswith("tweet index"):
                continue  # header row
            fields = line.rstrip().split("\t")
            labels.append(int(fields[1]))  # second field: label
            corpus.append(fields[2])       # third field: tweet text
    return [labels, corpus]

# Load both splits; read_data_samEval returns [labels, corpus].
resulttrain = read_data_samEval(TRAIN_SAMEVAL)
labelstrain_sameval, corpustrain_sameval = resulttrain

resulttest = read_data_samEval(GOLD_TEST_SAMEVAL)
labelstest_sameval, corpustest_sameval = resulttest

# Preprocess the data: tokenise train and test tweets in one pass so they
# share one vocabulary. Training tweets come first in `tweets_sameval`, which
# is why later cells slice it at len(corpustrain_sameval).
vocab_sameval = Counter()
tweets_sameval = process_tweets(corpustrain_sameval + corpustest_sameval, vocab_sameval)

train_data, test_data, word_index, max_len = create_train_test_wordindex(
    vocab_sameval, tweets_sameval, corpustrain_sameval)

# Create POS features

In [None]:
def create_pos_sequence(data, MAXLEN, pos_dict=None):
    """Turn tokenised tweets into padded sequences of integer POS-tag ids.

    Every tweet is tagged with ``nltk.pos_tag``. Each distinct tag receives an
    integer id (1, 2, ...) in the order in which it is first seen, and every
    tweet becomes the list of its tag ids, which is then passed through
    ``pad_sequences`` (padding in front, truncating at the end).

    Args:
        data: Iterable of token lists, one per tweet.
        MAXLEN: Target sequence length handed to ``pad_sequences``.
        pos_dict: Optional dict mapping tag -> id. When given, existing ids
            are reused and unseen tags are added to it IN PLACE, so passing
            the same dict to several calls keeps the ids consistent between
            them. When omitted, a fresh mapping is built for this call only,
            which means ids from separate calls are not comparable.

    Returns:
        The result of ``pad_sequences`` for the tag-id lists.
    """
    if pos_dict is None:
        pos_dict = {}

    encoded = []
    for tokens in data:
        tag_ids = []
        for _token, tag in nltk.pos_tag(tokens):
            if tag not in pos_dict:
                # Next free id; also safe when a caller-supplied mapping
                # does not use contiguous ids.
                pos_dict[tag] = max(pos_dict.values(), default=0) + 1
            tag_ids.append(pos_dict[tag])
        encoded.append(tag_ids)

    return pad_sequences(encoded, maxlen=MAXLEN, padding="pre", truncating="post")

In [None]:
# Tag train and test tweets in ONE call and split afterwards.
# create_pos_sequence numbers the POS tags in first-seen order per call, so two
# separate calls could give the same tag different ids in the train and the
# test matrix. Training tweets come first in tweets_sameval.
n_train_tweets = len(corpustrain_sameval)
all_pos = create_pos_sequence(tweets_sameval, max_len)
train_pos = all_pos[:n_train_tweets]
test_pos = all_pos[n_train_tweets:]

# Create sentiment features

In [None]:
def blob_sentiment(data, MAXLEN):
    """Build padded per-token sentiment polarity sequences with TextBlob.

    Args:
        data: Iterable of token lists, one per tweet.
        MAXLEN: Target sequence length handed to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` (float32, padding in front, truncating
        at the end) for the per-token polarity lists.
    """
    # Each token is scored on its own, i.e. without sentence context.
    polarity_rows = [
        [TextBlob(token).sentiment.polarity for token in tokens]
        for tokens in data
    ]
    return pad_sequences(polarity_rows, maxlen=MAXLEN, padding="pre",
                         truncating="post", dtype='float32')

In [None]:
# Per-token sentiment features for both splits. tweets_sameval holds the
# training tweets first, so it is sliced at len(corpustrain_sameval); both
# matrices are padded to the same length as the word-id matrices (max_len).
train_senti_blob = blob_sentiment(tweets_sameval[:len(corpustrain_sameval)], max_len)
test_senti_blob = blob_sentiment(tweets_sameval[len(corpustrain_sameval):], max_len)

# Load pretrained embeddings

In [None]:
#Load pretrained embeddings
import gensim
from gensim.models import KeyedVectors

# Pretrained Google News word2vec embeddings, dimension 300.
GOOGLEEMB = path + 'GoogleNews-vectors-negative300.bin'

embedding= KeyedVectors.load_word2vec_format(GOOGLEEMB, binary=True)

# load_word2vec_format already returns the KeyedVectors object itself. Its
# `.wv` attribute is only a deprecated self-alias in gensim 3.x and no longer
# exists in gensim 4, so the loaded object is used directly.
word_vectors = embedding

In [None]:
# Create the embedding matrix; it serves the Keras Embedding layer as weights.
EMBEDDING_DIM=300

# One row per word id plus row 0. Word ids in word_index start at 1, so row 0
# is never assigned here and stays all-zero.
embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM))

for word, i in word_index.items():
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        # Words not found in the pretrained embeddings keep their all-zero row.
        # Only the lookup miss is ignored; any other error (e.g. a dimension
        # mismatch) is allowed to surface instead of silently zeroing the matrix.
        pass

# Train the model

In [None]:
# Training configuration; every value below is passed on to
# cross_val_three_inputs and train_evaluate_three_inputs in the cells further down.
BATCH_SIZE = 64
NUM_EPOCHS = 30
# NOTE(review): presumably tells the helpers that the output layer is the
# 2-unit softmax defined below -- confirm in Productive_CrossValidation_inclTest.
softmax = True
lossfunction ='binary_crossentropy'
optimizer = Adagrad(lr=0.001)

In [None]:
# The model: three input branches (word embeddings, per-token sentiment,
# per-token POS ids) that are concatenated and fed into a 2-unit softmax.

# First part: embedding layer with one row per word id (plus row 0),
# initialised from embedding_matrix and left trainable.
wv_layer = Embedding(len(word_index)+1,
                 EMBEDDING_DIM,
                 weights=[embedding_matrix],
                 input_length=max_len,
                 trainable=True)

# NOTE(review): this input feeds the Embedding lookup but is declared float32
# -- confirm that a float dtype is intended for the word-id sequences.
comment_input = Input(shape=(max_len,), dtype='float32')
embedded_sequences = wv_layer(comment_input)
# The dense layer is applied to the embedded sequence before it is flattened.
x_first = Dense(600, activation='relu')(embedded_sequences)
x_first = Flatten()(x_first)

# Second part: sentiment branch.
senti_input_blob = Input(shape=(max_len,), dtype='float')
x_second = Dense(600, activation='relu')(senti_input_blob)

# Third part: POS branch (the variable is named senti_input_pos, but the
# cells below pass train_pos / test_pos in this position).
senti_input_pos = Input(shape=(max_len,), dtype='float')
x_third = Dense(600, activation='relu')(senti_input_pos)

# Concatenation of the three branches.
concat = concatenate([x_first, x_second, x_third])

# Two output units with softmax activation.
preds = Dense(2, activation='softmax')(concat)

model = Model(inputs=[comment_input, senti_input_blob, senti_input_pos], outputs=preds)
model.summary()

# 10-fold cross validation

In [None]:
# Cross-validation on the training split only; the helper comes from the
# star import of Productive_CrossValidation_inclTest.
# NOTE(review): the already-built `model` and `optimizer` objects are passed in;
# whether the helper re-initialises the weights for every fold is not visible
# here -- confirm, otherwise folds would not be independent.
cross_val_three_inputs(model, NUM_EPOCHS, BATCH_SIZE, optimizer, lossfunction, train_data, train_senti_blob, train_pos, labelstrain_sameval, softmax, outputpath, 'samEvalCrosval')

# Test the model with the held-out test data set

In [None]:
# Train on the full training split and evaluate on the held-out test split;
# the helper comes from the star import of Productive_CrossValidation_inclTest.
# NOTE(review): this is the same `model` object that the cross-validation cell
# received earlier; whether the helper resets its weights before training is
# not visible here -- confirm.
train_evaluate_three_inputs(model, NUM_EPOCHS, BATCH_SIZE, optimizer, lossfunction, train_data, train_senti_blob, train_pos, labelstrain_sameval, test_data, test_senti_blob, test_pos, labelstest_sameval, softmax, outputpath, 'SamEval')