# Sarcasm Detection using a CNN, LSTM, and DNN

# Preprocessors
We combine the data at the bottom of the notebook in 

functions:
- load unicode mapping
- load word2vec
- load fast text
- initialize words
- normalize words
- split words
- split hashtags
- load abbreviations
- filter text
- parse data
- load data
- build vocab
- build reverse vocab
- vectorize word dimensions
- pad sequence 1D
- write vocab
- get fasttext weight
- get word2vec weight
- create ngram set
- prepend line
- prepend slow
- checksum
- check num lines in glove
- checksum in glove
- load glove word2vec
- get glove model

In [236]:
import os
import boto3
import re
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac
import sys
from sys import platform

sys.path.append('../')
from collections import defaultdict
import gensim
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.wrappers import FastText
import numpy as np
from nltk.tokenize import TweetTokenizer
import itertools
import shutil
import hashlib

role = get_execution_role()
bucket = 'sagemaker-lign167'# enter your s3 bucket where you will copy data and model artifacts
prefix = 'finalproject' # place to upload training files within the bucket

## Loading the mapping as well as processing the data
Vectorizing the words using GloVe and Word 2 Vec

In [238]:
def load_unicode_mapping(path):
    emoji_dict = defaultdict()
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('\t')
            emoji_dict[tokens[0]] = tokens[1]
    return emoji_dict

In [239]:
def load_word2vec(path=None):
    word2vecmodel = KeyedVectors.load_word2vec_format(path, binary=True)
    return word2vecmodel

In [240]:
def load_fasttext(path=None):
    word2vecmodel = FastText.load_fasttext_format(path)
    return word2vecmodel

In [241]:
def InitializeWords(word_file_path):
    word_dictionary = defaultdict()
    
    with open(word_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.lower().strip().split('\t')
            word_dictionary[tokens[0]] = int(tokens[1])
            
    for alphabet in "bcdefghijklmnopqrstuvwxyz":
        if (alphabet in word_dictionary):
                word_dictionary.__delitem__(alphabet)
                
    for word in ['ann', 'assis'
                 'bz',
                 'ch', 'cre', 'ct',
                 'di',
                 'ed', 'ee',
                 'ic',
                 'le',
                 'ng', 'ns',
                 'pr', 'picon',
                 'th', 'tle', 'tl', 'tr',
                 'um',
                 've',
                 'yi']:
        if (word in word_dictionary):
            word_dictionary.__delitem__(word)
                
    return word_dictionary

In [242]:
def normalize_word(word):
    temp = word
    while True:
        w = re.sub(r"([a-zA-Z])\1\1", r"\1\1", temp)
        if (w == temp):
            break
        else:
            temp = w
    return w

In [243]:
def load_split_word(split_word_file_path):
    split_word_dictionary = defaultdict()
    with open(split_word_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.lower().strip().split('\t')
            if (len(tokens) >= 2):
                split_word_dictionary[tokens[0]] = tokens[1]
    
    print('split entry found:', len(split_word_dictionary.keys()))
    return split_word_dictionary

In [244]:
def split_hashtags(term, wordlist, split_word_list, dump_file=''):
    if (len(term.strip()) == 1):
        return ['']
    
    if (split_word_list != None and term.lower() in split_word_list):
        return split_word_list.get(term.lower()).split(' ')
    #else:
        #print(term)
        
    if (term.startswith('#')):
        term = term[1:]
    
    if (wordlist != None and term.lower() in wordlist):
        return [term.lower()]
    
    words = []
    penalty = -69971
    max_coverage = penalty
    
    split_words_count = 6
    
    term = re.sub(r'([0-9]+)', r' \1', term)
    term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1', term)
    term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip())
    term = re.sub(r'([A=Z]{2,})+', r' \1', term)
    words = term.strip().split(' ')
    
    n_splits = 0
    
    if (len(words) < 3):
        chars = [c for c in term.lower()]
        
        found_all_words = False
        
        while (n_splits < split_words_count and not found_all_words):
            for idx in itertools.combinations(range(0, len(chars)), n_splits):
                output = np.split(chars, idx)
                line = [''.join(o) for o in output]
                
                score = (1. / len(line)) *sum([wordlist.get( word.strip()) if word.strip() in wordlist else 0. 
                                               if word.strip().isnumeric() else penalty for word in line])
                
                if (score > max_coverage):
                    words = line
                    max_coverage = score
                    
                    line_is_valid_word = [word.strip() in wordlist if not word.isnumeric() else True for word in line]
                    
                    if (all(line_is_valid_word)):
                        found_all_words = True
            n_splits += 1
    
    with open(dump_file, 'a', encoding='utf-8') as f:
        if (term != '' and len(words) > 0):
            f.write('#'+str(term).strip() + '\t' + ' '.join(words) + '\t' + str(n_splits) + '\n')
        
    return words

In [245]:
def load_abbreviation(path='../data/twitter/abbreviations.txt'):
    abbreviation_dict = defaultdict()
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            token = line.lower().strip().split('\t')
            abbreviation_dict[token[0]] = token[1]
        return abbreviation_dict

In [246]:
def filter_text(text, word_list, split_word_list, emoji_dict, abbreviation_dict, dump_file_path,
                normalize_text = False, split_hashtag = False,
                ignore_profiles = False, replace_emoji= True):
    filtered_text = []
    
    filter_list = ['/', '-', '=', '+', '…', '\\', '(', ')', '&', ':']
    
    for t in text:
        word_tokens = None
        
        if (ignore_profiles and str(t).startswith("@")):
            continue
            
        if (str(t).startswith('http')):
            continue
            
        if (str(t).lower() in ['#sarcasm']):
            continue
            
        if (replace_emoji):
            if (t in emoji_dict):
                t = emoji_dict.get(t).split('_')
                filtered_text.extend(t)
                continue
        if (split_hashtag and str(t).startswith("#")):
            splits = split_hashtags(t, word_list, split_word_list, 
                                    dump_file=dump_file_path)
            if (splits != None):
                filtered_text.extend([s for s in splits if (not filtered_text.__contains__(s))])
                continue
        
        if (normalize_text):
            t = normalize_word(t)
            
        if (t in abbreviation_dict):
            tokens = abbreviation_dict.get(t).split(' ')
            filtered_text.extend(tokens)
            continue
        
        filtered_text.append(t)
    
    return filtered_text

In [247]:
def parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, dump_file_path,
              normalize_text = False, split_hashtag = False, ignore_profiles = False,
              lower_case = False, replace_emoji = True, n_grams = None, at_character = False,):
    data = []
    for i, line in enumerate(lines):
        if (i % 10000 == 0):
            print(str(i) + '...', end='', flush=True)
        
        try:
            if (lower_case):
                line = line.lower()
            
            token = line.split('\t')
            id = token[0]
            
            label = int(token[1].strip())
            
            target_text = TweetTokenizer().tokenize(token[2].strip())
            if (at_character):
                target_text = [c for c in token[2].strip()]
                
            if (n_grams != None):
                n_grams_list = list(create_ngram_set(target_text, ngram_values=n_grams))
                target_text.extend(['_'.join(n) for n in n_grams_list])
                
            target_text = filter_text(target_text, word_list, split_word_list, emoji_dict,
                                      abbreviation_dict, dump_file_path, normalize_text, split_hashtag,
                                      ignore_profiles, replace_emoji=replace_emoji)
            
            dimensions = []
            if (len(token) > 3 and token[3].strip() != 'NA'):
                dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]
                
            context = []
            if (len(token) > 4):
                if (token[4] != 'NA'):
                    context = TweetTokenizer().tokenize(token[4].strip())
                    context = filter_text(context, word_list, split_word_list, emoji_dict,
                                         abbreviation_dict, dump_file_path, normalize_text, split_hashtag,
                                         ignore_profiles, replace_emoji = replace_emoji)
                    
            author = 'NA'
            if (len(token) > 5):
                author = token[5]
            
            if (len(target_text) != 0):
                data.append((id, label, target_text, dimensions, context, author))
        except:
            raise
    print('')
    return data

## Loading Data

In [248]:
def loaddata(filename, word_file_path, split_word_path, emoji_file_path, dump_file_path, normalize_text=False,
             split_hashtag=False, ignore_profiles=False, lower_case=True, replace_emoji=True,
             n_grams=None, at_character=False):
    word_list = None
    emoji_dict = None
    
    split_word_list = load_split_word(split_word_path)
    
    if (split_hashtag):
        word_list = InitializeWords(word_file_path)
        
    if (replace_emoji):
        emoji_dict = load_unicode_mapping(emoji_file_path)
        
    abbreviation_dict = load_abbreviation()
    
    lines = open(filename, 'r', encoding='utf-8').readlines()
    
    data = parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, dump_file_path, 
                     normalize_text=normalize_text, split_hashtag=split_hashtag,
                     ignore_profiles=ignore_profiles, lower_case=lower_case, 
                     replace_emoji=replace_emoji, n_grams=n_grams, at_character=at_character)
    return data

In [249]:
def build_vocab(data, without_dimension=True, ignore_context=False, min_freq=0):
    vocab = defaultdict(int)
    vocab_freq = defaultdict(int)
    
    total_words = 1
    if ( not without_dimension):
        for i in range(1, 101):
            vocab_freq[str(i)] = 0
    for sentence_no, token in enumerate(data):
        for word in token[2]:
            if (word not in vocab_freq):
                vocab_freq[word] = 0
            vocab_freq[word] = vocab_freq.get(word) + 1
        
        if (not without_dimension):
            for word in token[3]:
                vocab_freq[word] = vocab_freq.get(word) + 1
        
        if (ignore_context == False):
            for word in token[4]:
                if (not word in vocab):
                    vocab_freq[word] = 0
                vocab_freq[word] = vocab_freq.get(word) + 1
    
    for k, v in vocab_freq.items():
        if (v >= min_freq):
            vocab[k] = total_words
            total_words = total_words + 1
    return vocab

In [250]:
def build_reverse_vocab(vocab):
    rev_vocab = defaultdict(str)
    for k, v in vocab.items():
        rev_vocab[v] = k
    return rev_vocab

In [251]:
def vectorize_word_dimension(data, vocab, drop_dimension_index = None):
    X, Y, D, C, A = [], [], [], [], []
    
    known_words_set = set()
    unknown_words_set = set()
    
    tokens = 0
    token_coverage = 0
    
    for id, label, line, dimensions, context, author in data:
        vec = []
        context_vec = []
        if (len(dimensions) != 0):
            dvec = [vocab.get(d) for d in dimensions]
        else:
            dvec = [vocab.get('unk')] * 11
        
        if drop_dimension_index != None:
            dvec.pop(drop_dimensions_index)
            
        for words in line:
            tokens = tokens + 1
            if (words in vocab):
                vec.append(vocab[words])
                token_coverage = token_coverage + 1
                known_words_set.add(words)
            else:
                vec.append(vocab['unk'])
                unknown_words_set.add(words)
                
        if (len(context) != 0):
            for words in line:
                tokens = tokens + 1
                if (words in vocab):
                    context_vec.append(vocab[words])
                    token_coverage = token_coverage + 1
                    known_words_set.add(words)
                else:
                    context_vec.append(vocab['unk'])
                    unknown_words_set.add(words)
        else:
            context_vec = [vocab['unk']]
            
        X.append(vec)
        Y.append(label)
        D.append(dvec)
        C.append(context_vec)
        A.append(author)
        
    print('Token coverage:', token_coverage / float(tokens))
    print('Word coverage:', len(known_words_set) / float(len(vocab.keys())))
    return np.asarray(X), np.asarray(Y), np.asarray(D), np.asarray(C), np.asarray(A)

In [252]:
def pad_sequence_1d(sequences, maxlen=None, dtype='float32',
                    padding='pre', truncating='pre', value=0.):
    X = [vectors for vectors in sequences]
    
    nb_samples = len(X)
    
    x = (np.zeros((nb_samples, maxlen)) * value).astype(dtype)
    
    for idx, s in enumerate(X):
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncation == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError("padding type '%s' not understood" % padding)
            
        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError("Padding type '%s' not understood" % padding)
        
    return x

In [253]:
def write_vocab(filepath, vocab):
    with open(filepath, 'w', encoding='utf-8') as fw:
        for key, value in vocab.items():
            fw.write(str(key) + '\t' + str(value) + '\n')

In [254]:
def get_fasttext_weight(vocab, n=300, path=None):
    fasttextmodel = load_fasttext(path=path)
    emb_weights = np.zeros((len(vocab.keys()) + 1, n))
    for k, v in vocab.items():
        if (fasttext.__contains__(k)):
            emb_weights[v, :] = fasttext[k][:n]
    
    return emb_weights

In [255]:
def get_word2vec_weight(vocab, n=300, path=None):
    word2vecmodel = load_word2vec(path=path)
    emb_weights = np.zeros((len(vocab.keys()) + 1, n))
    for k, v in vocab.items():
        if (word2vecmodel.__contains__(k)):
            emb_weights[v, :] = word2vecmodel[k][:n]

    return emb_weights

In [256]:
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

In [257]:
def prepend_line(infile, outfile, line):
    with open(infile, 'r', encoding='utf-8') as old:
        with open(outfile, 'w', encoding='utf-8') as new:
            new.write(str(line) + "\n")
            shutil.copyfileobj(old, new)

In [258]:
def prepend_slow(infile, outfile, line):
    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf-8') as fout:
            fout.write(line + "\n")
            for line in fin:
                fout.write(line)

In [259]:
def checksum(filename):
    BLOCKSIZE = 65536
    hasher = hashlib.md5()
    with open(filename, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()

In [260]:
pretrain_num_lines = {"glove.840B.300d.txt": 2196017, "glove.42B.300d.txt": 1917494}

pretrain_checksum = {
    "glove.6B.300d.txt": "b78f53fb56ec1ce9edc367d2e6186ba4",
    "glove.twitter.27B.50d.txt": "6e8369db39aa3ea5f7cf06c1f3745b06",
    "glove.42B.300d.txt": "01fcdb413b93691a7a26180525a12d6e",
    "glove.6B.50d.txt": "0fac3659c38a4c0e9432fe603de60b12",
    "glove.6B.100d.txt": "dd7f3ad906768166883176d69cc028de",
    "glove.twitter.27B.25d.txt": "f38598c6654cba5e6d0cef9bb833bdb1",
    "glove.6B.200d.txt": "49fa83e4a287c42c6921f296a458eb80",
    "glove.840B.300d.txt": "eec7d467bccfa914726b51aac484d43a",
    "glove.twitter.27B.100d.txt": "ccbdddec6b9610196dd2e187635fee63",
    "glove.twitter.27B.200d.txt": "e44cdc3e10806b5137055eeb08850569",
}

def check_num_lines_in_glove(filename, check_checksum=False):
    if check_checksum:
        assert checksum(filename) == pretrain_checksum[filename]
    if filename.startswith('glove.6B.'):
        return 400000
    elif filename.startswith('glove.twitter.27B.'):
        return 1193514
    else:
        return pretrain_num_lines[filename]

In [261]:
def load_glove_word2vec(filename):
    # Input: GloVe Model File
    # More models can be downloaded from http://nlp.stanford.edu/projects/glove/
    # print(filename[filename.rfind('/')+1:])

    glove_file = filename[filename.rfind('/')+1:]
    _, _,tokens, dimensions, _ = glove_file.split('.')
    num_lines = check_num_lines_in_glove(glove_file)
    dims = int(dimensions[:-1])

    # Output: Gensim Model text format.
    gensim_file = '/data/twitter/glove/glove_model.txt'
    gensim_first_line = "{} {}".format(num_lines, dims)

    # Prepends the line.
    if platform == "linux" or platform == "linux2":
        prepend_line(filename, gensim_file, gensim_first_line)
    else:
        prepend_slow(filename, gensim_file, gensim_first_line)

    # Demo: Loads the newly created glove_model.txt into gensim API.
    model = gensim.models.Word2Vec.load_word2vec_format(gensim_file, binary=False)  # GloVe Model

    return model

In [262]:
def get_glove_model(vocab, n=200):
    word2vecmodel = load_glove_word2vec('../data/twitter/glove/glove.twitter.27B/glove.twitter.27B.200d.txt')

    emb_weights = np.zeros((len(vocab.keys()) + 1, n))
    for k, v in vocab.items():
        if (word2vecmodel.__contains__(k)):
            emb_weights[v, :] = word2vecmodel[k][:n]

    return emb_weights

# Importing Datasets

In [263]:
    basepath = os.path.dirname(os.getcwd())
    tw_train_file = basepath + '/data/twitter/train/Train_v1.txt'
    tw_validation_file = basepath + '/data/twitter/Dev_v1.txt'
    tw_test_file = basepath + '/data/twitter/test/Test_v1.txt'
    tw_word_file_path = basepath + '/data/twitter/word_list_freq.txt'
    tw_split_word_path = basepath + '/data/twitter/word_split.txt'
    tw_emoji_file_path = basepath + '/data/twitter/emoji_unicode_names_final.txt'
    comb_train_file = basepath + '/data/combinded/train/Train_v1.txt'
    comb_validation_file = basepath + '/data/combinded/Dev_v1.txt'
    comb_test_file = basepath + '/data/combinded/test/Test_v1.txt'
    comb_word_file_path = basepath + '/data/combinded/word_list_freq.txt'
    comb_split_word_path = basepath + '/data/combinded/word_split.txt'
    comb_emoji_file_path = basepath + '/data/combinded/emoji_unicode_names_final.txt'

    tw_lstm_output_file = basepath + '/models/twitter/LSTM/TestResults.txt'
    tw_lstm_model_file = basepath + '/models/twitter/LSTM/weights/'
    tw_lstm_vocab_file_path = basepath + '/models/twitter/LSTM/vocab_list.txt'
    tw_cnn_output_file = basepath + '/models/twitter/CNN/TestResults.txt'
    tw_cnn_model_file = basepath + '/models/twitter/CNN/weights/'
    tw_cnn_vocab_file_path = basepath + '/models/twitter/CNN/vocab_list.txt'
    tw_dnn_output_file = basepath + '/models/twitter/DNN/TestResults.txt'
    tw_dnn_model_file = basepath + '/models/twitter/DNN/weights/'
    tw_dnn_vocab_file_path = basepath + '/models/twitter/DNN/vocab_list.txt'
    tw_hashtag_split_file_path = basepath + '/data/twitter/hashtag_split_dump.txt'
    comb_lstm_output_file = basepath + '/models/combinded/LSTM/TestResults.txt'
    comb_lstm_model_file = basepath + '/models/combinded/LSTM/weights/'
    comb_lstm_vocab_file_path = basepath + '/models/combinded/LSTM/vocab_list.txt'
    comb_cnn_output_file = basepath + '/models/combinded/CNN/TestResults.txt'
    comb_cnn_model_file = basepath + '/models/combinded/CNN/weights/'
    comb_cnn_vocab_file_path = basepath + '/models/combinded/CNN/vocab_list.txt'
    comb_dnn_output_file = basepath + '/models/combinded/DNN/TestResults.txt'
    comb_dnn_model_file = basepath + '/models/combinded/DNN/weights/'
    comb_dnn_vocab_file_path = basepath + '/models/combinded/DNN/vocab_list.txt'
    comb_hashtag_split_file_path = basepath + '/data/combinded/hashtag_split_dump.txt'   

In [309]:
np.random.seed(1337)
from sklearn import metrics
from keras.models import Sequential, model_from_json
from keras.layers.core import Dropout, Dense, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Convolution1D, MaxPooling1D,Convolution2D
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, SGD
from keras.utils import np_utils
from collections import defaultdict
import collections
import time

# Declaring the models

In [336]:
class sarcasm_model():
    _train_file = None
    _test_file = None
    _tweet_file = None
    _output_file = None
    _model_file_path = None
    _word_file_path = None
    _split_word_file_path = None
    _emoji_file_path = None
    _vocab_file_path = None
    _input_weight_file_path = None
    _vocab = None
    _line_maxlen = None
    _dump_file_path = None
    def __init__(self):
        self._line_maxlen = 30
        
    def _build_CNN_LSTM_DNN_network(self, vocab_size, maxlen, embedding_dimension=256,
                                    hidden_units=256, trainable=False):
        print('Build model...')
        model = Sequential()
        
        model.add(Embedding(vocab_size, embedding_dimension, input_length=maxlen,
                           embeddings_initializer='glorot_normal'))
        
        model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid',
                               activation='sigmoid',input_shape=(1, maxlen)))
        
        model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid',
                               activation='sigmoid',input_shape=(1, maxlen - 2)))
        model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5,
                       return_sequences=True))
        
        model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5))
        model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid'))
        model.add(Dense(2))
        model.add(Activation('softmax'))
        adam = Adam(lr=0.0001)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        print('No of parameter:', model.count_params())

        print(model.summary())
        return model
    def _build_LSTM_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256,
                            trainable=False):
        print('Build model...')
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dimension, input_length=maxlen,
                           embeddings_initializer='glorot_normal'))
        model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
                       dropout=0.2, return_sequences=True))
        model.add(LSTM(2, kernel_initializer='he_normal', activation='sigmoid'))
        adam = Adam(lr=0.00001)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        print('No of parameter:', model.count_params())

        print(model.summary())
        return model
    def _build_CNN_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256,
                           trainable=False):
        print('Build model...')
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dimension, input_length=maxlen,
                           embeddings_initializer='glorot_normal'))
        
        model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid',
                               activation='sigmoid',input_shape=(1, maxlen)))
        model.add(MaxPooling1D(pool_size=3))
        model.add(Dropout(0.25))
        model.add(Dense(activation = 'softmax', units=2))
        adam = Adam(lr=0.00001)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        print('No of parameter:', model.count_params())

        print(model.summary())
        return model

    
    def _build_standalone_CNN_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256,
                           trainable=False):
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dimension, input_length=maxlen,
                           embeddings_initializer='glorot_normal'))
        model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid',
                               activation='sigmoid',input_shape=(1, maxlen)))
        model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid',
                               activation='sigmoid',input_shape=(1, maxlen - 2)))
        model.add(Flatten())
        model.add(Dense(2))
        sgd = SGD(lr=0.00001)
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
        print('No of parameter:', model.count_params())

        print(model.summary())
        return model

# Train Model

In [337]:
class train_model(sarcasm_model):
    train = None
    validation = None
    model_type = 'CNN'
    print("Loading data...")
    
    def __init__(self, model_type, train_file, validation_file, word_file_path, split_word_path, emoji_file_path,
                 model_file, vocab_file, output_file, dump_file_path, input_weight_file_path=None):
        sarcasm_model.__init__(self)
        self.model_type = model_type
        self._train_file = train_file
        self._validation_file = validation_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._model_file = model_file
        self._vocab_file_path = vocab_file
        self._output_file = output_file
        self._input_weight_file_path = input_weight_file_path
        self._dump_file_path = dump_file_path
        
        self.load_train_validation_data()
        
        print(self._line_maxlen)
        
        self._vocab = build_vocab(self.train, min_freq=1)
        if ('unk' not in self._vocab):
            self._vocab['unk'] = len(self._vocab.keys()) + 1
        
        print(len(self._vocab.keys()) + 1)
        print('unk::', self._vocab['unk'])
        write_vocab(self._vocab_file_path, self._vocab)
        
        X, Y, D, C, A = vectorize_word_dimension(self.train, self._vocab)
        X = pad_sequence_1d(X, maxlen=self._line_maxlen)
        
        vX, vY, vD, vC, vA = vectorize_word_dimension(self.validation, self._vocab)
        vX = pad_sequence_1d(vX, maxlen=self._line_maxlen)
        
        dimension_size = 256
        
        ratio = self.calculate_label_ratio(Y)
        ratio = [max(ratio.values()) / value for key, value in ratio.items()]
        print('class ratio::', ratio)
        
        Y, vY = [np_utils.to_categorical(x) for x in (Y, vY)]

        print('train_X', X.shape)
        print('train_Y', Y.shape)
        print('validation_X', vX.shape)
        print('validation_Y', vY.shape)
        
        if model_type == 'CNN':
            model = self._build_CNN_network(len(self._vocab.keys()) + 1, self._line_maxlen,
                                            embedding_dimension=dimension_size, trainable=True)
        elif model_type == 'LSTM':
            model = self._build_LSTM_network(len(self._vocab.keys()) + 1, self._line_maxlen,
                                             embedding_dimension=dimension_size, trainable=True)
        elif model_type == 'DNN':
            model = self._build_CNN_LSTM_DNN_network(len(self._vocab.keys()) + 1, self._line_maxlen,
                                                     embedding_dimension=dimension_size, 
                                                     trainable=True)
        elif model_type == 'CNN_1D':
              model = self._build_standalone_CNN_network(len(self._vocab.keys()) + 1, self._line_maxlen,
                                                     embedding_dimension=dimension_size, 
                                                     trainable=True)
        open(self._model_file + 'model.json', 'w').write(model.to_json())
        save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
        save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
                                   save_best_only=False)
        early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

        # training
        model.fit(X, Y, batch_size=8, epochs=10, validation_data=(vX, vY), shuffle=True,
                  callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
        
    def load_train_validation_data(self):
        self.train = loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
                                 self._emoji_file_path, self._dump_file_path, normalize_text=True,
                                 split_hashtag=False,
                                 ignore_profiles=False)
        print('Training data loading finished...')

        self.validation = loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
                                      self._emoji_file_path, self._dump_file_path,
                                      normalize_text=True,
                                      split_hashtag=False,
                                      ignore_profiles=False)
        print('Validation data loading finished...')

        if (self._test_file != None):
            self.test = loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
                                        self._emoji_file_path, self._dump_file_path,
                                        normalize_text=True,
                                        split_hashtag=True,
                                        ignore_profiles=True)
    def get_maxlen(self):
        return max(map(len, (x for _, x in self.train + self.validation)))

    def write_vocab(self):
        with open(self._vocab_file_path, 'w', encoding='utf-8') as fw:
            for key, value in self._vocab.iteritems():
                fw.write(str(key) + '\t' + str(value) + '\n')

    def calculate_label_ratio(self, labels):
        return collections.Counter(labels)


Loading data...


# Test Model 

In [338]:
class test_model(sarcasm_model):
    test = None
    model = None

    
    def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path,
                output_file, dump_file_path, input_weight_file_path=None):
        print('Initializing...')
        sarcasm_model.__init__(self)
        
        self._model_file_path = model_file
        self._word_file_path = word_file_path
        self._split_word_file_path = split_word_path
        self._emoji_file_path = emoji_file_path
        self._vocab_file_path = vocab_file_path
        self._output_file = output_file
        self._input_weight_file_path = input_weight_file_path
        self._dump_file_path = dump_file_path
        
        print('test_maxlen', self._line_maxlen)
    
    def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'):
        start = time.time()
        self.__load_model(self._model_file_path + model_file, self._model_file_path + weight_file)
        end = time.time()
        print('model loading time::', (end - start))

    def __load_model(self, model_path, model_weight_path):
        self.model = model_from_json(open(model_path, encoding='utf-8').read())
        print('model loaded from file...')
        self.model.load_weights(model_weight_path)
        print('model weights loaded from file...')

    def load_vocab(self):
        vocab = defaultdict()
        with open(self._vocab_file_path, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                key, value = line.split('\t')
                vocab[key] = value
        return vocab
    
    def predict(self, test_file, verbose=False):
        try:
            start = time.time()
            self.test = loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
                                    self._dump_file_path, normalize_text=True, split_hashtag=True,
                                    ignore_profiles=False)
            end = time.time()
            if (verbose == True):
                print('test resource loading time::', (end - start))

            self._vocab = self.load_vocab()
            print('vocab loaded...')

            start = time.time()
            tX, tY, tD, tC, tA = vectorize_word_dimension(self.test, self._vocab)
            tX = pad_sequence_1d(tX, maxlen=self._line_maxlen)
            end = time.time()
            if (verbose == True):
                print('test resource preparation time::', (end - start))

            self.__predict_model(tX, self.test)
        except Exception as e:
            print('Error:', e)
    
    def __predict_model(self, tX, test):
        y = []
        y_pred = []
        prediction_probability = self.model.predict(tX, batch_size=1, verbose=1)
        try:
            fd = open(self._output_file + '.analysis', 'w', encoding='utf-8')
            for i, (label) in enumerate(prediction_probability):
                id = test[i][0]
                gold_label = test[i][1]
                words = test[i][2]
                dimensions = test[i][3]
                context = test[i][4]
                author = test[i][5]

                predicted = np.argmax(prediction_probability[i])

                y.append(int(gold_label))
                y_pred.append(predicted)

                fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
                         + str(gold_label) + '\t'
                         + str(predicted) + '\t'
                         + ' '.join(words))

                fd.write('\n')

            print('accuracy::', metrics.accuracy_score(y, y_pred))
            print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
            print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
            print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
            print('f_score::', metrics.classification_report(y, y_pred))
            fd.close()
        except Exception as e:
            print(e)
            

# Twitter Dataset Evaluation

In [166]:
twitter_LSTM_tr = train_model('LSTM', tw_train_file, tw_validation_file, tw_word_file_path, 
                              tw_split_word_path, tw_emoji_file_path, tw_lstm_model_file,
                              tw_lstm_vocab_file_path, tw_hashtag_split_file_path, tw_lstm_output_file)

split entry found: 10942
0...10000...20000...30000...
Training data loading finished...
split entry found: 10942
0...
Validation data loading finished...
30
33892
unk:: 33891
Token coverage: 1.0
Word coverage: 0.9999704936413797
Token coverage: 0.9858098520170282
Word coverage: 0.13006402879820603
class ratio:: [1.0, 1.151665945478148]
train_X (39780, 30)
train_Y (39780, 2)
validation_X (1605, 30)
validation_Y (1605, 2)
Build model...
No of parameter: 9203736
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 30, 256)           8676352   
_________________________________________________________________
lstm_7 (LSTM)                (None, 30, 256)           525312    
_________________________________________________________________
lstm_8 (LSTM)                (None, 2)                 2072      
Total params: 9,203,736
Trainable params: 9,203,736
Non-trainable params: 

In [170]:
twitter_LSTM_te = test_model(tw_lstm_model_file, tw_word_file_path, tw_split_word_path,
                             tw_emoji_file_path, tw_lstm_vocab_file_path, tw_hashtag_split_file_path, tw_lstm_output_file)
twitter_LSTM_te.load_trained_model(weight_file='weights.07__.hdf5')
twitter_LSTM_te.predict(tw_test_file)

Initializing...
test_maxlen 30
model loaded from file...
model weights loaded from file...
model loading time:: 1.713092565536499
split entry found: 10942
0...
vocab loaded...
Token coverage: 0.9882122357552849
Word coverage: 0.12487090968103626
accuracy:: 0.772
precision:: 0.7721088435374149
recall:: 0.772
f_score:: 0.771977197719772
f_score::              precision    recall  f1-score   support

          0       0.77      0.78      0.77      1000
          1       0.78      0.76      0.77      1000

avg / total       0.77      0.77      0.77      2000



In [334]:
# twitter_CNN_tr = train_model('CNN_1D', tw_train_file, tw_validation_file, tw_word_file_path, 
#                               tw_split_word_path, tw_emoji_file_path, tw_cnn_model_file,
#                               tw_cnn_vocab_file_path, tw_hashtag_split_file_path, tw_cnn_output_file)

In [None]:
twitter_CNN_te = test_model(tw_cnn_model_file, tw_word_file_path, tw_split_word_path,
                             tw_emoji_file_path, tw_cnn_vocab_file_path, tw_hashtag_split_file_path, tw_cnn_output_file)
twitter_CNN_te.load_trained_model(weight_file='weights.05__.hdf5')
twitter_CNN_te.predict(tw_test_file)

In [208]:
twitter_DNN_tr = train_model('DNN', tw_train_file, tw_validation_file, tw_word_file_path, 
                              tw_split_word_path, tw_emoji_file_path, tw_dnn_model_file,
                              tw_dnn_vocab_file_path, tw_hashtag_split_file_path, tw_dnn_output_file)

split entry found: 10942
0...10000...20000...30000...
Training data loading finished...
split entry found: 10942
0...
Validation data loading finished...
30
33892
unk:: 33891
Token coverage: 1.0
Word coverage: 0.9999704936413797
Token coverage: 0.9858098520170282
Word coverage: 0.13006402879820603
class ratio:: [1.0, 1.151665945478148]
train_X (39780, 30)
train_Y (39780, 2)
validation_X (1605, 30)
validation_Y (1605, 2)
Build model...
No of parameter: 10187010
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 30, 256)           8676352   
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 28, 256)           196864    
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 26, 256)           196864    
_________________________________________________________________
lstm_9 

In [209]:
twitter_DNN_te = test_model(tw_dnn_model_file, tw_word_file_path, tw_split_word_path,
                             tw_emoji_file_path, tw_dnn_vocab_file_path, tw_hashtag_split_file_path, tw_dnn_output_file)
twitter_DNN_te.load_trained_model(weight_file='weights.09__.hdf5')
twitter_DNN_te.predict(tw_test_file)

Initializing...
test_maxlen 30
model loaded from file...
model weights loaded from file...
model loading time:: 2.7530746459960938
split entry found: 10942
0...
vocab loaded...
Token coverage: 0.9882122357552849
Word coverage: 0.12487090968103626
accuracy:: 0.8825
precision:: 0.8855539730202938
recall:: 0.8825
f_score:: 0.8822668589474306
f_score::              precision    recall  f1-score   support

          0       0.92      0.84      0.88      1000
          1       0.85      0.93      0.89      1000

avg / total       0.89      0.88      0.88      2000



## The Start of the Combined Dataset

In [347]:
comb_LSTM_tr = train_model('LSTM', comb_train_file, comb_validation_file, comb_word_file_path, 
                              comb_split_word_path, comb_emoji_file_path, comb_lstm_model_file,
                              comb_lstm_vocab_file_path, comb_hashtag_split_file_path, comb_lstm_output_file)

split entry found: 10942
0...10000...20000...30000...40000...50000...60000...70000...80000...
Training data loading finished...
split entry found: 10942
0...
Validation data loading finished...
30
56946
unk:: 56945
Token coverage: 1.0
Word coverage: 0.9999824391957152
Token coverage: 0.9737100635109518
Word coverage: 0.1363772060760383
class ratio:: [1.08667772523582, 1.0]
train_X (84283, 30)
train_Y (84283, 2)
validation_X (4681, 30)
validation_Y (4681, 2)
Build model...
No of parameter: 15105560
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_43 (Embedding)     (None, 30, 256)           14578176  
_________________________________________________________________
lstm_15 (LSTM)               (None, 30, 256)           525312    
_________________________________________________________________
lstm_16 (LSTM)               (None, 2)                 2072      
Total params: 15,105,560
Trainable 

In [348]:
comb_LSTM_te = test_model(comb_lstm_model_file, comb_word_file_path, comb_split_word_path,
                             comb_emoji_file_path, comb_lstm_vocab_file_path, comb_hashtag_split_file_path, comb_lstm_output_file)
comb_LSTM_te.load_trained_model(weight_file='weights.10__.hdf5')
comb_LSTM_te.predict(comb_test_file)

Initializing...
test_maxlen 30
model loaded from file...
model weights loaded from file...
model loading time:: 4.124856472015381
split entry found: 10942
0...
vocab loaded...
Token coverage: 0.9754688361831219
Word coverage: 0.13620159803318993
accuracy:: 0.6432673899170389
precision:: 0.6429775830002576
recall:: 0.6432673899170389
f_score:: 0.6399590314882202
f_score::              precision    recall  f1-score   support

          0       0.64      0.73      0.68      2495
          1       0.64      0.55      0.59      2206

avg / total       0.64      0.64      0.64      4701



In [None]:
comb_CNN_tr = train_model('CNN_1D', comb_train_file, comb_validation_file, comb_word_file_path, 
                              comb_split_word_path, comb_emoji_file_path, comb_cnn_model_file,
                              comb_cnn_vocab_file_path, comb_hashtag_split_file_path, comb_cnn_output_file)

In [None]:
comb_CNN_te = test_model(comb_cnn_model_file, comb_word_file_path, comb_split_word_path,
                             comb_emoji_file_path, comb_cnn_vocab_file_path, comb_hashtag_split_file_path, comb_cnn_output_file)
comb_CNN_te.load_trained_model(weight_file='weights.05__.hdf5')
comb_CNN_te.predict(comb_test_file)

In [350]:
comb_DNN_tr = train_model('DNN', comb_train_file, comb_validation_file, comb_word_file_path, 
                              comb_split_word_path, comb_emoji_file_path, comb_dnn_model_file,
                              comb_dnn_vocab_file_path, comb_hashtag_split_file_path, comb_dnn_output_file)

split entry found: 10942
0...10000...20000...30000...40000...50000...60000...70000...80000...
Training data loading finished...
split entry found: 10942
0...
Validation data loading finished...
30
56946
unk:: 56945
Token coverage: 1.0
Word coverage: 0.9999824391957152
Token coverage: 0.9737100635109518
Word coverage: 0.1363772060760383
class ratio:: [1.08667772523582, 1.0]
train_X (84283, 30)
train_Y (84283, 2)
validation_X (4681, 30)
validation_Y (4681, 2)
Build model...
No of parameter: 16088834
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_45 (Embedding)     (None, 30, 256)           14578176  
_________________________________________________________________
conv1d_57 (Conv1D)           (None, 28, 256)           196864    
_________________________________________________________________
conv1d_58 (Conv1D)           (None, 26, 256)           196864    
___________________________________

In [354]:
comb_DNN_te = test_model(comb_dnn_model_file, comb_word_file_path, comb_split_word_path,
                             comb_emoji_file_path, comb_dnn_vocab_file_path, comb_hashtag_split_file_path, comb_dnn_output_file)
comb_DNN_te.load_trained_model(weight_file='weights.06__.hdf5')
comb_DNN_te.predict(comb_test_file)

Initializing...
test_maxlen 30
model loaded from file...
model weights loaded from file...
model loading time:: 5.138683319091797
split entry found: 10942
0...
vocab loaded...
Token coverage: 0.9754688361831219
Word coverage: 0.13620159803318993
accuracy:: 0.6773027015528611
precision:: 0.6770971219720416
recall:: 0.6773027015528611
f_score:: 0.6771775510849605
f_score::              precision    recall  f1-score   support

          0       0.69      0.70      0.70      2495
          1       0.66      0.65      0.65      2206

avg / total       0.68      0.68      0.68      4701



# Reddit Data Evaluation

In [352]:
red_train_file = basepath + '/data/reddit/train/Train_v1.txt'
red_validation_file = basepath + '/data/reddit/Dev_v1.txt'
red_test_file = basepath + '/data/reddit/test/Test_v1.txt'
red_word_file_path = basepath + '/data/reddit/word_list_freq.txt'
red_split_word_path = basepath + '/data/reddit/word_split.txt'
red_emoji_file_path = basepath + '/data/reddit/emoji_unicode_names_final.txt'

red_lstm_output_file = basepath + '/models/reddit/LSTM/TestResults.txt'
red_lstm_model_file = basepath + '/models/reddit/LSTM/weights/'
red_lstm_vocab_file_path = basepath + '/models/reddit/LSTM/vocab_list.txt'
red_cnn_output_file = basepath + '/models/reddit/CNN/TestResults.txt'
red_cnn_model_file = basepath + '/models/reddit/CNN/weights/'
red_cnn_vocab_file_path = basepath + '/models/reddit/CNN/vocab_list.txt'
red_dnn_output_file = basepath + '/models/reddit/DNN/TestResults.txt'
red_dnn_model_file = basepath + '/models/reddit/DNN/weights/'
red_dnn_vocab_file_path = basepath + '/models/reddit/DNN/vocab_list.txt'
red_hashtag_split_file_path = basepath + '/data/reddit/hashtag_split_dump.txt'

In [355]:
red_LSTM_tr = train_model('LSTM', red_train_file, red_validation_file, red_word_file_path, 
                              red_split_word_path, red_emoji_file_path, red_lstm_model_file,
                              red_lstm_vocab_file_path, red_hashtag_split_file_path, red_lstm_output_file)

split entry found: 10942
0...10000...20000...30000...40000...
Training data loading finished...
split entry found: 10942
0...
Validation data loading finished...
30
34074
unk:: 34073
Token coverage: 1.0
Word coverage: 0.9999706512487894
Token coverage: 0.9657298157231871
Word coverage: 0.142546884630059
class ratio:: [1.005113961997972, 1.0]
train_X (45482, 30)
train_Y (45482, 2)
validation_X (2527, 30)
validation_Y (2527, 2)
Build model...
No of parameter: 9250328
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_46 (Embedding)     (None, 30, 256)           8722944   
_________________________________________________________________
lstm_21 (LSTM)               (None, 30, 256)           525312    
_________________________________________________________________
lstm_22 (LSTM)               (None, 2)                 2072      
Total params: 9,250,328
Trainable params: 9,250,328
Non-trainable pa

In [356]:
red_LSTM_te = test_model(red_lstm_model_file, red_word_file_path, red_split_word_path,
                             red_emoji_file_path, red_lstm_vocab_file_path, red_hashtag_split_file_path, red_lstm_output_file)
red_LSTM_te.load_trained_model(weight_file='weights.10__.hdf5')
red_LSTM_te.predict(red_test_file)

Initializing...
test_maxlen 30
model loaded from file...
model weights loaded from file...
model loading time:: 5.097896099090576
split entry found: 10942
0...
vocab loaded...
Token coverage: 0.9677645186953063
Word coverage: 0.14134358583042292
accuracy:: 0.6632370399683419
precision:: 0.6673562246543437
recall:: 0.6632370399683419
f_score:: 0.659818574484986
f_score::              precision    recall  f1-score   support

          0       0.64      0.76      0.70      1291
          1       0.69      0.56      0.62      1236

avg / total       0.67      0.66      0.66      2527



In [None]:
red_CNN_tr = train_model('CNN_1D', red_train_file, red_validation_file, red_word_file_path, 
                              red_split_word_path, red_emoji_file_path, red_cnn_model_file,
                              red_cnn_vocab_file_path, red_hashtag_split_file_path, red_cnn_output_file)

In [None]:
red_CNN_te = test_model(red_cnn_model_file, red_word_file_path, red_split_word_path,
                             red_emoji_file_path, red_cnn_vocab_file_path, red_hashtag_split_file_path, red_cnn_output_file)
red_CNN_te.load_trained_model(weight_file='weights.05__.hdf5')
red_CNN_te.predict(red_test_file)

In [357]:
red_DNN_tr = train_model('DNN', red_train_file, red_validation_file, red_word_file_path, 
                              red_split_word_path, red_emoji_file_path, red_dnn_model_file,
                              red_dnn_vocab_file_path, red_hashtag_split_file_path, red_dnn_output_file)

split entry found: 10942
0...10000...20000...30000...40000...
Training data loading finished...
split entry found: 10942
0...
Validation data loading finished...
30
34074
unk:: 34073
Token coverage: 1.0
Word coverage: 0.9999706512487894
Token coverage: 0.9657298157231871
Word coverage: 0.142546884630059
class ratio:: [1.005113961997972, 1.0]
train_X (45482, 30)
train_Y (45482, 2)
validation_X (2527, 30)
validation_Y (2527, 2)
Build model...
No of parameter: 10233602
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_47 (Embedding)     (None, 30, 256)           8722944   
_________________________________________________________________
conv1d_59 (Conv1D)           (None, 28, 256)           196864    
_________________________________________________________________
conv1d_60 (Conv1D)           (None, 26, 256)           196864    
_________________________________________________________________
l

In [358]:
red_DNN_te = test_model(red_dnn_model_file, red_word_file_path, red_split_word_path,
                             red_emoji_file_path, red_dnn_vocab_file_path, red_hashtag_split_file_path, red_dnn_output_file)
red_DNN_te.load_trained_model(weight_file='weights.06__.hdf5')
red_DNN_te.predict(red_test_file)

Initializing...
test_maxlen 30
model loaded from file...
model weights loaded from file...
model loading time:: 5.716418743133545
split entry found: 10942
0...
vocab loaded...
Token coverage: 0.9677645186953063
Word coverage: 0.14134358583042292
accuracy:: 0.6640284922833399
precision:: 0.664410511188802
recall:: 0.6640284922833399
f_score:: 0.6640613240282733
f_score::              precision    recall  f1-score   support

          0       0.68      0.66      0.67      1291
          1       0.65      0.67      0.66      1236

avg / total       0.66      0.66      0.66      2527



# Combining Data

In [342]:
tw_tr = pd.read_table(basepath + '/data/twitter/train/Train_v1.txt', encoding='utf-8', header=None)
tw_te = pd.read_table(basepath + '/data/twitter/test/Test_v1.txt', encoding='utf-8', header=None)
tw_va = pd.read_table(basepath + '/data/twitter/Dev_v1.txt', encoding='utf-8', header=None)
tw_tr = tw_tr.sample(frac=1)
tw_te = tw_te.sample(frac=1)
tw_va = tw_va.sample(frac=1)
tw_va.reset_index(drop=True, inplace=True)
tw_tr.reset_index(drop=True, inplace=True)
tw_te.reset_index(drop=True, inplace=True)
tw_te.to_csv(basepath + '/data/twitter/test/Test_v1.txt', header=False, index=False, sep='\t', encoding='utf-8')
tw_va.to_csv(basepath + '/data/twitter/Dev_v1.txt', header=False, index=False, sep='\t', encoding='utf-8')
tw_tr.to_csv(basepath + '/data/twitter/train/Train_v1.txt', index=False ,sep='\t', header=False, encoding='utf-8')
print(tw_tr.head(),tw_te.head(), re_tr.head())

          0  1                                                  2
0  TrainSen  0  likers get tbh ; dun really know u but i know ...
1  TrainSen  0  🤑: we have fun times you're chill asf and you'...
2  TrainSen  0  @ValaAfshar : be lucky 1 work hard 2 be positi...
3  TrainSen  0  @STurkle texting actually separates people as ...
4  TrainSen  0  EVERYONE come out to the girls bball game tomo...           0  1                                                  2
0  TrainSen  0  Sometimes I wish I learned how to speak Spanis...
1  TrainSen  0         PSV certainly had a difficult game tonight
2  TrainSen  1  I know man ! In for my leg finishing next week...
3  TrainSen  0  I feel strangely elated because im so used to ...
4  TrainSen  1  Yay . School . If you could only see the joy o...    label                                            comment     author  \
0      0                                         NC and NH.  Trumpbart   
1      0  You do know west teams play against west teams... 

In [341]:
import pandas as pd
re_tr = pd.read_csv(basepath + '/data/combinded/train-balanced-sarcasm.csv', encoding='utf-8')
red_dat = re_tr[['label','comment']]
red_dat['Sen_type'] = 'TrainSen'
red_dat.head()
cols = red_dat.columns.tolist()
cols = cols[-1:] + cols[:-1]
red_dat = red_dat[cols]
red_dat = red_dat.sample(frac=.05)
red_dat.reset_index(drop=True, inplace=True)
red_te = red_dat.iloc[:len(red_dat)//20, :]
red_va = red_dat.iloc[len(red_dat)//20:2*(len(red_dat)//20), :]
red_tr = red_dat.iloc[2*(len(red_dat))//20:, :]
print(red_te.shape, red_tr.shape, red_va.shape)
red_te.to_csv(basepath + '/data/reddit/test/Test_v1.txt', header=False, index=False, sep='\t', encoding='utf-8')
red_va.to_csv(basepath + '/data/reddit/Dev_v1.txt', header=False, index=False, sep='\t', encoding='utf-8')
red_tr.to_csv(basepath + '/data/reddit/train/Train_v1.txt', index=False ,sep='\t', header=False, encoding='utf-8')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(2527, 3) (45487, 3) (2527, 3)


In [343]:
twit_dat1 = tw_tr.merge(tw_te, how='outer')
twit_dat2 = twit_dat1.merge(tw_va, how='outer')
twit_dat2.columns = ['Sen_type', 'label','comment']
twit_dat2.head()

Unnamed: 0,Sen_type,label,comment
0,TrainSen,0,likers get tbh ; dun really know u but i know ...
1,TrainSen,0,🤑: we have fun times you're chill asf and you'...
2,TrainSen,0,@ValaAfshar : be lucky 1 work hard 2 be positi...
3,TrainSen,0,@STurkle texting actually separates people as ...
4,TrainSen,0,EVERYONE come out to the girls bball game tomo...


In [344]:
comb_dat = red_dat.merge(twit_dat2, how='outer')
comb_dat.shape

(93622, 3)

In [345]:
comb_dat = comb_dat.sample(frac=1)
comb_dat.reset_index(drop=True, inplace=True)
print(comb_dat.head())
comb_te = comb_dat.iloc[:len(comb_dat)//20, :]
comb_va = comb_dat.iloc[len(comb_dat)//20:2*(len(comb_dat)//20), :]
comb_tr = comb_dat.iloc[2*(len(comb_dat)//20):, :]

print(comb_tr.shape, comb_te.shape, comb_va.shape)

   Sen_type  label                                            comment
0  TrainSen      0  @realDonaldTrump @CNN why is ur response alway...
1  TrainSen      1  Yay its pouring rain and I have xc this mornin...
2  TrainSen      1    Forever fucked up in the head. Thanks. #sarcasm
3  TrainSen      1                          It's pretty widely known.
4  TrainSen      0  . ur kids it's a free download i'ma post it so...
(84260, 3) (4681, 3) (4681, 3)


In [346]:
comb_te.to_csv(basepath + '/data/combinded/test/Test_v1.txt', header=False, index=False, sep='\t', encoding='utf-8')
comb_va.to_csv(basepath + '/data/combinded/Dev_v1.txt', header=False, index=False, sep='\t', encoding='utf-8')
comb_tr.to_csv(basepath + '/data/combinded/train/Train_v1.txt', index=False ,sep='\t', header=False, encoding='utf-8')