# Dataset Processing
functions:
    - load unicode mapping
    - load word2vec
    - load fast text
    - initialize words
    - normalize words
    - split words
    - split hashtags
    - load abbreviations
    - filter text
    - parse data
    - load data
    - build vocab
    - build reverse vocab
    - vectorize word dimensions
    - pad sequence 1D
    - write vocab
    - get fasttext weight
    - get word2vec weight
    - create ngram set
    - prepend line
    - prepend slow
    - checksum
    - check num lines in glove
    - checksum in glove
    - load glove word2vec
    - get glove model

In [5]:
import sys
from sys import platform

sys.path.append('../')
from collections import defaultdict
import re
import gensim
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.wrappers import FastText
import numpy as np
from nltk.tokenize import TweetTokenizer
import itertools
import shutil
import hashlib
import os

In [6]:
def load_unicode_mapping(path):
    """Load a tab-separated mapping file into a dictionary.

    The first column of every line becomes the key and the second column the
    value. Lines with fewer than two tab-separated columns (for example blank
    lines) are skipped instead of raising ``IndexError``.

    :param path: path of the tab-separated mapping file.
    :return: a ``defaultdict`` without a default factory (it behaves like a
        plain ``dict``) mapping column one to column two.
    """
    emoji_dict = defaultdict()
    with open(path, 'r') as f:
        for line in f:
            tokens = line.strip().split('\t')
            if len(tokens) < 2:
                continue
            # The original read ``token[1]`` (undefined name) here.
            emoji_dict[tokens[0]] = tokens[1]
    return emoji_dict

In [7]:
def load_word2vec(path=None):
    """Load binary word2vec-format vectors from ``path``.

    :param path: location of the binary word2vec file.
    :return: whatever ``KeyedVectors.load_word2vec_format`` returns.
    """
    return KeyedVectors.load_word2vec_format(path, binary=True)

In [8]:
def load_fasttext(path=None):
    """Load a FastText model from ``path`` via the gensim FastText wrapper.

    :param path: location handed to ``FastText.load_fasttext_format``.
    :return: the loaded model object.
    """
    return FastText.load_fasttext_format(path)

In [9]:
def InitializeWords(word_file_path):
    """Load a ``word<TAB>count`` file and drop unwanted short entries.

    Every line is lower-cased; the first column is the word and the second
    column is parsed as an integer. Lines with fewer than two columns are
    skipped. Afterwards all single ASCII letters and a fixed list of short
    fragments are removed from the dictionary (presumably so the hashtag
    splitter does not pick them as words -- confirm with the caller).

    :param word_file_path: path of the tab-separated word/count file.
    :return: a ``defaultdict`` without a default factory mapping word -> int.
    :raises ValueError: if the second column of a line is not an integer.
    """
    word_dictionary = defaultdict()

    with open(word_file_path, 'r') as f:
        for line in f:
            tokens = line.lower().strip().split('\t')
            if len(tokens) < 2:
                continue
            word_dictionary[tokens[0]] = int(tokens[1])

    for alphabet in "abcdefghijklmnopqrstuvwxyz":
        word_dictionary.pop(alphabet, None)

    # NOTE: the original list was missing the comma after 'assis', so Python
    # concatenated it with 'bz' into 'assisbz' and neither entry was removed.
    excluded_fragments = (
        'ann', 'assis',
        'bz',
        'ch', 'cre', 'ct',
        'di',
        'ed', 'ee',
        'ic',
        'le',
        'ng', 'ns',
        'pr', 'picon',
        'th', 'tle', 'tl', 'tr',
        'um',
        've',
        'yi',
    )
    for word in excluded_fragments:
        word_dictionary.pop(word, None)

    return word_dictionary

In [10]:
def normalize_word(word):
    """Collapse runs of three identical ASCII letters down to two.

    The substitution is applied repeatedly until the text stops changing, so
    longer runs (``"cooool"``) end up with exactly two letters (``"cool"``).

    :param word: text to normalize.
    :return: the text with every triple letter run shortened.
    """
    triple_letter = re.compile(r"([a-zA-Z])\1\1")
    current = word
    while True:
        collapsed = triple_letter.sub(r"\1\1", current)
        if collapsed == current:
            return collapsed
        current = collapsed

In [11]:
def load_split_word(split_word_file_path):
    """Read a tab-separated file of pre-computed splits.

    Each line is lower-cased and stripped; lines with at least two columns
    contribute ``column0 -> column1``. The number of loaded entries is printed.

    :param split_word_file_path: path of the tab-separated file.
    :return: a ``defaultdict`` without a default factory holding the entries.
    """
    split_word_dictionary = defaultdict()
    with open(split_word_file_path, 'r') as handle:
        for raw_line in handle.readlines():
            columns = raw_line.lower().strip().split('\t')
            if len(columns) < 2:
                continue
            split_word_dictionary[columns[0]] = columns[1]

    print('split entry found:', len(split_word_dictionary.keys()))
    return split_word_dictionary

In [12]:
def split_hashtags(term, wordlist, split_word_list, dump_file=''):
    """Split a hashtag into its component words.

    Resolution order:

    1. A term that is a single character after stripping yields ``['']``.
    2. If the lower-cased term (including any leading ``#``) is a key of
       ``split_word_list``, its stored value split on spaces is returned.
       Otherwise the term is printed.
    3. After removing a leading ``#``, a term found in ``wordlist`` is
       returned lower-cased as a single word.
    4. Otherwise the term is split on digit runs and capitalisation. If that
       produces fewer than three pieces, every way of cutting the lower-cased
       term into up to six pieces (0 to 5 cut points) is scored with the
       ``wordlist`` values and the best-scoring cut wins.

    When ``dump_file`` is non-empty the result is appended to it as
    ``#term<TAB>words<TAB>n_splits``.

    :param term: the hashtag text.
    :param wordlist: mapping word -> numeric score, or ``None``.
    :param split_word_list: mapping of lower-cased term -> space separated
        split, or ``None``.
    :param dump_file: file to append the chosen split to; the default ``''``
        disables dumping (the original tried to open ``''`` and failed).
    :return: list of word strings.
    """
    if len(term.strip()) == 1:
        return ['']

    if split_word_list is not None and term.lower() in split_word_list:
        # The original called the non-existent ``term.lowre()`` here.
        return split_word_list.get(term.lower()).split(' ')
    else:
        print(term)

    if term.startswith('#'):
        term = term[1:]

    if wordlist is not None and term.lower() in wordlist:
        return [term.lower()]

    # Treat a missing word list as empty so the scoring below cannot crash.
    lookup = wordlist if wordlist is not None else {}

    penalty = -69971
    max_coverage = penalty
    split_words_count = 6

    term = re.sub(r'([0-9]+)', r' \1', term)
    # NOTE(review): this substitution replaces each ordinal with itself, so it
    # has no effect; kept as-is because the intended behaviour is unclear.
    term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1', term)
    term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip())
    # The original character class was the typo ``[A=Z]``.
    term = re.sub(r'([A-Z]{2,})+', r' \1', term)
    words = term.strip().split(' ')

    n_splits = 0

    if len(words) < 3:
        chars = term.lower()
        found_all_words = False

        while n_splits < split_words_count and not found_all_words:
            for idx in itertools.combinations(range(0, len(chars)), n_splits):
                # Cut ``chars`` at the chosen positions.
                bounds = (0,) + idx + (len(chars),)
                line = [chars[start:stop] for start, stop in zip(bounds, bounds[1:])]

                total = 0
                for word in line:
                    stripped = word.strip()
                    if stripped in lookup:
                        total += lookup.get(stripped)
                    elif stripped.isnumeric():
                        total += 0.
                    else:
                        total += penalty
                score = (1. / len(line)) * total

                if score > max_coverage:
                    words = line
                    max_coverage = score

                    if all(word.isnumeric() or word.strip() in lookup for word in line):
                        found_all_words = True
            n_splits += 1

    if dump_file:
        with open(dump_file, 'a') as f:
            if term != '' and len(words) > 0:
                f.write('#' + str(term).strip() + '\t' + ' '.join(words) + '\t' + str(n_splits) + '\n')

    return words

In [13]:
def load_abbreviation(path='../data/twitter/abbreviations.txt'):
    """Load a tab-separated abbreviation file into a dictionary.

    Every line is lower-cased and stripped; the first column becomes the key
    and the second the value. Lines with fewer than two columns are skipped.

    :param path: path of the tab-separated abbreviation file.
    :return: a ``defaultdict`` without a default factory mapping column one
        to column two.
    """
    abbreviation_dict = defaultdict()
    with open(path) as f:
        for line in f:
            # The original called the misspelled ``.splti`` here.
            token = line.lower().strip().split('\t')
            if len(token) < 2:
                continue
            abbreviation_dict[token[0]] = token[1]
    return abbreviation_dict

In [14]:
def filter_text(text, word_list, split_word_list, emoji_dict, abbreviation_dict,
                normalize_text=False, split_hashtag=False,
                ignore_profiles=False, replace_emoji=True):
    """Filter and expand a list of tokens.

    Per token, in this order:

    * tokens starting with ``@`` are dropped when ``ignore_profiles`` is set;
    * tokens starting with ``http`` are always dropped;
    * with ``replace_emoji``, a token found in ``emoji_dict`` is replaced by
      its mapped value split on ``_``;
    * with ``split_hashtag``, a token starting with ``#`` is replaced by the
      result of ``split_hashtags`` (only pieces not already in the output);
    * with ``normalize_text``, the token is passed through ``normalize_word``;
    * a token found in ``abbreviation_dict`` is replaced by its mapped value
      split on spaces;
    * anything else is kept unchanged.

    :param text: iterable of tokens.
    :param word_list: word mapping forwarded to ``split_hashtags``.
    :param split_word_list: split mapping forwarded to ``split_hashtags``.
    :param emoji_dict: token -> ``_`` separated replacement, or ``None``.
    :param abbreviation_dict: token -> space separated expansion, or ``None``.
    :return: list of filtered tokens.
    """
    filtered_text = []

    for t in text:
        if ignore_profiles and str(t).startswith("@"):
            continue

        if str(t).startswith('http'):
            continue

        if replace_emoji and emoji_dict is not None and t in emoji_dict:
            filtered_text.extend(emoji_dict.get(t).split('_'))
            continue

        # The original tested the undefined name ``split_hastag`` here.
        if split_hashtag and str(t).startswith("#"):
            splits = split_hashtags(t, word_list, split_word_list,
                                    dump_file='../data/twitter/hastash_split_dump.txt')
            if splits is not None:
                filtered_text.extend([s for s in splits if s not in filtered_text])
                continue

        if normalize_text:
            t = normalize_word(t)

        if abbreviation_dict is not None and t in abbreviation_dict:
            filtered_text.extend(abbreviation_dict.get(t).split(' '))
            continue

        filtered_text.append(t)

    return filtered_text

In [15]:
def parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict,
              normalize_text=False, split_hashtag=False, ignore_profiles=False,
              lowercase=False, replace_emoji=True, n_grams=None, at_character=False):
    """Parse tab-separated dataset lines into sample tuples.

    Column layout read from each line:

    * column 0 -- sample id
    * column 1 -- integer label
    * column 2 -- target text
    * column 3 -- optional dimensions, ``|`` separated ``name@@value`` pairs,
      or ``NA``
    * column 4 -- optional context text, or ``NA``
    * column 5 -- optional author

    The target text is tokenized with ``TweetTokenizer`` (or split into single
    characters when ``at_character`` is set), optionally extended with
    ``_``-joined n-grams, then passed through ``filter_text``. Samples whose
    filtered target text is empty are dropped. A progress marker is printed
    every 100 lines.

    :param lines: iterable of raw text lines.
    :param n_grams: n-gram size forwarded to ``create_ngram_set`` or ``None``.
    :return: list of ``(id, label, target_text, dimensions, context, author)``.
    :raises IndexError, ValueError: for lines that do not follow the layout.
    """
    data = []
    tokenizer = TweetTokenizer()

    for i, line in enumerate(lines):
        if i % 100 == 0:
            print(str(i) + '...', end='', flush=True)

        if lowercase:
            # The original read ``line = line=lower()`` (undefined ``lower``).
            line = line.lower()

        token = line.split('\t')

        sample_id = token[0]
        label = int(token[1].strip())

        if at_character:
            target_text = list(token[2].strip())
        else:
            target_text = tokenizer.tokenize(token[2].strip())

        if n_grams is not None:
            # ``create_ngram_set`` names its parameter ``ngram_value``.
            n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
            target_text.extend(['_'.join(n) for n in n_grams_list])

        target_text = filter_text(target_text, word_list, split_word_list, emoji_dict,
                                  abbreviation_dict, normalize_text, split_hashtag,
                                  ignore_profiles, replace_emoji=replace_emoji)

        dimensions = []
        if len(token) > 3 and token[3].strip() != 'NA':
            dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]

        context = []
        # Strip before comparing so a trailing newline does not hide 'NA'.
        if len(token) > 4 and token[4].strip() != 'NA':
            context = tokenizer.tokenize(token[4].strip())
            context = filter_text(context, word_list, split_word_list, emoji_dict,
                                  abbreviation_dict, normalize_text, split_hashtag,
                                  ignore_profiles, replace_emoji=replace_emoji)

        author = 'NA'
        if len(token) > 5:
            # The original assigned to the misspelled ``auther``.
            author = token[5].strip()

        if len(target_text) != 0:
            data.append((sample_id, label, target_text, dimensions, context, author))

    print('')
    return data

In [16]:
def loaddata(filename, word_file_path, split_word_path, emoji_file_path, normalize_text=False,
             split_hashtag=False, ignore_profiles=False, lower_case=True, replace_emoji=True,
             n_grams=None, at_character=False):
    """Load the auxiliary dictionaries and parse a dataset file.

    The split-word list and the abbreviation dictionary are always loaded;
    the word list is loaded only when ``split_hashtag`` is set and the emoji
    mapping only when ``replace_emoji`` is set. All lines of ``filename`` are
    then handed to ``parsedata``.

    :param filename: path of the tab-separated dataset file.
    :param word_file_path: path passed to ``InitializeWords``.
    :param split_word_path: path passed to ``load_split_word``.
    :param emoji_file_path: path passed to ``load_unicode_mapping``.
    :param lower_case: forwarded to ``parsedata`` as ``lowercase``.
    :return: the list produced by ``parsedata``.
    """
    word_list = None
    emoji_dict = None

    split_word_list = load_split_word(split_word_path)

    if split_hashtag:
        word_list = InitializeWords(word_file_path)

    if replace_emoji:
        emoji_dict = load_unicode_mapping(emoji_file_path)

    abbreviation_dict = load_abbreviation()

    # Use a context manager so the file handle is closed after reading.
    with open(filename, 'r') as f:
        lines = f.readlines()

    # The original passed the undefined name ``lowercase`` instead of the
    # ``lower_case`` parameter.
    data = parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict,
                     normalize_text=normalize_text, split_hashtag=split_hashtag,
                     ignore_profiles=ignore_profiles, lowercase=lower_case,
                     replace_emoji=replace_emoji, n_grams=n_grams, at_character=at_character)
    return data

In [17]:
def build_vocab(data, without_dimension=True, ignore_context=False, min_freq=0):
    """Build a word -> index vocabulary from parsed samples.

    Word frequencies are counted over each sample's target text (tuple
    position 2), its dimensions (position 3, only when ``without_dimension``
    is false) and its context (position 4, unless ``ignore_context``). When
    dimensions are included, the strings ``'1'`` .. ``'100'`` are pre-seeded
    with a count of zero. Words whose count is at least ``min_freq`` receive
    consecutive indices starting at 1, in first-seen order.

    :param data: iterable of sample tuples as produced by ``parsedata``.
    :param without_dimension: skip dimension words when true.
    :param ignore_context: skip context words when true.
    :param min_freq: minimum count a word needs to enter the vocabulary.
    :return: ``defaultdict(int)`` mapping word -> index.
    """
    vocab = defaultdict(int)
    vocab_freq = defaultdict(int)

    if not without_dimension:
        for i in range(1, 101):
            vocab_freq[str(i)] = 0

    for token in data:
        # ``defaultdict(int)`` starts every unseen word at 0, so no explicit
        # initialisation is needed. The original initialisation was broken:
        # it referenced the misspelled ``voacb_freq`` and, for context words,
        # tested the still-empty ``vocab``, resetting counts to zero.
        for word in token[2]:
            vocab_freq[word] += 1

        if not without_dimension:
            for word in token[3]:
                vocab_freq[word] += 1

        if not ignore_context:
            for word in token[4]:
                vocab_freq[word] += 1

    total_words = 1
    for k, v in vocab_freq.items():
        if v >= min_freq:
            vocab[k] = total_words
            total_words += 1
    return vocab

In [18]:
def build_reverse_vocab(vocab):
    """Invert a vocabulary so that indices map back to words.

    :param vocab: mapping word -> index.
    :return: ``defaultdict(str)`` mapping index -> word; when several words
        share an index the one iterated last wins.
    """
    return defaultdict(str, ((index, word) for word, index in vocab.items()))

In [19]:
def vectorize_word_dimension(data, vocab, drop_dimension_index = None):
    """Turn parsed samples into index arrays.

    For every ``(id, label, line, dimensions, context, author)`` sample:

    * ``line`` words are replaced by their ``vocab`` index, unknown words by
      the index of ``'unk'``;
    * ``dimensions`` are looked up in ``vocab``; a sample without dimensions
      gets eleven ``'unk'`` indices; ``drop_dimension_index`` optionally
      removes one position;
    * ``context`` words are looked up like ``line`` words; an empty context
      becomes a single ``'unk'`` index. (The original attached an ``else`` to
      the word loop, which always overwrote the context with ``['unk']``.)

    Token and word coverage (target text only) are printed.

    :param data: iterable of sample tuples as produced by ``parsedata``.
    :param vocab: mapping word -> index. If it has no ``'unk'`` entry, index
        0 is used for unknown words.
    :param drop_dimension_index: position to remove from every dimension
        vector, or ``None``.
    :return: tuple of five arrays ``(X, Y, D, C, A)``; rows of unequal length
        are stored in a one-dimensional object array.
    """
    X, Y, D, C, A = [], [], [], [], []

    known_words_set = set()
    tokens = 0
    token_coverage = 0

    # Read the fallback index once without inserting into a defaultdict.
    unk = vocab.get('unk', 0)

    for _, label, line, dimensions, context, author in data:
        if len(dimensions) != 0:
            dvec = [vocab.get(d) for d in dimensions]
        else:
            dvec = [unk] * 11

        if drop_dimension_index is not None:
            # The original popped the undefined ``drop_dimensions_index``.
            dvec.pop(drop_dimension_index)

        vec = []
        for word in line:
            tokens += 1
            if word in vocab:
                vec.append(vocab[word])
                token_coverage += 1
                known_words_set.add(word)
            else:
                vec.append(unk)

        if len(context) != 0:
            context_vec = [vocab[word] if word in vocab else unk for word in context]
        else:
            context_vec = [unk]

        X.append(vec)
        Y.append(label)
        D.append(dvec)
        C.append(context_vec)
        A.append(author)

    # Guard the ratios so empty input does not raise ZeroDivisionError.
    token_ratio = token_coverage / float(tokens) if tokens else 0.0
    word_ratio = len(known_words_set) / float(len(vocab)) if len(vocab) else 0.0
    print('Token coverage:', token_ratio)
    print('Word coverage:', word_ratio)

    # The module imports numpy as ``np``; the original used the unbound
    # name ``numpy``.
    return tuple(_rows_to_array(rows) for rows in (X, Y, D, C, A))


def _rows_to_array(rows):
    """Convert rows to an array, using an object array when rows are ragged."""
    try:
        return np.asarray(rows)
    except ValueError:
        # Recent NumPy refuses to build arrays from ragged nested lists.
        ragged = np.empty(len(rows), dtype=object)
        for position, row in enumerate(rows):
            ragged[position] = row
        return ragged

In [20]:
def pad_sequence_1d(sequences, maxlen=None, dtype='float32',
                    padding='pre', truncating='pre', value=0.):
    """Pad or truncate one-dimensional sequences to a common length.

    :param sequences: iterable of sequences of numbers.
    :param maxlen: target length; ``None`` uses the longest sequence.
    :param dtype: dtype of the returned array.
    :param padding: ``'pre'`` fills in front of each sequence, ``'post'``
        fills after it.
    :param truncating: ``'pre'`` drops leading items of over-long sequences,
        ``'post'`` drops trailing items.
    :param value: fill value for the padded positions.
    :return: array of shape ``(len(sequences), maxlen)``.
    :raises ValueError: for an unknown ``padding`` or ``truncating`` mode.
    """
    if truncating not in ('pre', 'post'):
        raise ValueError("truncating type '%s' not understood" % truncating)
    if padding not in ('pre', 'post'):
        raise ValueError("padding type '%s' not understood" % padding)

    X = list(sequences)
    nb_samples = len(X)

    if maxlen is None:
        maxlen = max((len(s) for s in X), default=0)

    # The original built ``zeros * value`` (always zero) and never copied the
    # sequences into the result.
    x = np.full((nb_samples, maxlen), value, dtype=dtype)

    if maxlen == 0:
        return x

    for idx, s in enumerate(X):
        if len(s) == 0:
            continue
        trunc = s[-maxlen:] if truncating == 'pre' else s[:maxlen]
        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        else:
            x[idx, -len(trunc):] = trunc

    return x

In [21]:
def write_vocab(filepath, vocab):
    """Write a vocabulary to ``filepath`` as ``key<TAB>value`` lines.

    :param filepath: destination file; it is overwritten.
    :param vocab: mapping whose items are written in iteration order.
    """
    with open(filepath, 'w') as sink:
        sink.writelines(
            '\t'.join((str(entry), str(index))) + '\n'
            for entry, index in vocab.items()
        )

In [22]:
def get_fasttext_weight(vocab, n=300, path=None):
    """Build an embedding matrix for ``vocab`` from a FastText model.

    The matrix has ``len(vocab) + 1`` rows and ``n`` columns. Row ``v`` holds
    the first ``n`` components of the model vector for each word ``k`` with
    ``vocab[k] == v`` that the model contains; all other rows stay zero.

    :param vocab: mapping word -> row index.
    :param n: number of vector components to keep.
    :param path: model location passed to ``load_fasttext``.
    :return: ``numpy`` array of shape ``(len(vocab) + 1, n)``.
    """
    fasttextmodel = load_fasttext(path=path)
    emb_weights = np.zeros((len(vocab) + 1, n))
    for k, v in vocab.items():
        # The original referenced the undefined name ``fasttext`` here.
        if k in fasttextmodel:
            emb_weights[v, :] = fasttextmodel[k][:n]

    return emb_weights

In [23]:
def get_word2vec_weight(vocab, n=300, path=None):
    """Build an embedding matrix for ``vocab`` from word2vec vectors.

    The matrix has ``len(vocab) + 1`` rows and ``n`` columns. Row ``v`` holds
    the first ``n`` components of the model vector for each word ``k`` with
    ``vocab[k] == v`` that the model contains; all other rows stay zero.

    :param vocab: mapping word -> row index.
    :param n: number of vector components to keep.
    :param path: model location passed to ``load_word2vec``.
    :return: ``numpy`` array of shape ``(len(vocab) + 1, n)``.
    """
    word2vecmodel = load_word2vec(path=path)
    # numpy is imported as ``np``; the original used the unbound ``numpy``.
    emb_weights = np.zeros((len(vocab) + 1, n))
    for k, v in vocab.items():
        if k in word2vecmodel:
            emb_weights[v, :] = word2vecmodel[k][:n]

    return emb_weights

In [24]:
def create_ngram_set(input_list, ngram_value=2):
    """
    Collect the distinct n-grams of a sequence as a set of tuples.

    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2))
    [(1, 4), (4, 1), (4, 9), (9, 4)]
    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3))
    [(1, 4, 9), (4, 1, 4), (4, 9, 4), (9, 4, 1)]
    """
    # One view of the sequence per offset; zipping them yields the windows.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    windows = zip(*shifted_views)
    return set(windows)

In [25]:
def prepend_line(infile, outfile, line):
    """Write ``line`` plus a newline to ``outfile``, then copy ``infile`` after it.

    :param infile: path of the file whose content is appended.
    :param outfile: path of the file to create or overwrite.
    :param line: value written first; it is converted with ``str``.
    """
    with open(infile, 'r') as source, open(outfile, 'w') as target:
        target.write(str(line) + "\n")
        shutil.copyfileobj(source, target)

In [26]:
def prepend_slow(infile, outfile, line):
    """Write ``line`` plus a newline to ``outfile``, then copy ``infile`` line by line.

    :param infile: path of the file whose lines are appended.
    :param outfile: path of the file to create or overwrite.
    :param line: string written first.
    """
    with open(infile, 'r') as source, open(outfile, 'w') as target:
        target.write(line + "\n")
        target.writelines(source)

In [27]:
def checksum(filename):
    """Return the hexadecimal MD5 digest of a file.

    The file is read in binary mode in chunks of 65536 bytes.

    :param filename: path of the file to hash.
    :return: the digest as a hexadecimal string.
    """
    chunk_size = 65536
    digest = hashlib.md5()
    with open(filename, 'rb') as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

In [28]:
# Line counts keyed by GloVe file name; check_num_lines_in_glove falls back to
# this table for names that match neither the 'glove.6B.' nor the
# 'glove.twitter.27B.' prefix.
pretrain_num_lines = {"glove.840B.300d.txt": 2196017, "glove.42B.300d.txt": 1917494}

# Expected digests keyed by GloVe file name; check_num_lines_in_glove compares
# them with the result of checksum() (an MD5 hex digest) when asked to verify.
pretrain_checksum = {
    "glove.6B.300d.txt": "b78f53fb56ec1ce9edc367d2e6186ba4",
    "glove.twitter.27B.50d.txt": "6e8369db39aa3ea5f7cf06c1f3745b06",
    "glove.42B.300d.txt": "01fcdb413b93691a7a26180525a12d6e",
    "glove.6B.50d.txt": "0fac3659c38a4c0e9432fe603de60b12",
    "glove.6B.100d.txt": "dd7f3ad906768166883176d69cc028de",
    "glove.twitter.27B.25d.txt": "f38598c6654cba5e6d0cef9bb833bdb1",
    "glove.6B.200d.txt": "49fa83e4a287c42c6921f296a458eb80",
    "glove.840B.300d.txt": "eec7d467bccfa914726b51aac484d43a",
    "glove.twitter.27B.100d.txt": "ccbdddec6b9610196dd2e187635fee63",
    "glove.twitter.27B.200d.txt": "e44cdc3e10806b5137055eeb08850569",
}

def check_num_lines_in_glove(filename, check_checksum=False):
    """Return the number of lines of a known GloVe file.

    The file is identified by its base name, so both a bare file name and a
    full path are accepted.

    :param filename: GloVe file name or path.
    :param check_checksum: when true, hash the file at ``filename`` with
        ``checksum`` and compare against ``pretrain_checksum``.
    :return: the line count for the file.
    :raises AssertionError: if the checksum does not match.
    :raises KeyError: if the file name is not in the relevant table.
    """
    glove_name = os.path.basename(filename)

    if check_checksum:
        expected = pretrain_checksum[glove_name]
        actual = checksum(filename)
        # Raise explicitly: a bare ``assert`` is removed under ``python -O``.
        if actual != expected:
            raise AssertionError(
                "checksum mismatch for %s: expected %s, got %s"
                % (glove_name, expected, actual))

    if glove_name.startswith('glove.6B.'):
        return 400000
    if glove_name.startswith('glove.twitter.27B.'):
        return 1193514
    return pretrain_num_lines[glove_name]

In [29]:
def load_glove_word2vec(filename):
    """Convert a GloVe text file to word2vec text format and load it.

    The vector size is read from the ``<n>d`` component that precedes the
    file extension (for example ``200d`` in ``glove.twitter.27B.200d.txt``)
    and the line count comes from ``check_num_lines_in_glove``. A copy of the
    file with the ``"<lines> <dims>"`` header line prepended is written to
    ``gensim_file`` and then loaded with gensim.

    More models can be downloaded from http://nlp.stanford.edu/projects/glove/

    :param filename: path of the GloVe text file.
    :return: the vectors loaded by ``KeyedVectors.load_word2vec_format``.
    """
    glove_file = os.path.basename(filename)
    num_lines = check_num_lines_in_glove(glove_file)

    # Index from the end so names with four dot-separated parts
    # (glove.6B.300d.txt) work as well as names with five; the original
    # five-way unpacking raised ValueError for the former.
    dimensions = glove_file.split('.')[-2]
    dims = int(dimensions[:-1])

    # Output: Gensim Model text format.
    # NOTE(review): this is an absolute path while the other data paths in
    # this file are relative ('../data/...') -- confirm it is intended.
    gensim_file = '/data/twitter/glove/glove_model.txt'
    gensim_first_line = "{} {}".format(num_lines, dims)

    # Prepends the line.
    if platform == "linux" or platform == "linux2":
        prepend_line(filename, gensim_file, gensim_first_line)
    else:
        prepend_slow(filename, gensim_file, gensim_first_line)

    # Load through KeyedVectors, as load_word2vec does; the
    # Word2Vec.load_word2vec_format entry point is deprecated in gensim.
    model = KeyedVectors.load_word2vec_format(gensim_file, binary=False)  # GloVe Model

    return model

In [30]:
def get_glove_model(vocab, n=200):
    """Build an embedding matrix for ``vocab`` from the Twitter GloVe vectors.

    The vectors are loaded with ``load_glove_word2vec`` from a fixed relative
    path. The matrix has ``len(vocab) + 1`` rows and ``n`` columns; row ``v``
    holds the first ``n`` components of the vector for each word ``k`` with
    ``vocab[k] == v`` that the model contains, and all other rows stay zero.

    :param vocab: mapping word -> row index.
    :param n: number of vector components to keep.
    :return: ``numpy`` array of shape ``(len(vocab) + 1, n)``.
    """
    word2vecmodel = load_glove_word2vec('../data/twitter/glove/glove.twitter.27B/glove.twitter.27B.200d.txt')

    # numpy is imported as ``np``; the original used the unbound ``numpy``.
    emb_weights = np.zeros((len(vocab) + 1, n))
    for k, v in vocab.items():
        if k in word2vecmodel:
            emb_weights[v, :] = word2vecmodel[k][:n]

    return emb_weights