In [2]:
import numpy as np
import re
import pickle
import time
import h5py
import operator
from nltk.corpus import stopwords
import spacy
from collections import deque
from random import shuffle

In [3]:
# --- Input paths -----------------------------------------------------------
# NOTE(review): `sentence_file_path` and `fasttext_path` are not referenced by any
# cell visible in this notebook — confirm whether they are still needed.
sentence_file_path = '/Users/Frost/Desktop/gourmet.txt'
# Labelled review file read by the preprocessing cell (segments split with `edu_re`).
edu_file_path = '/Volumes/CCsChunk2/projects/nlp/nlp-lab-group9/data/processed/edus/electronics_edus.txt'
# Binary word2vec file parsed by `read_w2v`.
w2v_path = '/Volumes/CCsChunk2/projects/nlp/milnet/embeddings/GoogleNews-vectors-negative300.bin'
fasttext_path = '/Volumes/CCsChunk2/projects/nlp/milnet/embeddings/cc.en.300.vec'

# --- Output paths ----------------------------------------------------------
# Pickle of the word -> id dictionary (`w2id`).
w2id_out_path = '/Volumes/CCsChunk2/projects/nlp/milnet/inputs/w2id.pkl'
# HDF5 file receiving the document / label batches.
h5_out_path = '/Volumes/CCsChunk2/projects/nlp/milnet/inputs/electronics.hdf5'
# Destination of the embedding weight matrix written by `dump_embedding_weight`.
w2v_weights_out_path = '/Users/Frost/Desktop/weights.npy'

# Key prefixes inside the HDF5 file; the batch index is appended to each prefix.
document_out_dir = 'document/'
label_out_dir = 'label/'

# Optional custom stop-word file (one word per line); `None` keeps the NLTK list.
stop_words_path = None

# Number of columns of the embedding weight matrix.
w2v_vec_len = 300
# Thresholds handed to `clean_split`, which compares with a strict `>`
# (words / segments whose length equals the threshold are dropped).
word_min_len = 2
seg_min_len = 2
# Every segment is truncated / padded to this many word ids.
seg_max_len = 50

# Documents with fewer cleaned segments than this are skipped.
doc_min_seg = 1
# Every document is truncated / padded to this many segments.
doc_max_seg = 20

# Filler used when padding segments and documents.
pad_value = 0

# Documents per HDF5 batch.
batch_size = 200
# The `w2id` pickle is refreshed every this many processed documents.
w2id_backup_batch = 5000

In [4]:
label_re = re.compile('(\d+)\.\d+')
sentence_re = re.compile('(?:\.|!|\?)\s')
edu_re = re.compile('(?:\.|!|\?|(?:edu_break))\s')

multi_space_re = re.compile('\s{2,}')
url_re = re.compile('(http://)?www\.[^ ]+')
unknown_char_re = re.compile('[^a-z0-9$\.\?\!\'_]')
multi_udl_re = re.compile('_{2,}')
abram_re = re.compile('\'m')
abris_re = re.compile('\'s')
abrare_re = re.compile('\'re')
abrhave_re = re.compile('\'ve')
abrnot_re = re.compile('n\'t')
abrwd_re = re.compile('\'d')
abrwill_re = re.compile('\'ll')
num_re = re.compile('(?<= )[0-9]+(?= )')
mixi_re = re.compile('(?<=[a-z])I')

# Default stop-word set: NLTK's English list.
stop_words = set(stopwords.words('english'))
# When a custom file is configured it replaces the NLTK set entirely.
# The file content is split on '\n', i.e. one stop word per line.
if stop_words_path is not None:
    with open(stop_words_path) as in_file:
        file_content = in_file.read()
        stop_words = set(file_content.split('\n'))
# spaCy pipeline; `__clean_document` calls it on the text and reads `lemma_`
# from every token it returns.
lemmatizer = spacy.load('en_core_web_sm')

In [5]:
# Global vocabulary: maps each word to its integer id.
# `__word_to_id` fills it lazily, handing out ids starting at 1.
w2id = dict()

def __unilen_vec(input_vec, target_len, pad_value):
    ''' Force the first dimension of an array to be exactly `target_len` long.

    Longer inputs are cut after `target_len` entries, shorter inputs are extended
    at the end of the first axis only; all other axes are left untouched.

    Args:
        input_vec (np.array): The array to resize.
        target_len (int): The required length of the first dimension.
        pad_value (float): The constant used to fill the appended entries.

    Returns:
        (np.array): `input_vec` itself when the length already matches, a slice of
            it when it is too long, or a padded copy when it is too short.
    '''
    current_len = input_vec.shape[0]
    if current_len == target_len:
        return input_vec
    if current_len > target_len:
        return input_vec[:target_len]
    missing = target_len - current_len
    # Pad only at the end of axis 0; every remaining axis gets a (0, 0) width.
    pad_width = [(0, missing)] + [(0, 0)] * (input_vec.ndim - 1)
    return np.pad(input_vec, pad_width, 'constant', constant_values=pad_value)

def __word_to_id(word):
    ''' Look up the id of a word, registering the word first if it is unknown.

    The global dictionary `w2id` is used as the vocabulary. A new word receives
    the id `len(w2id) + 1`, so ids start at 1 and id 0 is never handed out.

    Args:
        word (str): The word to look up.

    Returns:
        (int): The id stored for `word` in `w2id`.
    '''
    # `len(w2id) + 1` is evaluated before the insertion, so it is the next free id.
    return w2id.setdefault(word, len(w2id) + 1)

def __clean_document(raw_document, stop_words=None, lemmatizer=None):
    ''' Clean a document.

    Steps, in order: split a capital "I" glued to the preceding word, lowercase,
    replace URLs with the `_url_` marker, collapse repeated underscores, expand
    English contractions, replace standalone integers, lemmatize (optional),
    drop stop words (optional), blank out characters outside `[a-z0-9$.?!'_]`
    and finally merge repeated whitespace.

    Args:
        raw_document (str): The input string (document).
        stop_words (set(str)): The stop words set. If not `None`, words in this set
            are removed.
        lemmatizer (lemmatizer): A callable returning tokens with a `lemma_`
            attribute. If not `None`, every token is replaced by its lemma.

    Returns:
        (str): The cleaned document.
    '''
    # `mixi_re` looks for an uppercase "I", so it has to run BEFORE lowercasing;
    # applied afterwards (as before) it could never match anything.
    cleaned = mixi_re.sub(' I', raw_document)
    cleaned = cleaned.lower()

    # NOTE(review): the '<NUM>' marker consists only of characters that
    # `unknown_char_re` blanks out below, so without a lemmatizer standalone
    # numbers are effectively removed rather than marked — confirm the intent
    # before switching to a marker that survives (e.g. in the style of '_url_').
    substitutions = (
        (url_re, ' _url_'),
        (multi_udl_re, '_'),
        (abram_re, ' am'),
        (abris_re, ' is'),
        (abrare_re, ' are'),
        (abrhave_re, ' have'),
        (abrnot_re, ' not'),
        (abrwd_re, ' would'),
        (abrwill_re, ' will'),
        (num_re, '<NUM>'),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    if lemmatizer is not None:
        cleaned = ' '.join(token.lemma_ for token in lemmatizer(cleaned))
    if stop_words is not None:
        cleaned = ' '.join(word for word in cleaned.split(' ') if word not in stop_words)

    cleaned = unknown_char_re.sub(' ', cleaned)
    return multi_space_re.sub(' ', cleaned)

def load_labelled_document(fp):
    ''' Iterate over the reviews and their labels in the input file.

    A record consists of one label line, followed by any number of review lines,
    and is terminated by an empty (whitespace-only) line or by the end of the
    file. Lines are returned unmodified, including their line endings.

    Blank lines that do not terminate a record (leading or repeated blank lines)
    are ignored. A record that has a label but no review lines yields an empty
    string as review.

    Args:
        fp (iterable(str)): The lines of the input file, typically the file object
            returned by `open`.

    Returns:
        (generator): Yields one ((str), (str)) tuple per record, which stands for
            (review, label).
    '''
    label = None
    review_lines = []
    for line in fp:
        if line.strip() == '':
            # Only emit when a record is actually pending; otherwise the blank
            # line would re-emit stale data from the previous record.
            if label is not None:
                yield ''.join(review_lines), label
            label, review_lines = None, []
            continue
        if label is None:
            label = line
        else:
            review_lines.append(line)
    # The last record is still pending when the file has no trailing blank line.
    if label is not None:
        yield ''.join(review_lines), label

def clean_split(raw_document, segment_re, seg_min_len, word_min_len, stop_words=None, lemmatizer=None):
    ''' Clean a raw document and break it into segments of words.

    The document is cleaned with `__clean_document`, split into segments at every
    match of `segment_re`, and each segment is split into words on single spaces.

    Args:
        raw_document (str): The input raw document.
        segment_re (re): The compiled regex which identifies a segment breaker.
        seg_min_len (int): Segment length threshold. Only segments with strictly
            more than this many remaining words are kept.
        word_min_len (int): Word length threshold. Only words with strictly more
            than this many characters are kept.
        stop_words (set(str)): The stop words set. If not `None`, words in this set
            will be removed.
        lemmatizer (lemmatizer): The lemmatizer. If not `None`, each word will be
            lemmatized.

    Returns:
        ([[str]]): A list of word lists for a review.
    '''
    # NOTE(review): both thresholds are exclusive (`>`), although the parameter
    # names suggest an inclusive minimum — confirm which one is intended.
    cleaned_doc = __clean_document(raw_document, stop_words, lemmatizer)
    kept_segments = []
    for raw_segment in segment_re.split(cleaned_doc):
        words = [word for word in raw_segment.split(' ') if len(word) > word_min_len]
        if len(words) > seg_min_len:
            kept_segments.append(words)
    return kept_segments

def doc_to_vec(cleaned_segs, doc_max_seg, seg_max_len, pad_value):
    ''' Turn the cleaned list of word lists for a review into np.array.

    Every word is replaced by its id (new words are registered in the global
    vocabulary through `__word_to_id`), every segment is truncated or padded to
    `seg_max_len` ids and the document to `doc_max_seg` segments.

    The input list is left unmodified.

    Args:
        cleaned_segs ([[str]]): The cleaned list of word lists.
        doc_max_seg (int): The maximum length of a review. Reviews longer than this
            value will be truncated.
        seg_max_len (int): The maximum length of a segment. Segments longer than
            this value will be truncated.
        pad_value (float): The value used to fill missing words and segments.

    Returns:
        (np.array): An array of shape (`doc_max_seg`, `seg_max_len`).
    '''
    if len(cleaned_segs) == 0:
        # Without any segment the second dimension cannot be inferred from the
        # data, so build the fully padded document explicitly.
        return np.full((doc_max_seg, seg_max_len), pad_value)
    # Build new arrays instead of overwriting the caller's list entries.
    seg_vecs = [
        __unilen_vec(np.array([__word_to_id(word) for word in seg]), seg_max_len, pad_value)
        for seg in cleaned_segs
    ]
    return __unilen_vec(np.array(seg_vecs), doc_max_seg, pad_value)

def clean_label(raw_label):
    ''' Extract the zero-based label from a label line.

    The first number of the form "<int>.<digits>" found by `label_re` is used;
    its integer part minus one is the label (e.g. "5.0" gives 4).

    Args:
        raw_label (str): The line containing label in the input file.

    Returns:
        (int): The label.

    Raises:
        IndexError: If the line contains no number matching `label_re`.
    '''
    integer_parts = label_re.findall(raw_label)
    first_rating = integer_parts[0]
    return int(first_rating) - 1

def dump_batch(h5_out, d_queue, l_queue, batch_index, document_out_dir, label_out_dir):
    ''' Write the queued reviews and labels as one batch and empty both queues.

    The reviews are stored under '<document_out_dir><batch_index>' and the labels
    under '<label_out_dir><batch_index>'. Each queue is converted with `np.array`
    before being stored and is cleared right after its own write.

    Args:
        h5_out (h5py.File): The file handle of output file.
        d_queue (deque(np.array)): The queue containing reviews. This will be
            cleared in the call.
        l_queue (deque(np.array)): The queue containing labels. This will be
            cleared in the call.
        batch_index (int): The current index of batch.
        document_out_dir (str): The key prefix of the review batches.
        label_out_dir (str): The key prefix of the label batches.
    '''
    suffix = str(batch_index)
    # Reviews first, then labels — same write/clear order for both queues.
    for prefix, queue in ((document_out_dir, d_queue), (label_out_dir, l_queue)):
        h5_out[prefix + suffix] = np.array(queue)
        queue.clear()

class Len_Counter():
    ''' A counter to summarize the length distribution of the collected lists.

    Args:
        max_len (int): The number of tracked lengths. Lengths 0 .. `max_len` - 1
            are counted; lists of any other length are ignored.
        description (str): The description of the functionality of this counter.
    '''
    def __init__(self, max_len, description=''):
        self.len_dict = {l: 0 for l in range(max_len)}
        self.description = description

    def add(self, x):
        ''' Add a new list to summarize.

        Args:
            x (list): The list to be added. It is ignored when its length is not
                one of the tracked lengths.
        '''
        length = len(x)
        if length in self.len_dict:
            self.len_dict[length] += 1

    def summary_distribution(self, checkpoints=(0.01, 0.05, 0.1, 0.2, 0.3, 0.5)):
        ''' Print the summary of the lists' length distribution.

        Lengths are accumulated from the longest to the shortest. For each ratio p
        the printed length is the first one that has not been accumulated once the
        accumulated (longer) lists make up at least a fraction p of all counted
        lists.

        Args:
            checkpoints ([float]): The ratios (fractions between 0 and 1) to report.
        '''
        print('\n', self.description, 'summary:')
        # Constant during the walk below, so compute it once.
        total = sum(self.len_dict.values())
        if total == 0:
            # Nothing was counted; a ratio would divide by zero.
            print('No lists collected.')
            return
        sorted_len = sorted(self.len_dict.items(), key=lambda x: x[0], reverse=True)
        last_index = len(sorted_len) - 1
        part_sum, sum_ptr = 0, 0
        for checkpoint in sorted(checkpoints):
            while sum_ptr <= last_index and part_sum / total < checkpoint:
                part_sum += sorted_len[sum_ptr][1]
                sum_ptr += 1
            # Once every bucket has been consumed, report the shortest tracked
            # length instead of indexing past the end.
            print('Length', sorted_len[min(sum_ptr, last_index)][0], 'truncate ratio', checkpoint)

def read_w2v(bin_path):
    ''' Read w2v as dictionary from .bin file.

    Expected layout: a text header line "<word count> <vector length>", followed
    by one record per word consisting of the word, a single space and the vector
    as raw float32 values. Newline bytes in front of a word are skipped.

    Words that are not pure ASCII are left out of the result; their vectors are
    still consumed so that the following records stay aligned.

    Args:
        bin_path (str): The path of .bin w2v file.

    Returns:
        {str : np.array}: The w2v dictionary.

    Raises:
        EOFError: If the file ends before all announced records were read.
    '''
    with open(bin_path, 'rb') as in_file:
        w2v = {}
        word_cnt, vec_len = map(int, in_file.readline().split())
        bin_len = np.dtype('float32').itemsize * vec_len
        # Integer step so that the progress line is printed about 100 times.
        progress_step = max(1, word_cnt // 100)
        for index in range(word_cnt):
            word_bytes = bytearray()
            while True:
                ch = in_file.read(1)
                if ch == b'':
                    # read() returns b'' forever at EOF; fail instead of looping.
                    raise EOFError('Unexpected end of file while reading a word.')
                if ch == b' ':
                    break
                # `ch` is bytes, so it has to be compared with a bytes literal.
                if ch != b'\n':
                    word_bytes += ch
            # Always consume the vector, even if the word is dropped below.
            raw_vec = in_file.read(bin_len)
            if len(raw_vec) != bin_len:
                raise EOFError('Unexpected end of file while reading a vector.')
            try:
                word = word_bytes.decode('ascii')
            except UnicodeDecodeError:
                word = None
            if word is not None:
                # frombuffer gives a read-only view; copy to get a writable array.
                w2v[word] = np.frombuffer(raw_vec, dtype='float32').copy()
            if index % progress_step == 0:
                print('\r w2v loading:', round(index / word_cnt * 100, 0), end='%')
        print('\rLoading finished.')
        return w2v

def read_fasttext(fasttext_path):
    ''' Read fasttext as dictionary from text file.

    Expected layout: a header line "<word count> <dimension>", followed by one
    line per word holding the word and its vector components separated by single
    spaces. The file is read as UTF-8.

    Args:
        fasttext_path (str): The path of text fasttext file.

    Returns:
        {str : np.array}: The fasttext dictionary.
    '''
    # Explicit encoding so the result does not depend on the platform default.
    with open(fasttext_path, encoding='utf-8') as in_file:
        word_cnt, _dimension = map(int, in_file.readline().split())
        # Integer step (at least 1) so the progress line is printed about 100
        # times and never divides by zero.
        progress_step = max(1, word_cnt // 100)
        fasttext = {}
        for cnt, line in enumerate(in_file, start=1):
            tokens = line.rstrip().split(' ')
            fasttext[tokens[0]] = np.array([float(value) for value in tokens[1:]])
            if cnt % progress_step == 0 and word_cnt > 0:
                print('\r fasttext loading:', round(cnt / word_cnt * 100, 0), end='%')
        return fasttext

def dump_embedding_weight(w2v, w2id, output_path):
    ''' Dump the embedding weight with respect to `w2id`.

    The dumped matrix holds in row i the vector of the word with id i. Rows of
    words without a vector in `w2v` (and row 0, which no word uses when ids start
    at 1) stay all zero.

    The number of columns is taken from the first vocabulary word found in `w2v`;
    the global `w2v_vec_len` is only used when no word has a vector.

    The file is written with `ndarray.dump`, i.e. as a pickle of the array and
    not in the `.npy` format (load it with `pickle.load` or
    `np.load(..., allow_pickle=True)`).

    Args:
        w2v ({str : np.array}): The w2v dictionary.
        w2id ({str : int}): The dictionary mapping word to their id.
        output_path (str): The path for dumping the weights file.
    '''
    first_hit = next((w2v[word] for word in w2id if word in w2v), None)
    vec_len = w2v_vec_len if first_hit is None else len(first_hit)
    # Size by the largest id as well, so a vocabulary with gaps in its ids
    # cannot index past the end of the matrix.
    row_cnt = max(len(w2id), max(w2id.values(), default=0)) + 1
    weight_matrix = np.zeros((row_cnt, vec_len))
    for word, index in w2id.items():
        if word in w2v:
            weight_matrix[index] = w2v[word]
    weight_matrix.dump(output_path)

In [9]:
# Preprocess the labelled review file: clean and segment every review, convert
# it to a fixed-size id matrix and write documents / labels to the HDF5 file in
# batches of `batch_size`. The vocabulary `w2id` grows as a side effect and is
# pickled periodically and once more at the end.
seg_len_counter = Len_Counter(seg_max_len, 'Segment Length')
doc_len_counter = Len_Counter(doc_max_seg, 'Document Length')

with open(edu_file_path) as in_file, h5py.File(h5_out_path, 'w') as out_file:
    document_queue = deque()
    label_queue = deque()
    # `cnt` counts the documents that were kept (skipped ones are not counted).
    cnt, batch_cnt = 0, 0
    for document, label in load_labelled_document(in_file):
        cleaned_label = clean_label(label)
        cleaned_segs = clean_split(document, edu_re, seg_min_len, word_min_len, lemmatizer=lemmatizer)
        if len(cleaned_segs) < doc_min_seg:
            continue
        # Collect the length statistics before the segments are turned into ids.
        for seg in cleaned_segs:
            seg_len_counter.add(seg)
        doc_len_counter.add(cleaned_segs)
        document_queue.append(doc_to_vec(cleaned_segs, doc_max_seg, seg_max_len, pad_value))
        label_queue.append(cleaned_label)
        # Count the document before the checks below; checking first made the
        # very first batch one document larger than `batch_size`.
        cnt += 1
        if cnt % batch_size == 0:
            print('\rDumping data ...', end='')
            dump_batch(out_file, document_queue, label_queue, batch_cnt, document_out_dir, label_out_dir)
            batch_cnt += 1
        if cnt % w2id_backup_batch == 0:
            print('\rDumping w2id ...', end='')
            with open(w2id_out_path, 'wb') as w2id_out:
                pickle.dump(w2id, w2id_out)
        if cnt % 100 == 0:
            print('\r', cnt, 'items processed.',end='')
    seg_len_counter.summary_distribution()
    doc_len_counter.summary_distribution()
    print('\nFinal dumping ...')
    # Skip the final batch when the last regular dump already emptied the queues.
    if document_queue:
        dump_batch(out_file, document_queue, label_queue, batch_cnt, document_out_dir, label_out_dir)
    with open(w2id_out_path, 'wb') as w2id_out:
        pickle.dump(w2id, w2id_out)
    print('Finished.')

 407000 items processed.
 Segment Length summary:
Length 15 truncate ratio 0.01
Length 10 truncate ratio 0.05
Length 9 truncate ratio 0.1
Length 7 truncate ratio 0.2
Length 6 truncate ratio 0.3
Length 4 truncate ratio 0.5

 Document Length summary:
Length 18 truncate ratio 0.01
Length 16 truncate ratio 0.05
Length 14 truncate ratio 0.1
Length 11 truncate ratio 0.2
Length 9 truncate ratio 0.3
Length 6 truncate ratio 0.5

Final dumping ...
Finished.


In [6]:
# Load the pre-trained word2vec vectors as a {word: vector} dictionary.
w2v = read_w2v(w2v_path)
# NOTE(review): the vocabulary is read from a hard-coded path that differs from
# `w2id_out_path` written by the preprocessing cell, and it replaces the global
# `w2id` — confirm this is the vocabulary the weights should be built for.
# pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/Volumes/CCsChunk2/projects/nlp/milnet/inputs/food_sentence/w2id.pkl', 'rb') as in_file:
    w2id = pickle.load(in_file)
# Write the embedding matrix (row i = vector of the word with id i).
dump_embedding_weight(w2v, w2id, w2v_weights_out_path)



Loading finished.0%
