In [0]:
import numpy as np
import re
import pickle
import time
import h5py
import operator
from nltk.corpus import stopwords
import spacy
from collections import deque
from matplotlib import pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/gdrive; every data path configured below lives under this mount point.
# The call is interactive: it prompts for an authorization code (see the cell output).
drive.mount('/content/gdrive')

import nltk
# Download the NLTK stop-word corpus that stopwords.words('english') reads later in the notebook.
nltk.download('stopwords')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# --- Input files -----------------------------------------------------------
# Labelled review corpora: the first is segmented with sentence_re, the second with edu_re.
sentence_file_path = '/content/gdrive/My Drive/data_source/milnet/preprocessing/electronics_sentences.txt'
edu_file_path = '/content/gdrive/My Drive/data_source/milnet/preprocessing/electronics_edus.txt'
# Pickled word -> vector dict (the format written by dump_w2v).
w2v_path = '/content/gdrive/My Drive/data_source/milnet/preprocessing/w2v.pkl'
# --- Output files ----------------------------------------------------------
# HDF5 file receiving the batched document/label arrays.
h5_out_path = '/content/gdrive/My Drive/data_source/milnet/results/electronics.hdf5'
# Pickled word -> id vocabulary (w2id).
w2id_out_path = '/content/gdrive/My Drive/data_source/milnet/results/w2id.pkl'
# Embedding weight matrix; written with ndarray.dump, i.e. as a pickle despite the .npy suffix.
weights_out_path = '/content/gdrive/My Drive/data_source/milnet/results/weights.npy'

# HDF5 name prefixes; each batch is stored as '<prefix><batch index>'.
document_out_dir = 'document/'
label_out_dir = 'label/'

# Pickled length/frequency histograms produced by the sentence-level statistics cell.
seglen_out_path = '/content/gdrive/My Drive/data_source/milnet/results/segment_summary/sentence_level/seglens.pkl'
doclen_out_path = '/content/gdrive/My Drive/data_source/milnet/results/segment_summary/sentence_level/doclens.pkl'
wordcnt_out_path = '/content/gdrive/My Drive/data_source/milnet/results/segment_summary/sentence_level/wordcnt.pkl'

# Optional newline-separated stop-word file; None keeps the NLTK English list.
stop_words_path = None

# Width of one word vector; dump_w2v overwrites this with the value from the binary header.
w2v_vec_len = 300
# Tokens shorter than this many characters are dropped by __clean_segment.
word_min_len = 2
# clean_segments keeps only segments with strictly more than this many tokens.
seg_min_len = 2
# Segments are truncated/padded to this many tokens (34 is the 0.1 truncate-ratio checkpoint printed below).
seg_max_len = 34

# Documents with fewer cleaned segments than this are skipped.
doc_min_seg = 1
# Documents are truncated/padded to this many segments (12 is the 0.1 truncate-ratio checkpoint printed below).
doc_max_seg = 12

# Padding value; real word ids start at 1 (see __word_to_id), so 0 never collides with a word.
pad_value = 0

# Number of documents per HDF5 batch.
batch_size = 200
# The w2id vocabulary is re-pickled every this many processed documents.
w2id_backup_batch = 5000

In [0]:
# All patterns are written as raw strings so that regex escapes such as \d, \. and \s
# reach the regex engine untouched; in a normal string literal they are invalid escape
# sequences, which newer Python versions warn about.

# Integer part of a decimal rating such as "5.0".
label_re = re.compile(r'(\d+)\.\d+')
# Sentence boundary: ., ! or ? followed by whitespace.
sentence_re = re.compile(r'(?:\.|!|\?)\s')
# EDU boundary: a sentence boundary or a literal EDU_BREAK marker, followed by whitespace.
edu_re = re.compile(r'(?:\.|!|\?|(?:EDU_BREAK))\s')

# Runs of two or more whitespace characters.
multi_space_re = re.compile(r'\s{2,}')
# URLs starting with "www." (optionally prefixed by "http://"), up to the next space.
url_re = re.compile(r'(http://)?www\.[^ ]+')
# Anything that is not a lowercase letter, digit, '$', apostrophe or underscore.
unknown_char_re = re.compile(r"[^a-z0-9$'_]")
# Runs of two or more underscores.
multi_udl_re = re.compile(r'_{2,}')
# English contractions, expanded by __clean_segment.
abram_re = re.compile(r"'m")
abris_re = re.compile(r"'s")
abrare_re = re.compile(r"'re")
abrhave_re = re.compile(r"'ve")
abrnot_re = re.compile(r"n't")
abrwd_re = re.compile(r"'d")
abrwill_re = re.compile(r"'ll")
# A run of digits with a single space on each side (numbers at the very start or end
# of a string are therefore not matched).
num_re = re.compile(r'(?<= )[0-9]+(?= )')
# A capital "I" directly after a lowercase letter; it can only match text that has
# not been lower-cased yet.
mixi_re = re.compile(r'(?<=[a-z])I')

# Default stop-word list: NLTK's English corpus (requires the 'stopwords' download above).
stop_words = set(stopwords.words('english'))
# An optional user file replaces the NLTK list entirely (it is not merged with it).
if stop_words_path is not None:
    with open(stop_words_path) as in_file:
        file_content = in_file.read()
        # One stop word per line; a trailing newline leaves an empty-string entry in the set.
        stop_words = set(file_content.split('\n'))
# spaCy pipeline used only for its lemmas in clean_segments.
lemmatizer = spacy.load('en_core_web_sm')

In [0]:
# Global vocabulary mapping each word to its integer id; filled on demand by __word_to_id.
w2id = {}

def __unilen_vec(input_vec, target_len):
    """Force the first axis of ``input_vec`` to have exactly ``target_len`` entries.

    Longer arrays are cut off after ``target_len`` entries, shorter ones are
    extended at the end of axis 0 with the global ``pad_value``; all other axes
    are left untouched. An array that already has the right length is returned
    as-is (same object).
    """
    global pad_value
    current_len = input_vec.shape[0]
    if current_len == target_len:
        return input_vec
    if current_len > target_len:
        return input_vec[0:target_len]
    # Pad only after the existing entries on axis 0; every other axis gets (0, 0).
    missing = target_len - current_len
    pad_width = [(0, missing)] + [(0, 0)] * (len(input_vec.shape) - 1)
    return np.pad(input_vec, pad_width, 'constant', constant_values=pad_value)

def __word_to_id(word):
    """Return the integer id of ``word``, registering it in the global ``w2id`` if new.

    Ids are handed out in first-seen order starting at 1, so 0 is never
    assigned to a word.
    """
    global w2id
    if word not in w2id:
        w2id[word] = len(w2id) + 1
    return w2id[word]

def __clean_segment(raw_segment):
    """Normalize one raw text segment and return its list of tokens.

    Steps, in order: replace URLs with ``_url_``, split a capital "I" glued to
    the preceding lowercase word, lower-case, blank out characters outside
    ``[a-z0-9$'_]``, collapse underscore runs, expand common contractions,
    replace space-delimited numbers with ``<NUM>``, collapse whitespace, split
    on single spaces and drop tokens shorter than the global ``word_min_len``.

    The contraction rules are plain substitutions, so e.g. a possessive ``'s``
    also becomes `` is``.
    """
    global word_min_len
    # URLs are replaced first, and case-insensitively, so that the result is the
    # same as matching url_re on lower-cased text while the capital-I split
    # below cannot cut a URL in two.
    url_any_case_re = re.compile(url_re.pattern, url_re.flags | re.IGNORECASE)
    raw_segment = url_any_case_re.sub(' _url_', raw_segment)
    # mixi_re looks for a capital "I", so it must run BEFORE lower-casing; run
    # afterwards it can never match.
    raw_segment = mixi_re.sub(' I', raw_segment)
    raw_segment = raw_segment.lower()
    raw_segment = unknown_char_re.sub(' ', raw_segment)
    raw_segment = multi_udl_re.sub('_', raw_segment)
    raw_segment = abram_re.sub(' am', raw_segment)
    raw_segment = abris_re.sub(' is', raw_segment)
    raw_segment = abrare_re.sub(' are', raw_segment)
    raw_segment = abrhave_re.sub(' have', raw_segment)
    raw_segment = abrnot_re.sub(' not', raw_segment)
    raw_segment = abrwd_re.sub(' would', raw_segment)
    raw_segment = abrwill_re.sub(' will', raw_segment)
    # Runs after unknown_char_re so the '<' and '>' of the placeholder survive.
    raw_segment = num_re.sub('<NUM>', raw_segment)
    raw_segment = multi_space_re.sub(' ', raw_segment)
    return [token for token in raw_segment.split(' ') if len(token) >= word_min_len]

def load_labelled_document(fp):
    """Yield ``(document_text, label_line)`` pairs from an iterable of lines.

    A record is a label line followed by zero or more text lines; records are
    separated by one or more blank (whitespace-only) lines. The text lines are
    concatenated unchanged (line endings included) and the label line is
    returned verbatim.

    Compared with a naive "yield on every blank line" reader this:
    * yields the last record even when the input has no trailing blank line,
    * ignores leading and repeated blank lines instead of yielding an empty or
      stale record,
    * yields an empty string (not the previous record's text) for a record
      that consists of a label line only.
    """
    label_cache = None
    body_lines = []
    for line in fp:
        if line.strip() == '':
            # Blank line: close the current record, if one has been started.
            if label_cache is not None:
                yield ''.join(body_lines), label_cache
            label_cache = None
            body_lines = []
            continue
        if label_cache is None:
            # First non-blank line of a record is its label.
            label_cache = line
        else:
            body_lines.append(line)
    # Flush the final record when the input does not end with a blank line.
    if label_cache is not None:
        yield ''.join(body_lines), label_cache

def segment_document(raw_document, split_re):
    """Split ``raw_document`` into raw segments at every match of ``split_re``.

    ``split_re`` is a compiled regular expression; the list produced by its
    ``split`` method is returned unchanged.
    """
    return split_re.split(raw_document)

def clean_segments(raw_segments, stopword_remove=True, lemmatize=True):
    """Clean raw segments into token lists, optionally filtering and lemmatizing.

    Args:
        raw_segments: iterable of raw segment strings.
        stopword_remove: drop tokens found in the global ``stop_words`` set.
        lemmatize: replace every token by the lemma assigned by the global
            spaCy pipeline ``lemmatizer``.

    Returns:
        A list of token lists, keeping only segments with strictly more than
        ``seg_min_len`` tokens.
    """
    global seg_min_len, stop_words, lemmatizer
    segments = [__clean_segment(seg) for seg in raw_segments]
    if stopword_remove:
        # The previous filter() call was missing its iterable and raised TypeError.
        segments = [[word for word in seg if word not in stop_words] for seg in segments]
    if lemmatize:
        from spacy.tokens import Doc
        doc_words = [word for seg in segments for word in seg]
        if doc_words:
            # Build the Doc from the existing tokens rather than from joined text:
            # spaCy's own tokenizer may split a token (e.g. '<NUM>') or emit
            # whitespace tokens, which would shift every later segment boundary.
            doc = Doc(lemmatizer.vocab, words=doc_words)
            for _, component in lemmatizer.pipeline:
                doc = component(doc)
            lemmas = [token.lemma_ for token in doc]
            # One lemma per input token, so segments are restored by their lengths.
            lemmatized_segments = []
            offset = 0
            for seg in segments:
                lemmatized_segments.append(lemmas[offset:offset + len(seg)])
                offset += len(seg)
            segments = lemmatized_segments
    # NOTE(review): the comparison is strict, so segments with exactly
    # seg_min_len tokens are dropped - confirm this is intended.
    return [seg for seg in segments if len(seg) > seg_min_len]

def doc_to_vec(cleaned_segs):
    """Convert a document (list of token lists) into a fixed-size array of word ids.

    Every token is mapped through ``__word_to_id`` (growing the global
    vocabulary as needed), each segment is truncated/padded to ``seg_max_len``
    ids and the document to ``doc_max_seg`` segments.

    The input list is not modified. An empty document yields an all-padding
    array of the regular shape.

    Returns:
        Integer array of shape ``(doc_max_seg, seg_max_len)``.
    """
    global seg_max_len, doc_max_seg
    seg_vecs = [
        __unilen_vec(np.array([__word_to_id(word) for word in seg], dtype=int), seg_max_len)
        for seg in cleaned_segs
    ]
    if not seg_vecs:
        # np.array([]) would be 1-D float and break stacking of the batch later on.
        return np.full((doc_max_seg, seg_max_len), pad_value, dtype=int)
    return __unilen_vec(np.array(seg_vecs), doc_max_seg)

def clean_label(raw_label):
    """Turn a raw rating line into a zero-based integer class.

    The integer part of the first decimal number matched by ``label_re`` is
    taken and shifted down by one. Raises IndexError when ``raw_label``
    contains no match.
    """
    integer_parts = label_re.findall(raw_label)
    rating = int(integer_parts[0])
    return rating - 1

def dump_batch(h5_out, d_queue, l_queue, batch_index):
    """Store the queued documents and labels as batch ``batch_index`` and empty the queues.

    Each queue is converted to a NumPy array and assigned to ``h5_out`` under
    ``document_out_dir`` / ``label_out_dir`` followed by the batch index; the
    queue is cleared right after its array has been written.
    """
    global document_out_dir, label_out_dir
    batch_name = str(batch_index)
    # Documents first, then labels - each queue is written and then cleared.
    for prefix, queue in ((document_out_dir, d_queue), (label_out_dir, l_queue)):
        h5_out[prefix + batch_name] = np.array(queue)
        queue.clear()

def dump_w2v(bin_path, output_path):
    """Convert a binary word2vec file into a pickled ``{word: float32 vector}`` dict.

    Expected layout of ``bin_path``: an ASCII header line ``"<word count>
    <vector length>"``, then for every word its bytes, one space, and
    ``vector length`` raw float32 values; newline bytes in front of a word are
    skipped.

    Side effect: the global ``w2v_vec_len`` is set to the vector length read
    from the header.

    Words whose bytes are not valid UTF-8 are skipped, but their vector is still
    consumed so the stream stays aligned.

    Raises:
        EOFError: if the file ends before all announced words were read.
    """
    global w2v_vec_len
    with open(bin_path, 'rb') as in_file:
        w2v = {}
        word_cnt, w2v_vec_len = map(int, in_file.readline().split())
        bin_len = np.dtype('float32').itemsize * w2v_vec_len
        # Integer step, so the progress line is printed about once per percent.
        progress_step = max(1, word_cnt // 100)
        for word_index in range(word_cnt):
            word_bytes = bytearray()
            while True:
                ch = in_file.read(1)
                if ch == b'':
                    # read() returns b'' at EOF; without this check the loop never ends.
                    raise EOFError('Unexpected end of file while reading word %d of %d' % (word_index + 1, word_cnt))
                if ch == b' ':
                    break
                # Compare against bytes: the file is opened in binary mode.
                if ch != b'\n':
                    word_bytes += ch
            # Always consume the vector, even if the word turns out to be undecodable.
            raw_vec = in_file.read(bin_len)
            if len(raw_vec) != bin_len:
                raise EOFError('Unexpected end of file while reading vector %d of %d' % (word_index + 1, word_cnt))
            try:
                word = word_bytes.decode('utf-8')
            except UnicodeDecodeError:
                word = None
            if word is not None:
                # frombuffer returns a read-only view of the bytes; copy to get an owned array.
                w2v[word] = np.frombuffer(raw_vec, dtype='float32').copy()
            if word_index % progress_step == 0:
                print('\r', round(word_index / word_cnt * 100, 0), end='')
        print('Process finished, dumpting ...')
        with open(output_path, 'wb') as out_file:
            pickle.dump(w2v, out_file)

def dump_embedding_weight(w2v, w2id, output_path):
    """Build the embedding matrix for the vocabulary and write it to ``output_path``.

    The matrix has ``len(w2id) + 1`` rows of ``w2v_vec_len`` columns. Row ``i``
    holds the vector of the word whose id is ``i``; rows of words missing from
    ``w2v`` - and row 0, which no word id maps to - stay zero. The array is
    written with ``ndarray.dump``, i.e. as a pickle.
    """
    row_count = len(w2id) + 1
    weight_matrix = np.zeros((row_count, w2v_vec_len))
    known_vectors = ((row, w2v[token]) for token, row in w2id.items() if token in w2v)
    for row, vector in known_vectors:
        weight_matrix[row] = vector
    weight_matrix.dump(output_path)

In [0]:
# Sentence-level statistics: histogram of segment lengths, histogram of
# segments per document, and word frequencies, each pickled to its own file.
word_counter, seg_len_counter, doc_len_coutner = {}, {}, {}
cnt = 0

with open(sentence_file_path) as in_file:
    for document, label in load_labelled_document(in_file):
        raw_sentences = segment_document(document, sentence_re)
        cleaned_segs = clean_segments(raw_sentences, stopword_remove=False)
        for segment in cleaned_segs:
            seg_len = len(segment)
            seg_len_counter[seg_len] = seg_len_counter.get(seg_len, 0) + 1
            for word in segment:
                word_counter[word] = word_counter.get(word, 0) + 1
        doc_len = len(cleaned_segs)
        doc_len_coutner[doc_len] = doc_len_coutner.get(doc_len, 0) + 1
        cnt += 1
        # Overwrite the same console line every 100 documents.
        if cnt % 100 == 0:
            print('\r', cnt, 'items processed.',end='')
    with open(seglen_out_path, 'wb') as sl_out, open(doclen_out_path, 'wb') as dl_out, open(wordcnt_out_path, 'wb') as wc_out:
        for histogram, sink in ((seg_len_counter, sl_out), (doc_len_coutner, dl_out), (word_counter, wc_out)):
            pickle.dump(histogram, sink)
    print('Finished.')

 1200 items processed.

In [0]:
# For each checkpoint ratio, walk the length histogram from the longest entry
# downwards until at least that share of segments (or documents) has been
# passed, then print the next shorter length.
with open(seglen_out_path, 'rb') as seg_in, open(doclen_out_path, 'rb') as doc_in:
    seg_len_counter = pickle.load(seg_in)
    doc_len_counter = pickle.load(doc_in)
    sorted_seglen = sorted(seg_len_counter.items(), key=lambda x: x[0], reverse=True)
    sorted_doclen = sorted(doc_len_counter.items(), key=lambda x: x[0], reverse=True)
    checkpoint_percents = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
    # The totals do not change while scanning, so compute them once instead of
    # on every loop iteration.
    seg_total = sum(count for length, count in sorted_seglen)
    doc_total = sum(count for length, count in sorted_doclen)
    seg_accum, doc_accum = 0, 0
    seg_ptr, doc_ptr = 0, 0
    for checkpoint in checkpoint_percents:
        # The pointer check comes first so an empty histogram never divides by zero.
        while seg_ptr < len(sorted_seglen) and seg_accum / seg_total < checkpoint:
            seg_accum += sorted_seglen[seg_ptr][1]
            seg_ptr += 1
        if sorted_seglen:
            # Clamp: once every entry is consumed, report the shortest length
            # instead of indexing past the end of the list.
            print('Segment len', sorted_seglen[min(seg_ptr, len(sorted_seglen) - 1)][0], 'truncate ratio', checkpoint)
    print('')
    for checkpoint in checkpoint_percents:
        while doc_ptr < len(sorted_doclen) and doc_accum / doc_total < checkpoint:
            doc_accum += sorted_doclen[doc_ptr][1]
            doc_ptr += 1
        if sorted_doclen:
            print('Document len', sorted_doclen[min(doc_ptr, len(sorted_doclen) - 1)][0], 'truncate ratio', checkpoint)

Segment len 75 truncate ratio 0.01
Segment len 44 truncate ratio 0.05
Segment len 34 truncate ratio 0.1
Segment len 25 truncate ratio 0.2
Segment len 20 truncate ratio 0.3
Segment len 14 truncate ratio 0.5

Document len 31 truncate ratio 0.01
Document len 17 truncate ratio 0.05
Document len 12 truncate ratio 0.1
Document len 8 truncate ratio 0.2
Document len 6 truncate ratio 0.3
Document len 3 truncate ratio 0.5


In [0]:
# Convert the EDU-segmented corpus into padded word-id arrays and store them,
# together with their labels, in batches inside the HDF5 file. The w2id
# vocabulary is pickled periodically and once more at the end.
with open(edu_file_path) as in_file, h5py.File(h5_out_path, 'w') as out_file:
    document_queue = deque()
    label_queue = deque()
    # cnt counts the documents that were kept, not the documents that were read.
    cnt, batch_cnt = 0, 0
    for document, label in load_labelled_document(in_file):
        cleaned_label = clean_label(label)
        cleaned_segs = clean_segments(segment_document(document, edu_re), stopword_remove=False)
        if len(cleaned_segs) < doc_min_seg:
            continue
        document_queue.append(doc_to_vec(cleaned_segs))
        label_queue.append(cleaned_label)
        # Count the document before the checks below; checking first would put
        # batch_size + 1 documents into the first batch.
        cnt += 1
        if cnt % batch_size == 0:
            print('\rDumping data ...', end='')
            dump_batch(out_file, document_queue, label_queue, batch_cnt)
            batch_cnt += 1
        if cnt % w2id_backup_batch == 0:
            print('\rDumping w2id ...', end='')
            with open(w2id_out_path, 'wb') as w2id_out:
                pickle.dump(w2id, w2id_out)
        if cnt % 100 == 0:
            print('\r', cnt, 'items processed.',end='')
    print('\rFinal dumping ...')
    # Skip the last dump when the corpus ended exactly on a batch boundary, so
    # that no empty dataset is written.
    if document_queue:
        dump_batch(out_file, document_queue, label_queue, batch_cnt)
    with open(w2id_out_path, 'wb') as w2id_out:
        pickle.dump(w2id, w2id_out)
    print('Finished.')

Final dumping ...
Finished.


In [0]:
# Load the pickled word vectors and the vocabulary saved by the previous cell,
# then write the embedding matrix aligned with the word ids.
# NOTE(review): pickle.load executes arbitrary code from the file - only load files you produced yourself.
with open(w2v_path, 'rb') as in_file:
    w2v = pickle.load(in_file)
with open(w2id_out_path, 'rb') as in_file:
    w2id = pickle.load(in_file)
dump_embedding_weight(w2v, w2id, weights_out_path)