In [74]:
import nltk
import gensim
import numpy as np
import pandas as pd
import os
import pickle
import gc

### Parameters

In [75]:
# Name of the raw text file to process; it is read from ./raw_texts/ and
# reused as the prefix of every artifact written to ./data/.
filename = 'test.txt'
#filename = 'en_US.blogs.txt'

# Number of token ids per input row (columns of the train_X CSV).
IN_SEQ_LENGTH = 3
# Number of token ids per target row (columns of the train_Y CSV).
OUT_SEQ_LENGTH = 3

### Create corpora

In [76]:
# Placeholder tokens stored in the dictionary alongside real words.
# Fallback for tokens that are missing from the dictionary.
META_UNKNOWN = '<<<!UNK!>>>'
# Padding used to fill unused positions of input/output sequences.
META_EMPTY = '<<<!EMP!>>>'
# Replacement for tokens made only of digits.
META_NUMBER = '<<<!NUM!>>>'

# Passed as `no_below` to Dictionary.filter_extremes.
no_below_tokens = 0

In [77]:
# One single-token document per meta symbol, so the list can be handed to
# corpora.add_documents.
meta = [[META_UNKNOWN], [META_EMPTY], [META_NUMBER]]

In [78]:
# Token <-> integer id dictionary; filled by update_corpora and by the meta symbols.
corpora = gensim.corpora.Dictionary()

### Helper function to convert raw text to word ids representation

In [79]:
def update_corpora(sents):
    """Tokenize each sentence and add its tokens to the global `corpora`.

    Every sentence in `sents` is split with NLTK's wordpunct tokenizer.
    Tokens consisting only of digits are replaced by META_NUMBER before the
    tokenized sentences are added to the dictionary as documents.
    Returns nothing; the dictionary is updated in place.
    """
    tokenized_sents = [
        [META_NUMBER if word.isdigit() else word
         for word in nltk.tokenize.wordpunct_tokenize(sentence)]
        for sentence in sents
    ]
    corpora.add_documents(tokenized_sents)

### Build corpora

In [80]:
# Read the whole source file and lower-case it.
# The context manager closes the handle as soon as the text is in memory
# (the previous bare open(...).read() left it to the garbage collector).
# NOTE(review): the platform default encoding is still used - pass
# encoding='utf-8' explicitly if the corpus is known to be UTF-8.
print('Reading file...')
with open('./raw_texts/' + filename) as raw_file:
    raw_text = raw_file.read().lower()

Reading file...


In [81]:
# Split the lower-cased text into sentences with NLTK's sentence tokenizer.
print('Parsing sentences...')
raw_sents = nltk.tokenize.sent_tokenize(raw_text)

Parsing sentences...


In [82]:
# Tokenize every sentence and register its tokens in the dictionary.
update_corpora(raw_sents)

In [83]:
# Filter tokens by document frequency: drop those appearing in fewer than
# `no_below_tokens` documents or in more than 80% of the documents, then keep
# at most the 20000 most frequent of the remainder.
corpora.filter_extremes(no_below=no_below_tokens, no_above=0.8, keep_n=20000)

In [84]:
# Add meta symbols

In [85]:
# Register the meta symbols after the frequency filter has run.
# NOTE(review): presumably done at this point so filter_extremes cannot remove them - confirm.
corpora.add_documents(meta)
#corpora.compactify()

In [86]:
### Convert raw sentences into id sentences

In [87]:
# Convert every raw sentence into a list of dictionary ids.
# Digit-only tokens are mapped to META_NUMBER, mirroring update_corpora: it
# stores digits under that symbol, so a raw digit string is never a dictionary
# key and previously always fell through to META_UNKNOWN.
# Any other token missing from the dictionary maps to META_UNKNOWN.
unk_id = corpora.token2id[META_UNKNOWN]
num_id = corpora.token2id[META_NUMBER]

id_sents = []
for s in raw_sents:
    raw_tokens = nltk.tokenize.wordpunct_tokenize(s)
    id_tokens = []
    for t in raw_tokens:
        if t.isdigit():
            tid = num_id
        else:
            # dict.get replaces the former bare `except:`, which also hid
            # unrelated errors.
            tid = corpora.token2id.get(t, unk_id)
        id_tokens.append(tid)
    id_sents.append(id_tokens)

In [88]:
# Report how many sentences were converted to ids.
print('Sentences %d'%(len(id_sents)))

Sentences 4


In [89]:
# Persist the id sentences so the "Reload data" step can restore them.
with open('./data/'+filename+'.id.dat', 'wb') as f:
    pickle.dump(id_sents, f)

In [90]:
# raw_sents is not referenced again below; drop it and trigger a collection.
del(raw_sents)
gc.collect()

351

### Corpora info

In [91]:
# Materialize every dictionary item into a list.
# NOTE(review): presumably this forces gensim to build its id -> token mapping - confirm.
tmp = list(corpora.items())

In [92]:
#tmp

In [93]:
# Save the dictionary twice: with Dictionary.save (this file is reloaded in the
# recovery step) and as a text file via save_as_text.
corpora.save('./data/' + filename + '.corpora.dat')
corpora.save_as_text('./data/'+filename+'.corpora.txt')

In [94]:
# Report the dictionary size (meta symbols included).
print('Number of words in corpora: %d'%(len(corpora)))

Number of words in corpora: 29


### Test tokens

In [95]:
#corpora.id2token[15000]

In [96]:
# Drop the temporary item list and trigger a collection.
del(tmp)
gc.collect()

0

### Reload data (recovery steps)

In [97]:
# Restore the dictionary saved earlier and record its size.
corpora = gensim.corpora.Dictionary.load('./data/'+filename+'.corpora.dat')
vocab_size = len(corpora)
print('Number of words in corpora: %d'%(vocab_size))
# NOTE(review): the list is discarded immediately; presumably iterating the items
# is only meant to make gensim rebuild its id -> token mapping - confirm.
tmp = list(corpora.items())
del(tmp)

Number of words in corpora: 29


In [98]:
# Restore the id sentences pickled earlier.
# NOTE: pickle.load can execute arbitrary code from the file; only load files
# produced by this notebook.
with open('./data/'+filename+'.id.dat', 'rb') as f:
    id_sents = pickle.load(f)
    
print('Sentences %d'%(len(id_sents)))

Sentences 4


### Convert raw text to id text

In [99]:
# Integer ids of the padding and unknown-token symbols, looked up once for the
# training-set loops below.
ID_META_EMPTY = corpora.token2id[META_EMPTY]
ID_META_UNKNOWN = corpora.token2id[META_UNKNOWN]

In [100]:
# Flatten the per-sentence id lists, in order, into one array of token ids.
id_text = np.array(
    [token_id for sentence_ids in id_sents for token_id in sentence_ids]
)

In [101]:
# Report the total number of tokens in the flattened text.
print('Tokens: %d'%(len(id_text)))

Tokens: 30


### BUILD TRAINING SET

In [102]:
def csv_header(sufix, count):
    """Build a CSV header line with `count` column names.

    Columns are named '<sufix>_0', '<sufix>_1', ... and joined by commas.
    The returned string always ends with a newline; a non-positive `count`
    yields just the newline.
    """
    columns = [sufix + '_' + str(idx) for idx in range(count)]
    return ','.join(columns) + '\n'

In [103]:
def csv_line(a):
    """Serialize the entries along the first axis of array `a` as one CSV line.

    Each entry is converted with str() and the entries are comma-separated.
    The returned string always ends with a newline; an array whose first
    dimension is 0 yields just the newline.
    """
    cells = (str(a[idx]) for idx in range(a.shape[0]))
    return ','.join(cells) + '\n'

In [104]:
#csv_header('in', IN_SEQ_LENGTH)

In [105]:
#csv_line( np.array([1,2,3,4,5]) )

In [106]:
# Open the CSV outputs for inputs (X) and targets (Y); both sequence lengths are
# embedded in the file names. The handles stay open across the following cells
# and are closed in the "Save train set" step.
f_train_X = open('./data/' + filename + '.' + str(IN_SEQ_LENGTH) + '_in_' + str(OUT_SEQ_LENGTH) + '_out.train_X.csv', 'wt')
f_train_Y = open('./data/' + filename + '.' + str(IN_SEQ_LENGTH) + '_in_' + str(OUT_SEQ_LENGTH) + '_out.train_Y.csv', 'wt')

In [107]:
# Write the header rows: in_0..in_{IN_SEQ_LENGTH-1} and out_0..out_{OUT_SEQ_LENGTH-1}.
f_train_X.write(csv_header('in', IN_SEQ_LENGTH))
f_train_Y.write(csv_header('out', OUT_SEQ_LENGTH))

18

In [108]:
#train_X = []
#train_Y = []

In [109]:
# Running count of training rows written to the CSV files.
seq_cnt = 0

In [110]:
# Sentence-start samples: for each sentence, emit rows whose input holds the
# first `shift_i` tokens (right-aligned, left-padded with META_EMPTY) and whose
# output holds the tokens that follow (right-padded with META_EMPTY).
for id_s in id_sents:

    # Sentences with fewer than two tokens are skipped.
    if len(id_s) < 2:
        continue

    # Never shift by more than the sentence length: bounding the shift by
    # IN_SEQ_LENGTH alone indexed past the end of sentences shorter than
    # IN_SEQ_LENGTH - 1 tokens and raised IndexError.
    for shift_i in range(0, min(IN_SEQ_LENGTH, len(id_s) + 1), 1):

        seq_in = np.full((IN_SEQ_LENGTH), ID_META_EMPTY, dtype=int)
        if shift_i > 0:
            seq_in[IN_SEQ_LENGTH - shift_i:] = id_s[:shift_i]

        seq_out = np.full((OUT_SEQ_LENGTH), ID_META_EMPTY, dtype=int)
        following = id_s[shift_i:shift_i + OUT_SEQ_LENGTH]
        if len(following) > 0:
            seq_out[:len(following)] = following

        # Keep the row unless the input contains an unknown token or the
        # output consists solely of unknown tokens.
        if np.any(seq_out != ID_META_UNKNOWN) and np.all(seq_in != ID_META_UNKNOWN):
            f_train_X.write(csv_line(seq_in))
            f_train_Y.write(csv_line(seq_out))
            seq_cnt += 1
            # Flush both files together every 1000 rows.
            if (seq_cnt % 1000) == 0:
                f_train_X.flush()
                f_train_Y.flush()
            #train_X.append(list(seq_in))
            #train_Y.append(list(seq_out))
            #print('Shift: {}, {} -> {}'.format(shift_i, seq_in, seq_out))

In [111]:
# id_sents is not referenced again below (the next loop reads id_text); drop it
# and trigger a collection.
del(id_sents)
gc.collect()

579

### Skip the next calculation if there are any data issues

In [112]:
#seq_in = []
#seq_out = []
# Sliding-window samples over the flattened token stream: each run of
# IN_SEQ_LENGTH ids is paired with the OUT_SEQ_LENGTH ids that follow it.
window = IN_SEQ_LENGTH + OUT_SEQ_LENGTH
for start in range(len(id_text) - window + 1):
    split = start + IN_SEQ_LENGTH
    seq_in = id_text[start:split]
    seq_out = id_text[split:split + OUT_SEQ_LENGTH]

    # Skip rows with an unknown token in the input or only unknowns in the output.
    has_known_target = np.any(seq_out != ID_META_UNKNOWN)
    input_is_clean = np.all(seq_in != ID_META_UNKNOWN)
    if not (has_known_target and input_is_clean):
        continue

    f_train_X.write(csv_line(seq_in))
    f_train_Y.write(csv_line(seq_out))
    seq_cnt += 1
    # Flush both files together every 1000 rows.
    if seq_cnt % 1000 == 0:
        f_train_X.flush()
        f_train_Y.flush()

        #train_X.append(list(seq_in))
        #train_Y.append(list(seq_out))

### Save train set

In [113]:
# Close both CSV files; closing also writes out any rows still buffered.
f_train_Y.close()
f_train_X.close()

In [114]:
#print('Train set X, Y:')
#print( len(train_X), len(train_Y) )

In [115]:
# Report the total number of rows written to each CSV file.
print('Train set: %d'%(seq_cnt))

Train set: 37


In [116]:
#train_X[1000001]

In [117]:
#with open('./data/'+filename+'.train_X.dat', 'wb') as f:
#    pickle.dump(train_X, f)

In [118]:
#with open('./data/'+filename+'.train_Y.dat', 'wb') as f:
#    pickle.dump(train_Y, f)