In [1]:
import numpy as np
import os
import glob
import keras as kr
import ryan_tools as rt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class TextGetter():
    def get_article_list(self):
        os.chdir('cleaned')
        file_list = glob.glob('*.txt')
        os.chdir('..')
        return file_list

    def get_article(self, name):
        os.chdir('cleaned')
        with open(name, 'r') as f:
            text = f.read()
        os.chdir('..')
        return text
    
    def get_corpus(self):
        articles = self.get_article_list()
        result = ''
        bar = rt.progress_bar(len(articles))
        for art_name in articles:
            result = result + self.get_article(art_name)
            bar.progress()
        return result
    


In [3]:
def get_text():
    text_getter = TextGetter()
    text = text_getter.get_corpus()
    return text

In [4]:
def create_dictionaries(text):
    char_id = dict(zip(set(text), range(len(set(text)))))
    id_char = dict(zip(char_id.values(), char_id.keys()))
    return id_char, char_id

In [174]:
def sample(a, temperature=1.0):
    a = a**(1/temperature)
    a_sum = a.sum()
    sample_temp = a/(a.sum()*1.0001)
    sampled = np.random.multinomial(1, sample_temp, 1)
    return np.argmax(sampled)

def convert_char_id(text):
    result = []
    features = len(char_id)
    for i, char in enumerate(text):
        position = int(char_id[char])
        observation = np.zeros(( features))
        observation[position] = 1
        result.append(observation)
    return np.array(result)

def convert_id_char(onehot, temperature):
    result = ''
    for letter in onehot:
        l = sample(letter, temperature)
        result = result + id_char[l]
    return result

def get_X_y(list_of_one_hots, sentence_length, do_bar = True):
    y = []
    X = []
    if do_bar:
        bar = rt.progress_bar(len(list_of_one_hots), 100)
    for i, letter in enumerate(list_of_one_hots):
        if i > sentence_length:
            y.append(letter)

            x = list_of_one_hots[i- sentence_length:i]
            X.append(x)
        if do_bar:
            bar.progress()

    return np.array(X), np.array(y)

In [177]:
def generate_model(X, y):
    model = kr.models.Sequential()
    model.add(kr.layers.LSTM(128, input_shape=X.shape[1:],return_sequences= False ))
    model.add(kr.layers.Dense(X.shape[2]))
    model.add(kr.layers.Activation('softmax'))
    optimizer = kr.optimizers.Adam()
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
model = None

In [113]:
def predict(text, temperature):
    onehot = convert_char_id(text[-sentence_length:])
    return convert_id_char(model.predict(np.array([onehot]), 1), temperature)

In [111]:
def print_letters(seed_text, sentence_length, how_many, temperature= 1):
    while len(seed_text) < sentence_length:
        seed_text = ' ' + seed_text
        
    result = seed_text
    for num in range(0, how_many):
        prediction = predict(seed_text, temperature)
        seed_text = seed_text[1:] + prediction
        result = result + prediction
    return result

In [90]:
text = get_text()
id_char, char_id = create_dictionaries(text)
one_hots = convert_char_id(text)

0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o> | 100.00 % 
Done: 324 Remaining: 1, Remaining Time: 0s


In [178]:
sentence_length = 100
X,y  = get_X_y(one_hots[0: 1000], sentence_length)
model = generate_model(X, y)


0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0o0>          | 90.09 % 
Done: 900 Remaining: 100, Remaining Time: 0s


In [189]:
import time

In [242]:

def test_sizes(batch_size, samples, epochs):
    global start_point
    start = time.time()
    X,y  = get_X_y(one_hots[start_point: start_point + samples], sentence_length, False)
    callback = model.fit(X, y, epochs = epochs, batch_size= size, verbose =0 )
    start_point = start_point + samples
    time_taken = time.time() - start
    print(start_point)
    print('Projected Time for 10000 samples {}'.format(time_taken * 10000/(samples* epochs)))
    return callback

In [252]:
len('The five whys analysis is also known as the why-why chart and root cause analysis.')

82

In [None]:
for num in range(start_point, len(one_hots)):
    if num%160 == 0:
        start = num
        
        callback = test_sizes(160, 160, 100)
        print(min(callback.history['loss']))
        for temperature in [0.1, 0.3, .5, .7, 1]:
            print(temperature)
            seed_text = 'The five whys analysis is also known as the why-why chart and root cause analysis.'.lower()
            print(print_letters(seed_text, sentence_length, 100, temperature).capitalize()[len(seed_text):])
        if num%10000 == 0:
            rt.clear_output()

5720
Projected Time for 10000 samples 7.159374803304672
0.6902888417243958
0.1
ot cause analysis.mamis sefodededed apedened rer alenmenm-salormins rerker preror mergr prors mer ris rer wer alminm-s
0.3
ot cause analysis.mas as alaminm-sfrorkers erer aemenmers as alkers arseder penf ror ring as am rers rers per aomenys 
0.5
ot cause analysis.m-ald alminds aek areod mer r rorkers-menmacs amir rs-for wer pror ror rers-sprros mer rerks-forfoa 
0.7
ot cause analysis. ad alks-faomed ars lanmener prasks aorfordenssamer rd a> peror anmrrs ress ae reovernst rinsma>enma
1
ot cause analysis.smeincmanes-.ffeore mins rer pr-yomocyrony-mmarkirr-clomennryrss her har doopwrfepr er rhhgp-fomonmr
5880
Projected Time for 10000 samples 7.303125113248825
0.7517054080963135
0.1
ot cause analysis.tad aer teren 201 then 201 wan 2011 wend 2011 when s ayuts ratrructrrutrrutrr tarirs tractrrac ay cc
0.3
ot cause analysis.tatin rs aewer 2011 ween 201 whenk 201 and 201 and rs ay trructrrstrrutarurs taterwet r2011 

7640
Projected Time for 10000 samples 7.160000056028366
0.35808834433555603
0.1
ot cause analysis., <h3>expand one anfing and ane sane ;and more sanes and more sang ane sane sane sane sanes and more
0.3
ot cause analysis., <h3>expand inecoffind dngeced ine ang ane, sane more sane ;and more sanes; and mare sanes and more
0.5
ot cause analysis., <h3>expand ineconfind nnes;and in cand ane s ane ;o andin. <h, <h3>esploding ang and ;anes and san
0.7
ot cause analysis.</h3>></a>p oppiped ing on. ane, eacd anes;ang ofe anduin </p>, <hp>exs dind and monduses and ars an
1
ot cause analysis., <hp>exlhing condind nnn., andd hveecon. pane;s angd</>e, <h3>excivd inns;anding son mine, an seand
7800
Projected Time for 10000 samples 6.779999881982803
0.3335992395877838
0.1
ot cause analysis.</h <h3><psadepplonkiigesthiza al arg arg ane ar-sha hazpre="l ar ares-haza t-ar ar--hagg ag tare s 
0.3
ot cause analysis.</h <h3p>dtoptockanging-hazhi"e s ar ade-s are ar-s ar iag th hazt are por hagh "> ar pach 

9560
Projected Time for 10000 samples 6.76624983549118
0.256037175655365
0.1
ot cause analysis. wesping wis roves ingrovimg werker ilg orig wesker worker ilesion ghesping wesprovimging"><img wesk
0.3
ot cause analysis. wesping wils orsive imgrove-king"><irg wesking werkerongive-img"><img ant="oil rig worker werispros
0.5
ot cause analysis. wespons wis rons weskerk reotion workeron-reotong"><img al imes rles wess orimerin "lorimg werseve 
0.7
ot cause analysis. werepi= sels ong clarkg werker wive-pond werimprocsives alemiog weskeresiviggrovimg als corkerke</p
1
ot cause analysis.g"iter w-lmerl wolsp>s colire" worke ragecesping</aing="<i>s wessivl ghocke r-iver plovimprig<ig1 re
9720
Projected Time for 10000 samples 6.910624951124191
0.13595883548259735
0.1
ot cause analysis.<//ha><//>watpalers in tho-eres the-heaspre-heat ron-heas.</p>, </p>atspresion ton header1.jpg"/></s
0.3
ot cause analysis.<//ha><//>warkerses prewerkery inestandigg</>><//ha></p>aspans res in thae-ser thg"i></spa></