In [1]:
!pip install tika

Collecting tika
  Downloading https://files.pythonhosted.org/packages/96/07/244fbb9c74c0de8a3745cc9f3f496077a29f6418c7cbd90d68fd799574cb/tika-1.24.tar.gz
Building wheels for collected packages: tika
  Building wheel for tika (setup.py): started
  Building wheel for tika (setup.py): finished with status 'done'
  Created wheel for tika: filename=tika-1.24-cp37-none-any.whl size=32887 sha256=a72ae011cf8e1c50094090bb9aa57c1f23edd8ab17f4efd596590778a420639d
  Stored in directory: C:\Users\gabe5\AppData\Local\pip\Cache\wheels\73\9c\f5\0b1b738442fc2a2862bef95b908b374f8e80215550fb2a8975
Successfully built tika
Installing collected packages: tika
Successfully installed tika-1.24


In [1]:
from tika import parser

In [2]:
path = r'E:\Documents\Books\How emotions are made.pdf'
text = parser.from_file(path)
type(text)

2020-06-11 21:32:59,617 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


dict

In [None]:
print(text['content'])

In [4]:
book_text = text['content']
len(book_text)

1185260

In [5]:
book_text = book_text[11465:1185260 - 342509 ]
len(book_text)

831286

In [None]:
print(book_text)

In [9]:
# only about 70% of the text is text useful text from the book
831286/1185260
8312860 * .25


2078215.0

In [43]:
def load_doc(path, file = False):
    ''' Function to load data file '''
    
    with open(text,'rb') as file:
        data = file.read()
        return data
    
def split_data(data, size = .25, section = 'front', verbose = True):
    ''' returns a sample of the data 
    
        parameters
        -----------------------
        
        size: 
            this determines how much of the data to return ranging from 0-1
            
        section:
            specifies where to grab text from
                * front: starts from the begining
                * back: grabs from the back
                * random: its random 
                
    '''
    
    if size > 1 or size < 0:
        raise ValueError("Size must be a value between 1 and 0")
        
    amount = int(len(data) * size)
    section = section.lower()
    if section == 'front':
        split = data[:amount]
    elif section == 'back':
        # use negatives to go from the back
        split = data[-amount:-1]
    elif section == 'random':
        print("this isnt ready yet :(")
        pass
    
    if verbose:
        print(f'Size: {int(size*100)}% or {len(split)} words')
        print(f"section: {section}")
    
    # remove newline characters
    split = split.replace("\n","")
    return split

def clean(doc, verbose = True):
    import string
    # replace '--' with " "
    doc = doc.replace('--'," ")
    # split into a list (tokens)by whitespace
    tokens = doc.split()
    # remove punctuation
    table = str.maketrans('','',string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    # remove tokens that arent alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make all lower case
    tokens = [word.lower() for word in tokens]
    
    if verbose:
        print(f'Total Characters: {len(tokens)}')
        print(f'Unique Characters: {len(sorted(set(tokens)))}')
        
    return tokens
    
    
    

# Cleaning the text
We need to transform the raw text into a sequence of tokens or words that we can use as a source to train the model.

* Replace ‘–‘ with a white space so we can split words better.
* Split words based on white space.
* Remove all punctuation from words to reduce the vocabulary size (e.g. ‘What?’ becomes ‘What’).
* Remove all words that are not alphabetic to remove standalone punctuation tokens.
* Normalize all words to lowercase to reduce the vocabulary size.


In [52]:
doc = split_data(book_text,size = 1)
tokens = clean(doc)
# print(tokens[:150])

Size: 100% or 831286 words
section: front
Total Characters: 123945
Unique Characters: 10248
['introduction', 'the', 'twothousandyearold', 'assumptionon', 'december', 'the', 'deadliest', 'school', 'shooting', 'in', 'us', 'history', 'took', 'place', 'at', 'sandy', 'hook', 'elementary', 'school', 'in', 'newtown', 'connecticut', 'twentysix', 'people', 'inside', 'the', 'school', 'including', 'twenty', 'children', 'were', 'massacred', 'by', 'a', 'lone', 'gunman', 'several', 'weeks', 'after', 'this', 'horror', 'i', 'watched', 'the', 'governor', 'of', 'connecticut', 'dannel', 'malloy', 'give', 'his', 'annual', 'of', 'the', 'speech', 'on', 'television', 'he', 'spoke', 'in', 'a', 'strong', 'and', 'animated', 'voice', 'for', 'the', 'first', 'three', 'minutes', 'thanking', 'individuals', 'for', 'their', 'service', 'and', 'then', 'he', 'began', 'to', 'address', 'the', 'newtown', 'tragedywe', 'have', 'all', 'walked', 'a', 'very', 'long', 'and', 'very', 'dark', 'road', 'together', 'what', 'befell', '

# Create sequences
We can organize the long list of tokens into sequences of 50 input words and 1 output word.
That is, sequences of 51 words.
We can do this by iterating over the list of tokens from token 51 onwards and taking the prior 50 tokens as a sequence, then repeating this process to the end of the list of tokens.
We will transform the tokens into space-separated strings for later storage in a file.

In [68]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()

# generate sequences
# grabs a subset of the text 51 characters at a time
for index in range(length, len(tokens)):
    # select a sequence of tokens from 0 to 51, then 52 - 103, and so on
    seq = tokens[index - length: index]
    #print(index - length,index)
    
    # flatten the sequence (convert into a string)
    line = ' '.join(seq)
    # append the sequnce
    sequences.append(line)

In [77]:
print(f"Total sequences: {len(sequences)}")

Total sequences: 123894


# Save the sequences
Now we can save our sequences to a file so that we can load it in later. Im using a pickle file so that i can load the data back in as a list.

In [99]:
import pickle as pk
import os
def save_seq(seqs,filename,location: str):
    # join the location and the filename
    path = os.path.join(location,filename)
    with open(path,'wb') as file:
        pk.dump(seqs, file)
        print(f'Sequences saved to: {path}')

        
    # replace .seq with .txt  
    filename = filename.replace(filename[-4:],'.txt')
    print(filename)
    path = os.path.join(location,filename)
    with open(path,'w',encoding = 'UTF-8') as file:
        data = '\n'.join(seqs)
        file.write(data)

In [100]:
location = r'E:\Documents\My Projects\Text Generation\data'
filename = 'HEAM.seq'
save_seq(sequences,filename,location)

Sequences saved to: E:\Documents\My Projects\Text Generation\data\HEAM.seq
HEAM.txt
