# Preprocessing
* Convert letters to lowercase
* convert numbers into words (excluding timestamps)
* remove punctuation
* remove white space
* remove stop words

In [62]:
import re
import string
import numpy as np 
import pandas as pd
import nltk as nlp
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import spacy # for Part Of Speech  tagging
import tqdm
import os
import inflect
p = inflect.engine()

In [36]:
path = r'E:\Documents\My Projects\Text Scraper\data'

In [4]:
def get_files(path, names_only = False):
    ''' Returns a list of filepaths from a directory'''
    files = list()
    for file in os.listdir(path):
        #print(file)
        # append the fullpath
        if names_only:
            files.append(file)
        else:
            files.append(os.path.join(path,file))
    
    return files

def read_files(filepaths):
    ''' Reads files in a path and returns as a dictionary'''
    text = dict()
    for index, file in enumerate(filepaths):
        with open(file, 'r', encoding = "UTF-8") as data:
            content = data.read()
        
        text[index] = content
        
    print(f'Done! Corpus size: {len(text)} documents')
    return text


    

In [5]:
files = get_files(path)

In [6]:
corpus = read_files(files)

Done! Corpus size: 107 documents


In [7]:
print(len(corpus[0]))

81961


In [8]:
def prnt(value):
    ''' print a number with commas '''
    
    value = str(value)
    value_flipped = list(value[::-1])
#     print(value_flipped)
    pos = list()
    for i in range(3,len(value_flipped)+1,4):
        pos.append(i)
        
    for i in pos:
        value_flipped.insert(i,".")
        
    print("".join(value_flipped[::-1]))
               
total = 0
for index, text in corpus.items():
    total += len(corpus[index])


prnt(total)    

9.085.542


In [58]:
sample = corpus
print(sample)

0:00:00 Sean Carroll: Hello, everyone. Welcome to the Mindscape Podcast. I am your host Sean Carroll and if you’re like me, you remember maybe in high school, maybe in college in some psychology course being taught about Abraham Maslow’s hierarchy of needs. There was this pyramid diagram and at the bottom of it, there were your basic physiological needs: Food, shelter, things like that. Then you built up through other higher level psychological needs, until at the top, you reach self-actualization. So today’s guest, Scott Barry Kaufman, is a psychologist who is proposing that we update Maslow’s hierarchy of needs. He’s done two things. 0:00:37 SC: Number one, he’s actually dug into many of the writings that Maslow himself did and learned things like Maslow himself never drew a pyramid and Maslow himself had a lot of ideas that go well beyond the famous hierarchy of needs and the second thing that Scott does is propose an entirely new metaphor based on a different kind of hierarchy. He 

In [139]:
def remove_punctuation(text):
    text = re.sub(r'[^\w\s]','',text)
    text = text.replace('\n',"")
    text = text.replace('\t',"")
    return text

def convert_numbers(text, verbose = False):
    # convert corpus into a list so we can look at whole words instead of characters
    text = text.split()
    
    # look at each word    
    for index,word in enumerate(text):
        # handles for money
        if word.startswith('$'):
            if verbose:
                print(f'{text[index]}')
            money = word.split('$')
            #print(f'amount: {money}\n')
            
            # make sures the input text has digits 
            try:
                # hanldes for commas in the number eg. 34,543
                if len(money[1]) > 3:
                    # substitute anythin thats not a digit with whitespace
                    amount = int(re.sub(r'[^0-9]','',money[1]))
                    #print(amount)
                    
                # if the number is less than 1,000
                else:
                    amount = int(money[1])
            except ValueError:
                print(f'{money[1]}. This is not an appropriate value\n')
                continue
            
            # converts the number to text
            in_text = p.number_to_words(amount, andword = '', zero = 'oh') + ' dollars'
            if verbose:
                print(f'Amount in text: {in_text}\n')
                
            text[index] = in_text
        
        # standard case of just coming across a word
        if word.isnumeric():
            
                
            in_text = p.number_to_words(word, andword = '', zero = 'oh') 
            old_word = text[index]
            text[index] = in_text
            
            if verbose:
                print(f'{old_word}')
                print(f'Number in text: {in_text}\n')
            
    # convert the list back into a string
    text = " ".join(text)
    return text
            
            

def lemmeatize(text):
    pass

def stem(text):
    pass

def remove_timestamps(text):
    text = re.sub(r'\d{1}:\d{2}:\d{2}','________',text)
    return text

def tokenize(text):
    pass

def clean(doc, verbose = True):
    import string
    # replace '--' with " "
    doc = doc.replace('--'," ")
    
    # convert numbers into words
    doc = convert_numbers(doc)
    # remove timestamps
    doc = remove_timestamps(doc)
    
    # split into a list (tokens)by whitespace
    tokens = doc.split()
    # remove punctuation
    table = str.maketrans('','',string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    # remove tokens that arent alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make all lower case
    tokens = [word.lower() for word in tokens]
    # get the sequences also
    seqs = sequences(tokens)
    # vocab
    vocab = sorted(set(tokens))
    
    if verbose:
        print(f'Total Characters: {len(tokens)}')
        print(f'Unique Characters: {len(sorted(set(tokens)))}')
        print(f'Total Sequences: {len(seqs)}')

        
    return tokens, vocab, seqs

def sequences(tokens, length = 50):
    # organize into sequences of tokens
    length += 1
    sequences = list()

    # generate sequences
    # grabs a subset of the text 51 characters at a time
    for index in range(length, len(tokens)):
        # select a sequence of tokens from 0 to 51, then 52 - 103, and so on
        seq = tokens[index - length: index]
        #print(index - length,index)

        # flatten the sequence (convert into a string)
        line = ' '.join(seq)
        # append the sequnce
        sequences.append(line)
    
    return sequences

def save_tokens(tokens, filename, location):
    # join the location and the filename
    path = os.path.join(location,filename)
    with open(path,'w',encoding = 'UTF-8') as file:
        for i in range(len(tokens)):
            if i%20 == 0:
                file.write('\n')
            else:
                file.write(tokens[i]+' ')
        print(f'Tokens saved to: {path}') 
        
def save_seqs(seqs, filename, location):
    
    path = os.path.join(location,filename)
    with open(path,'w',encoding = 'UTF-8') as file:
        data = '\n'.join(seqs)
        file.write(data) 
        
    print(f'Sequences saved to: {path}')
        
def save_all(tokens, vocab, seqs, filenames, location):
    ''' save the tokens, vocab and sequences to a file.
    
    filenames is a list where 
    filename[0] = tokens
    filename[1] = vocab
    filename[2] = sequences
    
    '''
    save_tokens(tokens, filenames[0], location)
    save_tokens(vocab, filenames[1], location)
    save_seqs(seqs, filenames[2],location)
    
    
    
#remove_punctuation(sample)
tokens, vocab, seqs = clean(sample)
names = ['tokens.txt','vocab.txt','seqs.txt']
location = r'E:\Documents\My Projects\Text Generation\data\Mindscape articles'
save_all(tokens, vocab, seqs, names, location)

Total Characters: 604
Unique Characters: 279
Total Sequences: 553
Tokens saved to: E:\Documents\My Projects\Text Generation\data\tokens.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\vocab.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\seqs.txt


In [140]:
# save each article in the corpus
for index, article in corpus.items():
    print(f'Article {index}')
    # get the tokens, vocab and sequences
    tokens, vocab, seqs = clean(article)
    # change the filename based on index 
    names = [f'tokens_{str(index)}.txt',f'vocab_{str(index)}.txt',f'seqs_{str(index)}.txt']
    location = r'E:\Documents\My Projects\Text Generation\data\Mindscape articles'
    # save everything for this article
    save_all(tokens, vocab, seqs, names, location)
    print()  

Article 0
Total Characters: 13987
Unique Characters: 1815
Total Sequences: 13936
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_0.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_0.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_0.txt

Article 1
Total Characters: 10201
Unique Characters: 1521
Total Sequences: 10150
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_1.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_1.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_1.txt

Article 2
Total Characters: 16602
Unique Characters: 2079
Total Sequences: 16551
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_2.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_2.

Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_23.txt

Article 24
Total Characters: 18281
Unique Characters: 2404
Total Sequences: 18230
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_24.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_24.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_24.txt

Article 25
Total Characters: 17945
Unique Characters: 2151
Total Sequences: 17894
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_25.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_25.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_25.txt

Article 26
Total Characters: 18443
Unique Characters: 2720
Total Sequences: 18392
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape artic

Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_45.txt

Article 46
Total Characters: 16106
Unique Characters: 2009
Total Sequences: 16055
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_46.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_46.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_46.txt

Article 47
Total Characters: 12363
Unique Characters: 1908
Total Sequences: 12312
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_47.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_47.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_47.txt

Article 48
Total Characters: 14766
Unique Characters: 2231
Total Sequences: 14715
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape artic

Unique Characters: 2169
Total Sequences: 15759
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_68.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_68.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_68.txt

Article 69
1,. This is not an appropriate value

1,. This is not an appropriate value

Total Characters: 16537
Unique Characters: 2107
Total Sequences: 16486
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_69.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_69.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_69.txt

Article 70
Total Characters: 13433
Unique Characters: 1928
Total Sequences: 13382
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_70.txt
Tokens saved to: E:\Documents\My Projec

Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_91.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_91.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_91.txt

Article 92
Total Characters: 12088
Unique Characters: 2081
Total Sequences: 12037
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_92.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_92.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\seqs_92.txt

Article 93
Total Characters: 14680
Unique Characters: 2087
Total Sequences: 14629
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\tokens_93.txt
Tokens saved to: E:\Documents\My Projects\Text Generation\data\Mindscape articles\vocab_93.txt
Sequences saved to: E:\Documents\My Projects\Text Generation\data\M