In [7]:
import os
import re
import glob
from tqdm import tqdm
import codecs
from chardet import detect

# get file encoding type
def get_encoding_type(file):
    """Guess the text encoding of a file.

    The whole file is read as bytes and passed to ``detect``; the value
    stored under the ``'encoding'`` key of its result is returned.

    Args:
        file: Path of the file to inspect.

    Returns:
        Whatever ``detect`` reports under ``'encoding'`` for the file's bytes.
    """
    with open(file, 'rb') as handle:
        detection = detect(handle.read())
    return detection['encoding']


def correctTxtEncoding(filename, encoding_to='UTF-8'):
    """Re-encode a text file in place, normalising line endings to CRLF.

    The source encoding is obtained from ``get_encoding_type``. The file is
    decoded line by line and written to a temporary sibling file in
    ``encoding_to``; on success the temporary file replaces the original.

    Args:
        filename: Path of the text file to convert.
        encoding_to: Name of the target encoding.

    Returns:
        None. If decoding or encoding fails, a short message is printed, the
        original file is left as it was and the temporary file is removed.
    """
    from_codec = get_encoding_type(filename)
    temp_filename = filename[:-4]+"temp.txt"
    try:
        # newline='' disables newline translation on write, so the explicit
        # '\r\n' below is written verbatim on every platform (in translated
        # text mode Windows would turn it into '\r\r\n').
        with open(filename, 'r', encoding=from_codec) as fr, \
                open(temp_filename, 'w', encoding=encoding_to, newline='') as fw:
            for line in fr:
                # Strip only an actual newline: the last line of a file may
                # not have one, and slicing [:-1] would eat a real character.
                fw.write(line.rstrip('\n') + '\r\n')
        # Single-step swap of the re-encoded file over the original.
        os.replace(temp_filename, filename)
    except UnicodeDecodeError:
        print('Decode Error')
    except UnicodeEncodeError:
        print('Encode Error')
    finally:
        # After a successful replace the temp file no longer exists; after a
        # failure it is a partial file that a later '*.txt' glob would pick up.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)

def clean_text(string):
    """Strip line-end whitespace and page-number footers from a text.

    Two passes are applied, both in multi-line mode:

    1. One whitespace character sitting directly before an end of line is
       removed. Because a newline is itself whitespace, this also removes the
       newline of a blank line and a newline at the very end of the text.
    2. Anything from ``page``/``Page``/``PAGE`` followed by ``|`` and a number
       (with whitespace around the bar) up to the end of that line is replaced
       by a single space.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    # Raw strings: '\s' and '\|' are not valid Python string escapes and
    # trigger warnings when written in ordinary string literals.
    pattern = r'(page|PAGE|Page)(\s+\|\s+)([0-9]+)(.*)$'
    output_cleaned = re.sub(r'\s$', '', string, flags=re.MULTILINE)
    p = re.compile(pattern, re.MULTILINE)
    output_cleaned = p.sub(" ", output_cleaned)
    return output_cleaned

def merge_texts(texts):
    """Concatenate several text files into one cleaned string.

    Each file is first passed through ``correctTxtEncoding``, then read as
    UTF-8. Every line contributes its content followed by a single ``'\\n'``.
    The combined text is finally passed through ``clean_text``.

    Args:
        texts: Iterable of file paths to merge, in order.

    Returns:
        The result of ``clean_text`` applied to the merged text.
    """
    chunks = []
    for text in tqdm(texts):
        correctTxtEncoding(text)
        with open(text,'r',encoding="utf8") as f:
            for line in f:
                # Only the last line of a file can lack a newline; add one
                # instead of slicing off what would be a real character.
                chunks.append(line if line.endswith('\n') else line + '\n')
    # Join once at the end rather than growing a large string line by line.
    merged_text = ''.join(chunks)
    merged_text_cleaned = clean_text(merged_text)
    return merged_text_cleaned

def get_all_txt_files():
    """List every ``.txt`` file below the ``Data`` directory.

    ``Data`` is resolved two directory levels above the current working
    directory, and is searched recursively.

    Returns:
        Sorted list of matching absolute paths (empty if nothing matches).
    """
    data_root = os.path.join(os.path.abspath(os.path.join("../", os.pardir)), 'Data')
    # os.path.join keeps the pattern valid on any OS; the original hard-coded
    # backslashes only worked on Windows.
    pattern = os.path.join(data_root, '**', '*.txt')
    # glob returns matches in arbitrary order; sort for reproducible runs.
    return sorted(glob.glob(pattern, recursive=True))

def get_files_in_data_folder(folder):
    """List the ``.txt`` files directly inside one sub-folder of ``Data``.

    ``Data`` is resolved two directory levels above the current working
    directory.

    Args:
        folder: Name of the sub-folder of ``Data`` to look in.

    Returns:
        Sorted list of matching absolute paths (empty if nothing matches).
    """
    data_root = os.path.join(os.path.abspath(os.path.join("../", os.pardir)), 'Data')
    # os.path.join keeps the pattern valid on any OS; the original hard-coded
    # backslashes only worked on Windows.
    pattern = os.path.join(data_root, folder, '*.txt')
    # glob returns matches in arbitrary order; sort for reproducible runs.
    return sorted(glob.glob(pattern, recursive=True))

def get_files_by_author(author):
    """List the ``.txt`` files under ``Data`` whose name ends in ``_<author>.txt``.

    ``Data`` is resolved two directory levels above the current working
    directory, and is searched recursively.

    Args:
        author: Author suffix to match in the file name.

    Returns:
        Sorted list of matching absolute paths (empty if nothing matches).
    """
    data_root = os.path.join(os.path.abspath(os.path.join("../", os.pardir)), 'Data')
    # os.path.join keeps the pattern valid on any OS; the original hard-coded
    # backslashes only worked on Windows.
    pattern = os.path.join(data_root, '**', '*_' + author + '.txt')
    # glob returns matches in arbitrary order; sort for reproducible runs.
    return sorted(glob.glob(pattern, recursive=True))

def _write_split_file(path, lines):
    """Write every entry of ``lines`` followed by a newline to ``path``; return the count."""
    with open(path, "w") as f:
        for line in lines:
            f.write(str(line))
            f.write("\n")
    return len(lines)


def get_train_test_validation(txt_file, train=0.70, test=0.20, val=0.10,
                              source_file="output_cleaned.txt"):
    """Split the lines of a text file into train / test / validation files.

    Lines are assigned by position: those with index up to ``len * train``
    go to the training set, those after that and below
    ``len * (train + test)`` go to the test set, and the rest go to the
    validation set. The three sets are written to ``txt_file + '_train'``,
    ``txt_file + '_test.txt'`` and ``txt_file + '_val.txt'``, and the number of
    lines in each is printed.

    Args:
        txt_file: Prefix used to build the three output file names.
        train: Fraction of lines assigned to the training set.
        test: Fraction of lines assigned to the test set.
        val: Not used by the split; validation receives whatever remains.
        source_file: File whose lines are split. Defaults to the previously
            hard-coded ``"output_cleaned.txt"``.

    Returns:
        None.
    """
    with open(source_file,'r') as f:
        file_input=f.readlines()

    train_doc = []
    test_doc = []
    val_doc = []
    total = len(file_input)
    for cnt, line in enumerate(file_input):
        if cnt <= total*train:
            train_doc.append(line)
        elif cnt < total*(train+test):
            test_doc.append(line)
        else:
            val_doc.append(line)

    # NOTE(review): lines from readlines() keep their own '\n', so each entry
    # is followed by a blank line in the output files. Kept as-is in case
    # downstream consumers rely on that layout - confirm.
    # NOTE(review): the training file name has no '.txt' suffix, unlike the
    # other two. Kept as-is so existing readers of that file keep working.
    print("Training lines:\t", _write_split_file(txt_file+'_train', train_doc))

    # The test file must contain the test split (it previously received a
    # second copy of the training lines).
    print("Testing lines:\t", _write_split_file(txt_file+'_test.txt', test_doc))

    print("Validation lines:\t", _write_split_file(txt_file+'_val.txt', val_doc))

In [4]:
# Collect every text file under Data (kept for later inspection).
all_text_files = get_all_txt_files()

# Merge and clean each corpus in turn, printing the first 2000 characters of
# the result as a quick sanity check.
harry_potter_texts = merge_texts(get_files_in_data_folder("Harry_Potter"))
print("Cleaned Harry Potter Text:\n\n\n", harry_potter_texts[:2000], "....\n\n", sep="")

stephen_king_texts = merge_texts(get_files_by_author("Stephen_King"))
print("Cleaned Stephen Text:\n\n\n", stephen_king_texts[:2000], "....\n\n", sep="")

horror_movie_transcripts = merge_texts(get_files_in_data_folder("Horror_Movie_Transcripts"))
print("Cleaned Horror Movie Transcripts:\n\n\n", horror_movie_transcripts[:2000], "....\n\n", sep="")

public_domain_texts = merge_texts(get_files_in_data_folder("Public_Domain_Horror_Novels"))
print("Cleaned Public Domain Horror Novels:\n\n\n", public_domain_texts[:2000], "....\n\n", sep="")

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [02:35<00:00, 22.22s/it]
  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

Cleaned Harry Potter Text:


/
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive,
were proud to say that they were perfectly normal,
thank you very much. They were the last people you’d
expect to be involved in anything strange or
mysterious, because they just didn’t hold with such
nonsense.
Mr. Dursley was the director of a firm called
Grunnings, which made drills. He was a big, beefy
man with hardly any neck, although he did have a
very large mustache. Mrs. Dursley was thin and
blonde and had nearly twice the usual amount of
neck, which came in very useful as she spent so
much of her time craning over garden fences, spying
on the neighbors. The Dursley s had a small son
called Dudley and in their opinion there was no finer
boy anywhere.
The Dursleys had everything they wanted, but they
also had a secret, and their greatest fear was that
somebody would discover it. They didn’t think they
could bear it if anyone found out about the Potters.
Mrs. Potter was Mrs. Dursl

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [01:18<00:00,  9.87s/it]
  7%|█████▉                                                                             | 2/28 [00:00<00:01, 15.20it/s]

Cleaned Stephen Text:


To an extent, this novel deals with the legal aspects of child custody in the State of Maine. I asked for help in understanding this subject from my friend Warren Silver, who is a fine attorney. Warren guided me carefully, and along the way he also told me about a quaint old device called the Stenomask, which I immediately appropriated for my own fell purposes. If I've made procedural mistakes in the story which follows, blame me, not my legal resource. Warren also asked me--rather plaintively ¡ª if I could maybe put a 'good' lawyer in my book. All I can say is that I did my best in that regard.
Thanks to my son Owen for technical support in Woodstock, New York, and to my friend (and fellow Rock Bottom Remainder) Ridley Pearson for technical support in Ketchum, Idaho. Thanks to Pam Dorman for her sympathetic and perceptive reading of the first draft. Thanks to Chuck Verrill for a monumental editing job--your personal best, Chuck. Thanks to Susan Moldow, Nan Grah

100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:19<00:00,  1.41it/s]
  0%|                                                                                           | 0/46 [00:00<?, ?it/s]

Cleaned Horror Movie Transcripts:





Aliens vs. Predator
Dark Horse Prod. Presents
ALIENS
Vs.
PREDATOR
Screenplay by Peter Briggs
EXT.  DEEP SPACE
We OPEN on TOTAL BLACKNESS, a sea of stars spread across the infinite depths
of space.  As the TITLES ROLL, we notice that three of these specks seem to
be moving; one of them picking up acceleration and racing toward us.  Our
perspective changes, and we catch a quick glimpse as it HURTLES past, and
into the gravitational pull of a large brownish planet.  Kicking up SPARKS of
FRICTION as it hits atmosphere.  It seems to be manmade.  Or at least
artificial.
EXT.  PLANET SURFACE - DAY
The planet is dead, barren.  Death Valley on a grand scale.  We watch the
object plummet through the wispy cloud-cover, emitting a few last burning
embers before falling to ground way-off in the distance.  A BOOMING ECHO
resonates across the dusty plains, before settling back into an eerie
silence.
EXT.  FISSURE CANYON - DAY
We're looking into a deep gorge, dar

100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [02:11<00:00,  2.86s/it]


Cleaned Public Domain Horror Novels:


The Project Gutenberg EBook of Animal Ghosts, by Elliott O'Donnell
This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
Title: Animal Ghosts
       Or, Animal Hauntings and the Hereafter
Author: Elliott O'Donnell
Release Date: April 23, 2006 [EBook #18233]
Language: English
*** START OF THIS PROJECT GUTENBERG EBOOK ANIMAL GHOSTS ***
Produced by Barbara Tozier, Graeme Mackreth and the Online
Distributed Proofreading Team at http://www.pgdp.net
ANIMAL GHOSTS
OR,
ANIMAL HAUNTINGS AND THE HEREAFTER
BY ELLIOTT O'DONNELL
AUTHOR OF
"THE SORCERY CLUB," "WERWOLVES," "BYWAYS OF GHOSTLAND," "SCOTTISH
GHOSTS," "HAUNTED HOUSES OF LONDON," "HAUNTED HOUSES OF ENGLAND AND
WALES," "DREAMS AND THEIR MEANINGS," "FOR SATAN'S SAKE," "THE UNKNOWN
DEPTHS," "DINEVAH THE BEAUTIFU