## Gutenberg API

Importing and abridging texts from Project Gutenberg using the **gutenbergpy** library

In [5]:
import gutenbergpy.textget

In [6]:
#in this dictionary, use the Project Gutenberg id as the key
#and enter the start-line and end-line numbers (keeps only the text that is found inside these lines)
vols = {}
vols[996] = {'start-line-num': 4186, 'end-line-num': 85848}   #will cut all text found before this line (in the raw text)
vols[52806] = {'start-line-num': 454, 'end-line-num': 10333, 'volno': 1, 'numvols': 3}
vols[52938] = {'start-line-num': 180, 'end-line-num': 9791, 'volno': 2, 'numvols': 3}
vols[53081] = {'start-line-num': 153, 'end-line-num': 11403, 'volno': 3, 'numvols': 3}
vols[53489] = {'start-line-num': 1640, 'end-line-num': 5295}

In [None]:
import re

#Create regular expressions to identify passages to be removed from text

remove_passages_regexs = ["\[Siden.*\]", #ids all passages that begin with "[Siden" (even if starts on one line and continues onto another)
                          "\[Illus.*\]"]

fn_regexs = [" {1,3}\[[0-9]{1,3}\]",     #identify numbered footnotes
             " {1,3}\[[A-Z]{1,2}\]"]      #identify lettered footnotes

cont_fn_regexs = [" {1,3}\w{1,}"]        #identify text that is continuation of footnote text

In [9]:
for key, value in vols.items():
    #get text
    print(key)
    raw_book = gutenbergpy.textget.get_text_by_id(key)

    #split into lines and remove header and footer lines
    raw_lines = re.split("[\r\n]", raw_book.decode())
    print(raw_lines[:10])
    print("line count before abridging: ", len(raw_lines))
    bodylines = raw_lines[value['start-line-num']: value['end-line-num'] ]
    print("line count after abridging: ", len(bodylines))

    #use remove_lines_regexs to remove lines that match these regular expressions
    skiplines_index = []
    fn_open = False
    for i, line in enumerate(bodylines):
        if fn_open:
            fn_match = False
            if line.strip() == "":
                skiplines_index.append(i)
            else:
                for cont_fn_regex in cont_fn_regexs:
                    if re.match(cont_fn_regex, line):
                        fn_match = True
                if fn_match:
                    skiplines_index.append(i)
                else:
                    fn_open = False
        else:
            fn_match2 = False
            for fn_regex in fn_regexs:
                if re.match(fn_regex, line):
                    fn_match2 = True
            if fn_match2:
                skiplines_index.append(i)
                fn_open = True
    print(skiplines_index)
    omit_lines = [line for i, line in enumerate(bodylines) if i in skiplines_index]
    new_lines = [line for i, line in enumerate(bodylines) if i not in skiplines_index]
    if len(omit_lines) > 0:
        print("lines omitted:", omit_lines)
        print(f"lines cut = {len(omit_lines)} and {len(bodylines) - len(new_lines)}") 

    #rejoin lines as string and remove passages that match remove_passages_regexs
    newbodystring = '\n'.join(new_lines)
    print("char count before passage removal:", len(newbodystring))
    for passage_regex in remove_passages_regexs: 
        print(passage_regex)
        newbodystring = re.sub(passage_regex, "", newbodystring)
    print("char count after passage removal:", len(newbodystring))

    #add new body text to books dict
    value['bodystring'] = newbodystring
    print("\n***\n\n")

    

996
['The Project Gutenberg eBook of The History of Don Quixote, by Miguel de Cervantes', '', '', '', 'This eBook is for the use of anyone anywhere in the United States and', '', 'most other parts of the world at no cost and with almost no restrictions', '', 'whatsoever. You may copy it, give it away or re-use it under the terms', '']
line count before abridging:  86559
line count after abridging:  81662
[]
char count before passage removal: 2231828
\[Siden.*\]
\[Illus.*\]
char count after passage removal: 2231828
52806
['The Project Gutenberg EBook of The life and adventures of Guzman', '', "D'Alfarache, or the Spanish Rogue, by Alain-René Le Sage", '', '', '', 'This eBook is for the use of anyone anywhere at no cost and with', '', 'almost no restrictions whatsoever.  You may copy it, give it away or', '']
line count before abridging:  11089
line count after abridging:  9879
[]
char count before passage removal: 312806
\[Siden.*\]
\[Illus.*\]
char count after passage removal: 312806
5

Export texts after combining separate volumes of same book

In [12]:
books = {}
for key, value in vols.items():
    print(key)
    if "volno" in value.keys():
        print("multivolume text")
        print(f"volume {value['volno']} of {value['numvols']}")
        if value['volno'] == 1:             #i.e. vol 1 / 3
            alltext = value['bodystring']
            volnumbers = [key]
            print(f"this volume is {len(alltext)} chars long.")
        elif value['volno'] == value['numvols']: #i.e. vol. 3/3
            volnumbers.append(key)
            print(f"this volume is {len(value['bodystring'])} chars long.")
            alltext += "\n\n" + value['bodystring']
            with open(f"{'_'.join([str(v) for v in volnumbers])}_abridged.txt", 'w', encoding = 'utf-8') as f:
                f.write(alltext)
            print(f"multivolume text is {len(alltext)} chars long.")
            alltext = ""
        else:                                   #ie. vol 2/3 or 3/4
            print(f"this volume is {len(value['bodystring'])} chars long.")
            alltext += "\n\n" + value['bodystring']
            volnumbers.append(key)
    else:
        alltext = value['bodystring']
        print(f"single volume text is {len(alltext)} chars long.")
        with open(f"{key}_abridged.txt", 'w', encoding = 'utf-8') as f:
            f.write(alltext)

    print()

996
single volume text is 2231828 chars long.

52806
multivolume text
volume 1 of 3
this volume is 312806 chars long.

52938
multivolume text
volume 2 of 3
this volume is 302814 chars long.

53081
multivolume text
volume 3 of 3
this volume is 357297 chars long.
multivolume text is 972921 chars long.

53489
single volume text is 96461 chars long.

