'''
This program selects future-oriented text and cleans text
#Date: Dec 3 2019
#Update: Mar 25, 2019
#update: June 27, 2020
#Author: Carly Knight
'''

### Import packages

In [118]:
import os
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from xml import etree
import csv
import re
import dateutil.parser as parser
from enchant.tokenize import get_tokenizer
import enchant
import pandas as pd
from itertools import chain
import random

## get liwc list


In [119]:
# Load the LIWC 2015 dictionary; the code below reads its 'var' and 'word' columns.
liwclist = "/Users/carlyknight/Dropbox/PROJECTS/ConceptionsofRisk/data/LIWC/liwc2015dict.csv"
lw = pd.read_csv(liwclist)

# Keep only the rows whose category ('var') is FocusFuture.
future = lw.loc[lw['var'] == 'FocusFuture']
futurelist = future['word'].tolist()

# Entries containing "*" are stems (the asterisks are stripped off);
# everything else is treated as a whole word.
futurelist_stem = []
futurelist_word = []
for entry in futurelist:
    if "*" in entry:
        futurelist_stem.append(entry.strip('*'))
    else:
        futurelist_word.append(entry)

# Project-specific additions to, and removals from, the whole-word list.
futurelist_word += [
    "Future", "will", "is heading", "are heading", "aim", "could", "doubt",
    "has begun", "intend", "intends", "intending", "is expected",
    "is likely to", "it is hoped", "optimism", "pessimism", "uncertainty",
    "projected", "certainty", "as yet", "not yet", "may", "outlook",
    "should", "risk", "risky", "risk-taking", "is planned", "is planning",
    "is scheduled", "a planned", "the planned", "optimistic", "pessimistic",
    "next generation", "well positioned", "next-generation",
    "well-positioned", "is scheduled", "are scheduled",
]
for dropped in ("plan", "plans", "planner", "planning"):
    futurelist_word.remove(dropped)

# Project-specific additions to, and removals from, the stem list.
futurelist_stem.append("confident")
for dropped in ("headin", "prepar"):
    futurelist_stem.remove(dropped)

# Phrases that cancel a future-word match when they appear around it.
notwordlist = [
    "shareholders will", "Shareholders will", "will be of interest",
    "good will", "GOOD WILL", "Good will", "you will", "was expected",
    "were expected", "TABLE OF CONTENTS", "table of contents",
    "should have", "could have", "would have", "should not have",
    "could not have", "would not have", "may have", "accounting standards",
]

# Headings that mark the start of the financial-statement notes.
financialstatement = [
    "Notes to Consolidated Financial Statements",
    "Notes to Financial Statements",
]

# Paragraphs containing any of these phrases are excluded.
phrasetoexclude = financialstatement + [
    'financial accounting',
    "the following table",
    "transfer agent",
    "annual meeting of shareholders",
    "annual meeting of stockholders",
    "next annual meeting",
]

# Test examples: one (initially empty) container list per example term.
example_list = ["possible"]
example_container_lists = [[] for _ in example_list]

print(futurelist_word)
print(futurelist_stem)
print(notwordlist)
# Other potential LIWC categories:
# Risk, Money, Quant, Verb, Tentat, Certain, Time

['ahead', 'anticipation', 'approaching', 'attainable', 'coming', 'eventual', 'eventually', 'fate', 'fated', 'fates', 'feasible', 'finna', 'fixin', "fixin'", 'foresight', 'forseeable', 'forthcoming', 'going', 'gon', 'gonna', 'gotta', 'gunna', "he'll", 'henceforth', 'hope', 'hopeful', 'hopefully', 'hoping', "i'll", "i'mma", 'ima', 'imma', 'imminent', 'impending', "it'll", 'itll', 'looming', 'may', 'might', 'obtainable', 'oncoming', 'onward', 'pray', 'prayed', 'praying', 'promising', 'shall', "shan't", 'shant', "she'll", 'someday', 'sometime', 'soon', 'sooner', 'soonest', "that'll", 'thatll', 'Future', 'will', 'is heading', 'are heading', 'aim', 'could', 'doubt', 'has begun', 'intend', 'intends', 'intending', 'is expected', 'is likely to', 'it is hoped', 'optimism', 'pessimism', 'uncertainty', 'projected', 'certainty', 'as yet', 'not yet', 'may', 'outlook', 'should', 'risk', 'risky', 'risk-taking', 'is planned', 'is planning', 'is scheduled', 'a planned', 'the planned', 'optimistic', 'pes

# Prep + helper functions

### Input and Output Data Location

In [120]:
# Folder scanned for the .zip archives of XML annual reports.
directory = "/Users/carlyknight/Documents/Data/Annual Report/XML/"
# Output folder for the metadata CSV and the per-document .txt files.
results= "/Users/carlyknight/Documents/Data/Annual Report/report_paragraphs/future_texts/"
# Output folder for the per-paragraph .txt files.
results_paragraphs = "/Users/carlyknight/Documents/Data/Annual Report/report_paragraphs/future_texts_paragraphs/"
# Folder for keyword example dumps (only referenced from commented-out code in this file).
test_files= "/Users/carlyknight/Dropbox/PROJECTS/ConceptionsofRisk/data/keyword_test_files/"
# Output folder for the random paragraph sample CSV.
test_paragraphs = "/Users/carlyknight/Dropbox/PROJECTS/Corporate_Futures_for_Hesu/paragraphs/"

### Helper function: spell checker

In [121]:
#function to calculate the percent
def calculate_percent_correct(text):
    """Return the share of tokens in *text* accepted by the spell checker.

    The text is tokenized with the module-level ``tknzr``; the first element
    of each item it yields is checked against the module-level dictionary
    ``d2``. The result is ``correct / total`` as a float, or ``0`` when the
    text produces no tokens.
    """
    checked = 0
    correct = 0
    for token in tknzr(text):
        checked += 1
        if d2.check(token[0]):
            correct += 1
    if checked == 0:
        return 0
    return correct * 1.0 / checked

'''INITIATE SPELLCHECKER'''
# en_US dictionary that calculate_percent_correct checks every token against.
d2 = enchant.DictWithPWL("en_US")
# en_US tokenizer; calculate_percent_correct uses element [0] of each item it yields as the word.
tknzr = get_tokenizer("en_US")

### Helper function: clean text

In [122]:
def get_fulltext(root):
    """Return the text of the ``FullText`` child of *root* as a string.

    Args:
        root: an ElementTree element for one document.

    Returns:
        The element's text, or ``""`` when the ``FullText`` element is
        missing or present but empty. An empty element has ``text`` set to
        ``None``, which would otherwise break the string functions applied
        to the result later.
    """
    full = root.find('FullText')
    if full is None:
        return ""
    return full.text or ""

In [123]:
def clean_punction(fulltext):
    """Strip superfluous punctuation from *fulltext* and return the result.

    Steps, in order:
      1. delete every character in the punctuation class below;
      2. rewrite the abbreviation "No." as "No";
      3. delete runs of periods, two at a time (optional whitespace after
         each period is consumed too), so a run of odd length leaves one
         period behind and a run of even length leaves none.

    Args:
        fulltext: the document text as a string.

    Returns:
        The cleaned string.
    """
    # NOTE(review): inside this class `$-*` is a character RANGE, so besides
    # `$` and `*` it also deletes % & ' ( ) -- confirm that is intended.
    # The class also removes backslashes, so no separate replace is needed.
    fulltext = re.sub(r'[;:$-*><=|^·\,\\-]','',fulltext)
    # str.replace returns a new string; the result must be assigned back
    # (it used to be discarded, so "No." was never rewritten).
    fulltext = fulltext.replace(u'No.', u'No')
    # Delete series of periods (see step 3 above).
    fulltext = re.sub(r'(\.\s*\.\s*)+', '', fulltext)
    return fulltext

### Regex Compile

In [124]:
# Every term is passed through re.escape before being joined into an
# alternation, so a term containing a regex metacharacter is matched
# literally instead of altering (or breaking) the pattern.
# The (?<=\W) lookbehind requires a non-word character immediately before the
# term, so a term at the very start of the text is not matched.

# Whole words: must end on a word boundary.
pattern_word = r"(?<=\W)(%s)\b" % "|".join(re.escape(term) for term in futurelist_word)
regex_word = re.compile(pattern_word)

# Stems: no trailing boundary, so any continuation of the stem matches.
pattern_stem = r"(?<=\W)(%s)" % "|".join(re.escape(term) for term in futurelist_stem)
regex_stem = re.compile(pattern_stem)

# Financial-statement headings (case-sensitive).
pattern_finance = r"(?<=\W)(%s)" % "|".join(re.escape(term) for term in financialstatement)
regex_financestatement = re.compile(pattern_finance)

#note: we don't care about case sensitivity for the removals
pattern_remove = r"(?<=\W)(%s)" % "|".join(re.escape(term) for term in phrasetoexclude)
regex_remove = re.compile(pattern_remove, re.IGNORECASE)

### Helper function: Wordsearch functions

In [125]:
#identify paragraphs
def paragraph_split(fulltext):
    """Split *fulltext* into paragraph-like pieces.

    The text is cut at any of: a dotted leader (three or more periods
    followed by two or more), three or more underscores, three or more
    hyphens, a number surrounded by runs of three or more whitespace
    characters, or a run of capitals/whitespace surrounded by five or more
    whitespace characters.

    Because each alternative is a capturing group, ``re.split`` also returns
    the matched separator text; those separators are kept as items of their
    own. ``None`` placeholders (groups that did not take part in a match) and
    whitespace-only pieces are dropped.

    Args:
        fulltext: the document text as a string.

    Returns:
        A list of non-blank strings in document order.
    """
    # Raw string: the pattern is full of backslash escapes that are not valid
    # Python string escapes.
    splits = re.split(r'(\.{3,}[-_\s]*\.{2,}[-_\s]*)|(\_{3,})|(\-{3,})|(\s{3,}[0-9]{1,}\s{3,})|(\s{5,}[A-Z\s]+\s{5,})', fulltext)
    # Truthiness test instead of `is not ''`: identity comparison against a
    # string literal is implementation-dependent.
    return [piece for piece in splits if piece is not None and piece.strip()]

#combine paragraphs if they are too short
def paragraph_combine(si, length):
    """Yield pieces of *si* merged until each is at least *length* long.

    Consecutive strings are joined with a single space until the merged
    string has ``len() >= length`` (characters, not words); that string is
    then yielded and a fresh one is started.

    A trailing merged string that never reaches *length* before the input
    runs out is discarded, not yielded.

    Args:
        si: iterable of strings.
        length: minimum character length of every yielded string.
    """
    pending = None
    for piece in si:
        pending = piece if pending is None else pending + ' ' + piece
        if len(pending) >= length:
            yield pending
            pending = None
    # Whatever is left in `pending` here is shorter than `length` and is dropped.
        

In [126]:
#identify future paragraphs
def paragraph_wordsearch(paragraphs):
    """Select the paragraphs that contain a future-oriented term and tag them.

    Each paragraph is scanned with the module-level ``regex_stem`` and
    ``regex_word``. A match is rejected when any phrase from the module-level
    ``notwordlist`` lies entirely inside the window running from 10
    characters before the match to 10 characters after it. Every accepted
    match gets ``***`` inserted directly in front of it. Paragraphs with at
    least one accepted match are returned.

    All offsets are taken against the untouched paragraph and the markers are
    inserted afterwards, at the matched positions only. This avoids three
    problems of marking with ``str.replace`` while iterating: offsets
    drifting after the first insertion, a negative slice start wrapping
    around to the end of the string, and unrelated occurrences of the same
    letters (inside other words or inside a rejected phrase) being marked.

    Args:
        paragraphs: iterable of paragraph strings.

    Returns:
        A list of the selected paragraphs, with markers inserted.
    """
    context = 10  # characters of context inspected on each side of a match
    # NOTE(review): a phrase reaching more than `context` characters beyond
    # the match (e.g. "shareholders will" around "will") cannot fit in this
    # window, so it never suppresses anything -- confirm the intended width.
    selected_paragraphs = []
    for paragraph in paragraphs:
        accepted_starts = set()
        for match in chain(regex_stem.finditer(paragraph), regex_word.finditer(paragraph)):
            window = paragraph[max(0, match.start() - context): match.end() + context]
            if any(phrase in window for phrase in notwordlist):
                continue
            accepted_starts.add(match.start())
        if not accepted_starts:
            continue
        # Insert right-to-left so the remaining offsets stay valid.
        for position in sorted(accepted_starts, reverse=True):
            paragraph = paragraph[:position] + "***" + paragraph[position:]
        # Normalize any run of asterisks to exactly three.
        paragraph = re.sub(r'\*+', "***", paragraph)
        selected_paragraphs.append(paragraph)
    return selected_paragraphs


In [127]:
#remove paragraphs contain content we know we don't want
def paragraph_remove(paragraphs):
    """Return the paragraphs that contain none of the unwanted content.

    A paragraph is dropped when the module-level ``regex_remove`` matches
    anywhere in it, or when it contains the upper-case substring "NOTE".
    Order of the remaining paragraphs is preserved.
    """
    def is_unwanted(text):
        return regex_remove.search(text) is not None or "NOTE" in text

    return [text for text in paragraphs if not is_unwanted(text)]


In [128]:
#identify future paragraphs for examples
def paragraph_example_wordsearch(selected_text):
    """Collect the paragraphs in which an example term was tagged.

    *selected_text* is split on "####". For each term of the module-level
    ``example_list``, every paragraph where the term appears directly after a
    "***" marker (and ends on a word boundary) is appended to the list at the
    same index in the module-level ``example_container_lists``.

    Returns nothing; the container lists are modified in place.
    """
    for chunk in selected_text.split("####"):
        for position, term in enumerate(example_list):
            tagged_term = r"(?<=\*\*\*)" + term + r"\b"
            if re.search(tagged_term, chunk) is None:
                continue
            example_container_lists[position].append(chunk)

In [129]:
#find financial statement and remove text after
def find_financialstatement(fulltext, cutoff=2/3):
    """Cut *fulltext* at a financial-statement heading late in the document.

    The text is scanned with the module-level ``regex_financestatement``.
    At the first heading whose start offset, as a fraction of the text
    length, is at least *cutoff*, the heading and everything after it are
    removed. Headings before the cutoff are ignored.

    Args:
        fulltext: the document text as a string.
        cutoff: relative position (0-1) from which a heading triggers the
            cut; defaults to two thirds of the document.

    Returns:
        The truncated text, or *fulltext* unchanged when no heading lies at
        or beyond the cutoff.
    """
    text_length = len(fulltext)
    # An empty text yields no matches, so the division below is never
    # reached with text_length == 0.
    for match in regex_financestatement.finditer(fulltext):
        location = match.start()
        if location / text_length >= cutoff:
            return fulltext[:location]
    return fulltext

### Helper function: get metadata

In [130]:
def get_metadata(root):
    """Extract the metadata fields of one document from its XML root.

    Args:
        root: ElementTree element of the document. It must contain
            ``AlphaPubDate``, ``NumericPubDate`` and ``URLDocView`` children
            (an ``AttributeError`` is raised otherwise).

    Returns:
        An 18-tuple ``(date, datenum, year, url, company, naics, code,
        classic, ancode, ancashd, ancashs, anasd, anass, anrevd, anrevs,
        anearnd, anearns, relnames)``:

        * ``date`` / ``datenum`` -- text of AlphaPubDate / NumericPubDate;
        * ``year`` -- year parsed from ``date``;
        * ``url`` -- text of URLDocView;
        * ``company``, ``naics``, ``code``, ``classic`` -- text of the last
          CompanyName, CompanyNAIC, ClassCode and ClassExpansion elements
          found below ``Terms`` (``''`` when absent or when there is no
          ``Terms`` element at all);
        * ``ancode`` ... ``anearns`` -- values of the corresponding FlexTerm
          entries, ``""`` when missing or empty;
        * ``relnames`` -- every ``AnrRelCoNameIdxLit`` FlexTerm value, joined
          with "; ".
    """
    date = root.find('AlphaPubDate').text
    datenum = root.find('NumericPubDate').text
    year = parser.parse(date).year
    url = root.find('URLDocView').text

    # Term tags of interest; a later occurrence overwrites an earlier one.
    term_values = {
        "CompanyName": '',
        "CompanyNAIC": '',
        "ClassCode": '',
        "ClassExpansion": '',
    }
    terms = root.find('Terms')
    # A document without a Terms element simply keeps the empty defaults.
    if terms is not None:
        for node in terms.findall('.//'):
            if node.tag in term_values:
                term_values[node.tag] = node.text
    company = term_values["CompanyName"]
    naics = term_values["CompanyNAIC"]
    code = term_values["ClassCode"]
    classic = term_values["ClassExpansion"]

    # One pass over the FlexTerms: name -> value lookup (last value wins)
    # plus the list of all related-company names.
    flex_terms = {}
    altnames = []
    for elem in root.iter('FlexTerm'):
        term_name = elem.find('FlexTermName').text
        term_value = elem.find('FlexTermValue').text
        flex_terms[term_name] = term_value
        if term_name == "AnrRelCoNameIdxLit":
            altnames.append(term_value)

    # Missing or empty entries are recoded as the empty string.
    ancode, ancashd, ancashs, anasd, anass, anrevd, anrevs, anearnd, anearns = [
        flex_terms.get(key) or ""
        for key in (
            'AnrClassCode1IdxLit',
            'AnrCashDispIdxNum',
            'AnrCashSrchIdxNum',
            'AnrAssetDispIdxNum',
            'AnrAssetSrchIdxNum',
            'AnrRevDispIdxNum',
            'AnrRevSrchIdxNum',
            'AnrEarnDispIdxNum',
            'AnrEarnSrchIdxNum',
        )
    ]
    relnames = '; '.join(altnames)
    return date, datenum, year, url, company, naics, code, classic, ancode, ancashd, ancashs, anasd, anass, anrevd, anrevs, anearnd, anearns, relnames
    

# Main loop

### create list for random selection to check

In [131]:
# Filled by the main loop with one [filename, paragraph] pair per document dated after 1990.
random_selection = []

### create list of zip-files

In [132]:
# Full path of every entry in `directory` whose name ends in '.zip'.
zippaths = [
    os.path.join(directory, zipped)
    for zipped in os.listdir(directory)
    if zipped.endswith('.zip')
]


### open csv for metadata- paragraph

In [133]:
# Metadata CSV: one row per written paragraph.
# newline='' is what the csv module asks for (otherwise newlines embedded in
# a field, or the platform's line endings, can corrupt rows); utf-8 keeps
# non-ASCII company names writable whatever the locale is.
# NOTE(review): nothing in this file closes `csvfile`; call csvfile.close()
# once the main loop is done so the last rows are flushed to disk.
csvfile = open(results + 'metadata_futureperfect_paragraphs.csv', 'w', newline='', encoding='utf-8')
fieldnames = ['Location', 'Filename', "Paragraph", 'RecordTitle', 'Date', 'Date1', 'Year', 'URL', 'CompanyName', 'NAIC', 'ClassCode', 'Classification', 'AnrAssetDispIdxNum', 'numwordstotal', "numwordsparagraph", 'num_paragraphs_prespellcheck', 'num_paragraphs_postspellcheck']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
            

# TEST

In [134]:
# zip1 = [zippaths[4]]
# for filename in zip1:
#     with ZipFile(filename, "r") as zf:
#         for name in zf.namelist():
#             print("FILE :", name)
#             data= zf.read(name)
#             root = ET.fromstring(data)
#             date, datenum, year, url, company, naics, code, classic, ancode, ancashd, ancashs, anasd, anass, anrevd, anrevs, anearnd, anearns, relnames= get_metadata(root)
#             #get fulltext
#             fulltext= get_fulltext(root)
#             #clean fulltext
#             fulltext=clean_punction(fulltext)
#             #find and remove financial statements later in document
#             fulltext=find_financialstatement(fulltext)
#             #identify paragraphs
#             paragraphs1 = paragraph_split(fulltext)
#             paragraphs2 = list(paragraph_combine(paragraphs1, 200))
#             #wordsearch paragraphs
#             paragraphs3= paragraph_wordsearch(paragraphs2)
#             n_paragraphs_pre= len(paragraphs3)
#             #select only quality paragraphs
#             paragraphs4 = select_good_paragraphs(paragraphs3)
#             #remove bad phrases
#             paragraphs5 = paragraph_remove(paragraphs4)
#             if paragraphs5 is None or len(paragraphs5)==0:
#                 print(str(i), name, "No results returned")
#                 continue
#             #combine paragraphs again for STM analysis (into 500 words)
#             paragraphs6 = paragraph_combine(paragraphs5, 500)
           

# Loop

In [143]:
# Main loop: for every XML document in every zip archive, extract the
# future-oriented paragraphs, write them out as text files and record one
# metadata row per paragraph.
# `i` numbers the documents that were actually written.
i=1
for filename in zippaths:
    with ZipFile(filename, "r") as zf:
        for name in zf.namelist():
            data = zf.read(name)
            root = ET.fromstring(data)
            date, datenum, year, url, company, naics, code, classic, ancode, ancashd, ancashs, anasd, anass, anrevd, anrevs, anearnd, anearns, relnames = get_metadata(root)
            # NOTE(review): assumes the record title lives in a <RecordTitle>
            # child of the root; the column is left blank when it does not.
            # (The row used to carry the builtin function `id` here.)
            title = root.findtext('RecordTitle', default='')
            #get fulltext
            fulltext = get_fulltext(root)
            #clean fulltext
            fulltext = clean_punction(fulltext)
            #find and remove financial statements later in document
            fulltext = find_financialstatement(fulltext)
            #identify paragraphs
            paragraphs1 = paragraph_split(fulltext)
            paragraphs2 = list(paragraph_combine(paragraphs1, 200))
            #wordsearch paragraphs
            paragraphs3 = paragraph_wordsearch(paragraphs2)
            # count before the quality filter; this value is written to the
            # 'num_paragraphs_prespellcheck' column below
            n_paragraphs_pre = len(paragraphs3)
            #select only quality paragraphs
            # NOTE(review): select_good_paragraphs is not defined in the part
            # of the file visible here -- it must be defined before this runs.
            paragraphs4 = select_good_paragraphs(paragraphs3)
            #remove bad phrases
            paragraphs5 = paragraph_remove(paragraphs4)
            #combine paragraphs again for STM analysis (chunks of at least 500 characters)
            paragraphs6 = list(paragraph_combine(paragraphs5, 500))
            n_paragraphs_post = len(paragraphs6)
            if not paragraphs6:
                print(str(i), name, "No results returned")
                continue
            #select a random paragraph for Barbara analysis from documents dated after 1990
            # NOTE(review): the previous comment said "post 1900"; the code tests 1990.
            if year > 1990:
                selection = random.choice(paragraphs6)
                random_selection.append([name, selection])
            #paragraph_example_wordsearch(selected_text)
            #join text
            selected_text = "####".join(paragraphs6)
            #calculate number of words
            numwordstotal = len(selected_text.split())
            # file name without its extension (replaces an unescaped,
            # unanchored ".xml" regex substitution)
            basename = os.path.splitext(name)[0]
            #write out full document
            with open(results + basename + ".txt", "w", encoding="utf-8") as text_file:
                text_file.write(selected_text)
            #iterate through paragraphs and write file
            for p, paragraph in enumerate(paragraphs6):
                # number of words in the paragraph, counted the same way as numwordstotal
                numwordspara = len(paragraph.split())
                #put results in a dict
                # company/classification are written as text: encoding them to
                # bytes makes the csv module write their b'...' repr.
                resultsdict = {
                    'Filename': name,
                    "Paragraph": p,
                    'RecordTitle': title,
                    'Date': date,
                    "Date1": datenum,
                    'Year': year,
                    'URL': url,
                    'CompanyName': company,
                    'NAIC': naics,
                    'ClassCode': code,
                    'Classification': classic,
                    'AnrAssetDispIdxNum': anasd,
                    'numwordstotal': numwordstotal,
                    'numwordsparagraph': numwordspara,
                    'num_paragraphs_prespellcheck': n_paragraphs_pre,
                    'num_paragraphs_postspellcheck': n_paragraphs_post,
                }
                #write to metadata file
                writer.writerow(resultsdict)
                #write out selected text
                pname = basename + "_P" + str(p) + ".txt"
                with open(results_paragraphs + pname, "w", encoding="utf-8") as text_file2:
                    text_file2.write(paragraph)
            print(str(i), name, "Written")
            i+=1
            
            

1 88201676.xml Written
2 88179860.xml Written
3 88198679.xml Written
4 88190194.xml Written
5 88178029.xml Written
6 88198719.xml Written
7 88177949.xml Written
8 88180180.xml Written
9 88205316.xml Written
10 88180349.xml Written
11 88203156.xml Written
12 88179900.xml Written
13 88179100.xml Written
14 88178669.xml No results returned
14 88175669.xml Written
15 88197156.xml Written
16 88177269.xml No results returned
16 88201519.xml Written
17 88193154.xml Written
18 88200119.xml Written
19 88179620.xml Written
20 88176989.xml No results returned
20 88178229.xml No results returned
20 88191354.xml Written
21 88179420.xml Written
22 88197999.xml Written
23 88176869.xml Written
24 88189354.xml Written
25 88175429.xml Written
26 88204516.xml Written
27 88178909.xml Written
28 88173700.xml Written
29 88180420.xml No results returned
29 88177220.xml Written
30 88176909.xml Written
31 88174700.xml Written
32 88180900.xml Written
33 88175509.xml Written
34 88178069.xml Written
35 88180389.x

275 88195960.xml Written
276 88181228.xml Written
277 88195120.xml Written
278 88173051.xml Written
279 88191834.xml Written
280 88172895.xml No results returned
280 88214451.xml Written
281 88213811.xml Written
282 88193074.xml Written
283 88208091.xml Written
284 88183627.xml Written
285 88172699.xml Written
286 88183267.xml No results returned
286 88174055.xml Written
287 88205371.xml Written
288 88181268.xml Written
289 88201451.xml Written
290 88208105.xml Written
291 88205942.xml Written
292 88208185.xml Written
293 88175868.xml No results returned
293 88177308.xml No results returned
293 88173335.xml No results returned
293 88199980.xml Written
294 88181307.xml Written
295 88190634.xml Written
296 88179387.xml No results returned
296 88182788.xml Written
297 88205611.xml Written
298 88191714.xml Written
299 88176588.xml No results returned
299 88191394.xml Written
300 88172977.xml Written
301 88176948.xml Written
302 88207785.xml Written
303 88185707.xml Written
304 88182588.xml

540 88189216.xml Written
541 88185257.xml Written
542 88187139.xml Written
543 88173131.xml Written
544 88173571.xml Written
545 88177371.xml Written
546 88205756.xml Written
547 88209476.xml Written
548 88224356.xml Written
549 88225916.xml Written
550 88216596.xml Written
551 88176912.xml Written
552 88206116.xml Written
553 88223676.xml Written
554 88207156.xml Written
555 88178632.xml Written
556 88207076.xml Written
557 88218116.xml Written
558 88208236.xml Written
559 88223996.xml Written
560 88207756.xml Written
561 88223116.xml Written
562 88220916.xml Written
563 88222116.xml Written
564 88228836.xml Written
565 88181913.xml Written
566 88228803.xml Written
567 88181393.xml No results returned
567 88223756.xml Written
568 88180352.xml Written
569 88180032.xml Written
570 88228916.xml Written
571 88212916.xml Written
572 88223316.xml Written
573 88181193.xml Written
574 88180913.xml Written
575 88206516.xml Written
576 88208476.xml Written
577 88206076.xml Written
578 88223569.

816 88232361.xml Written
817 88179374.xml Written
818 88192843.xml Written
819 88213393.xml Written
820 88204232.xml Written
821 88193723.xml Written
822 88200392.xml Written
823 88184816.xml Written
824 88196043.xml No results returned
824 88205075.xml Written
825 88204115.xml Written
826 88197392.xml Written
827 88195643.xml Written
828 88203192.xml Written
829 88189518.xml Written
830 88189798.xml Written
831 88199312.xml Written
832 88187005.xml Written
833 88231164.xml Written
834 88204194.xml Written
835 88180457.xml Written
836 88177777.xml No results returned
836 88176897.xml Written
837 88180174.xml Written
838 88202438.xml Written
839 88190758.xml Written
840 88214233.xml Written
841 88186678.xml Written
842 88192963.xml Written
843 88197123.xml Written
844 88198598.xml Written
845 88192118.xml Written
846 88198672.xml Written
847 88193523.xml Written
848 88188099.xml Written
849 88187958.xml Written
850 88220553.xml Written
851 88183979.xml No results returned
851 88206152.x

1104 88199160.xml Written
1105 88215491.xml Written
1106 88218277.xml Written
1107 88175735.xml Written
1108 88175540.xml Written
1109 88176464.xml Written
1110 88176986.xml No results returned
1110 88175506.xml Written
1111 88176425.xml No results returned
1111 88176146.xml Written
1112 88177304.xml No results returned
1112 88174744.xml Written
1113 88175865.xml Written
1114 88178304.xml No results returned
1114 88178384.xml Written
1115 88175225.xml Written
1116 88177745.xml No results returned
1116 88174584.xml Written
1117 88173780.xml Written
1118 88174820.xml No results returned
1118 88176220.xml Written
1119 88176345.xml Written
1120 88174300.xml Written
1121 88176465.xml No results returned
1121 88175145.xml Written
1122 88175466.xml No results returned
1122 88174824.xml No results returned
1122 88176705.xml No results returned
1122 88176062.xml No results returned
1122 88175224.xml Written
1123 88173620.xml Written
1124 88176624.xml Written
1125 88175265.xml Written
1126 88175

1351 88229239.xml Written
1352 88173291.xml Written
1353 88193581.xml Written
1354 88181146.xml Written
1355 88189503.xml Written
1356 88199866.xml Written
1357 88188731.xml Written
1358 88182909.xml Written
1359 88183651.xml Written
1360 88201186.xml Written
1361 88202026.xml Written
1362 88187049.xml No results returned
1362 88185204.xml Written
1363 88180746.xml Written
1364 88177346.xml Written
1365 88187849.xml No results returned
1365 88174589.xml Written
1366 88192624.xml Written
1367 88187376.xml Written
1368 88190609.xml Written
1369 88181229.xml Written
1370 88187084.xml Written
1371 88179106.xml Written
1372 88174651.xml Written
1373 88188649.xml Written
1374 88184009.xml No results returned
1374 88182778.xml Written
1375 88192864.xml Written
1376 88172904.xml Written
1377 88190743.xml Written
1378 88182549.xml No results returned
1378 88181746.xml Written
1379 88194656.xml Written
1380 88194056.xml Written
1381 88193301.xml Written
1382 88230390.xml Written
1383 88191064.xm

1636 88180182.xml Written
1637 88189063.xml Written
1638 88224967.xml Written
1639 88221725.xml Written
1640 88188485.xml No results returned
1640 88218122.xml Written
1641 88190285.xml Written
1642 88189685.xml Written
1643 88184888.xml Written
1644 88195080.xml Written
1645 88212160.xml Written
1646 88196600.xml Written
1647 88189423.xml No results returned
1647 88222925.xml Written
1648 88183568.xml Written
1649 88184782.xml No results returned
1649 88176441.xml Written
1650 88190405.xml Written
1651 88176942.xml No results returned
1651 88198965.xml No results returned
1651 88223925.xml Written
1652 88221267.xml Written
1653 88188985.xml No results returned
1653 88207747.xml Written
1654 88197585.xml Written
1655 88189905.xml No results returned
1655 88195265.xml Written
1656 88191900.xml Written
1657 88214827.xml Written
1658 88191820.xml Written
1659 88208507.xml Written
1660 88199985.xml Written
1661 88201665.xml Written
1662 88196625.xml Written
1663 88202540.xml Written
1664 8

1907 88175325.xml Written
1908 88186155.xml Written
1909 88201809.xml Written
1910 88176046.xml No results returned
1910 88188275.xml Written
1911 88177631.xml Written
1912 88173362.xml Written
1913 88177836.xml Written
1914 88178071.xml Written
1915 88179314.xml Written
1916 88204086.xml Written
1917 88185434.xml Written
1918 88177084.xml Written
1919 88172743.xml Written
1920 88229774.xml Written
1921 88221157.xml Written
1922 88173143.xml Written
1923 88183236.xml No results returned
1923 88181596.xml Written
1924 88212666.xml Written
1925 88227194.xml Written
1926 88183474.xml Written
1927 88177645.xml Written
1928 88196313.xml Written
1929 88179394.xml Written
1930 88224088.xml Written
1931 88200567.xml Written
1932 88206208.xml Written
1933 88202047.xml Written
1934 88183393.xml Written
1935 88212168.xml Written
1936 88202567.xml Written
1937 88204359.xml Written
1938 88186353.xml No results returned
1938 88211368.xml Written
1939 88202557.xml Written
1940 88207408.xml Written
19

KeyboardInterrupt: 

## Create text file with examples

In [192]:
#for i,item in enumerate(example_list):
#    with open(test_files +  item.replace(" ", "") + '2.txt', 'w') as f:
#        f.write('%s\n\n' % example_container_lists[i])

## Create selection of 1000 random paragraphs

In [211]:
#how many were selected?
# Number of [filename, paragraph] pairs the main loop appended (documents dated after 1990 only).
len(random_selection)

38060

In [212]:
#select 1000 random paragraphs
# random.sample draws WITHOUT replacement, so no paragraph appears twice in
# the sample (random.choices can repeat entries). The size is capped at the
# number of available entries because random.sample raises ValueError when
# asked for more than the population holds.
random_sample = random.sample(random_selection, k=min(1000, len(random_selection)))
len(random_sample)

1000

In [213]:
#put these in a csv
# One row per sampled [filename, paragraph] pair.
# A separate name is used for this writer so that the module-level `writer`
# (the metadata DictWriter used by the main loop) is not overwritten.
with open(test_paragraphs + "random_sample_v1.csv", "w", newline="", encoding="utf-8") as f:
    sample_writer = csv.writer(f)
    sample_writer.writerows(random_sample)

In [214]:
# Inspect paragraphs4 as last assigned by the main loop (the quality-filtered paragraphs of the last document processed).
paragraphs4

['Shareowners are cordially invited to attend the Annual Meeting of the company at The Greenbner White Sulphur Springs W. Va. on March 24 1961. Details ***will be given in the Notice of the Annual MeetingBeginning Another Great Era of Growth']