### Prepare text data by parsing the book PDFs.

In [1]:
import os
import json
import pdfplumber
import pandas as pd
import numpy as np
from collections import Counter
import string

In [2]:
# Titles listed here are removed from the label table before any text is extracted.
books_to_exclude = []

# Read the book list, rename the 'Author ' header (it carries a trailing
# space) to 'Author', and drop the excluded titles.
labels = (
    pd.read_excel('./Book-List-Final-NONA.xlsx', sheet_name='Sheet1')
    .rename(columns={'Author ': 'Author'})
    .loc[lambda book_list: ~book_list.Title.isin(books_to_exclude)]
)

In [3]:
# Work from the PDF folder: the following cell lists the directory and opens
# each PDF by its bare filename.
os.chdir('../text_pdfs')

In [4]:
# Start from an empty frame; the 'Title' and 'Text' columns are attached at the
# end of this cell.
df = pd.DataFrame()

def grab_text(title, labels):
    """Extract the text of one book from ``<title>.pdf`` in the working directory.

    Parameters
    ----------
    title : str
        Book title. It is matched against ``labels.Title`` and, with ``.pdf``
        appended, used as the path of the file to open.
    labels : pandas.DataFrame
        Table with ``Title``, ``Starting Page`` and ``Ending Page`` columns.
        Page numbers are treated as 1-based and both bounds are inclusive.
        When several rows match the title, the first one is used.

    Returns
    -------
    str
        The text of every page in range, each page preceded by a newline.
        Pages for which ``extract_text()`` returns ``None`` are skipped.
        An empty string is returned when no page falls in range; this includes
        titles with no row in ``labels``, for which a notice is printed and
        both bounds fall back to 0.
    """
    # Filter the label table once and reuse the result for both bounds.
    book_rows = labels.loc[labels.Title == title]

    def page_bound(column, missing_note):
        # First matching value, or 0 (with a printed notice) for unlabelled titles.
        bounds = book_rows[column]
        if len(bounds) == 0:
            print(title, missing_note)
            return 0
        return bounds.values[0]

    start = page_bound('Starting Page', "no start")
    end = page_bound('Ending Page', "no end")

    page_texts = []
    with pdfplumber.open(title + '.pdf') as pdf:
        for i, page in enumerate(pdf.pages):
            if not i < end:
                # Past the last requested page, so nothing further can match.
                # Written as `not i < end` so a NaN bound also selects no pages.
                break
            if i + 1 >= start:
                single_page_text = page.extract_text()
                if single_page_text is not None:
                    page_texts.append(single_page_text)

    # One join instead of repeated string concatenation; same result.
    return ''.join('\n' + page_text for page_text in page_texts)

# One title per PDF in the working directory. os.path.splitext splits on the
# final dot only, so titles containing a period are kept whole and entries
# without an extension are skipped instead of raising IndexError.
# NOTE(review): os.listdir() returns entries in arbitrary order, and a later
# cell draws a seeded sample that depends on row order -- confirm before
# sorting here or re-running on another machine.
df['Title'] = [
    stem
    for stem, extension in map(os.path.splitext, os.listdir())
    if extension == '.pdf'
]
# Extract each book's text using the page range recorded in `labels`.
df['Text'] = [grab_text(title, labels) for title in df.Title]

In [5]:
# Leave the PDF folder; later cells use paths relative to this directory
# (for example 'data/tempdf.pickle').
os.chdir('../code_new_version')
# Preview the first rows of the extracted titles and text.
df.head()

Unnamed: 0,Title,Text
0,The Night Before Christmas,\n'Twas the night before Christmas\nwhen all t...
1,Sugarlump and the Unicorn,"\nThe unicorn has a silver horn, Her\neyes are..."
2,The Gruffalo,\nA mouse took a stroll through the deep dark ...
3,The Monstrous Tale of Celery Crumble,\nHave you met Celery Crumble?\nThat’s her rig...
4,Peace at Last,"\nThe hour was late.\nMr Bear was tired, Mrs B..."


In [6]:
# Keep only the books for which some text was extracted, then renumber rows.
# NOTE(review): reset_index() without drop=True stores the old index as an
# extra 'index' column, and the cell is not idempotent when re-run -- confirm
# whether anything relies on that column before changing it.
has_text = df['Text'] != ''
df = df[has_text].reset_index()

In [7]:
# Number of books that remain after the empty-text filter.
len(df)

196

In [10]:
import pickle

In [11]:
# Persist the per-book text table so it can be reloaded without re-parsing the PDFs.
with open('data/tempdf.pickle', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

#### Converting the full dataset into a dataframe of sentences
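Note: `strip()` was previously applied to each sentence to remove leading or trailing spaces; the cell below currently collapses whitespace in the full text before parsing and stores each sentence span as produced, without re-parsing a stripped copy.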

In [33]:
# Build one row per sentence: book title, word count, sentence span, and the
# sentence's position within its book.
# NOTE(review): `nlp` is not defined in any cell visible here; presumably a
# spaCy pipeline loaded elsewhere -- TODO load it explicitly in this notebook
# so that Restart & Run All works.
sentences = pd.DataFrame()

book_col = []
sentences_col = []
length_col = []
index_col = []

# Built once: deletes every ASCII punctuation character before word counting.
strip_punctuation = str.maketrans('', '', string.punctuation)

for title, text in zip(df.Title, df.Text):
    # Collapse every run of whitespace (newlines and tabs included) to a single
    # space. Dropping line breaks is only safe provided they are not being used
    # to separate sentences without punctuation; dropping tabs lets the table
    # be saved as TSV.
    text = ' '.join(text.split())

    # `doc` is kept as a top-level name because a later cell pickles it.
    doc = nlp(text)

    for si, sen in enumerate(doc.sents):
        book_col.append(title)
        # The sentence span is stored as produced (no strip()/re-parse).
        sentences_col.append(sen)
        # split() with no argument discards the empty strings left behind when
        # a free-standing punctuation mark is deleted (e.g. "Oh - dear"), which
        # split(' ') would have counted as words.
        length_col.append(len(sen.text.translate(strip_punctuation).split()))
        index_col.append(si)

sentences['book'] = book_col
sentences['sentence_length'] = length_col
sentences['sentence'] = sentences_col
sentences['sentence_index'] = index_col

In [36]:
# Draw a 15% row sample of the sentence table; the fixed random_state makes the
# draw repeatable for an identical input table.
coding_sample = sentences.sample(frac=0.15, axis=0, random_state=42)

#### Check that this sample contains the same sentences that were manually coded previously.

In [37]:
# Load the previously coded sample: tab-separated, first column used as the index.
manually_coded = pd.read_csv('./sentences_for_coding/sample_15pc.csv', delimiter='\t', index_col=0)

In [38]:
# Pair the two tables row by row (positionally) and record, for each pair,
# whether the stored sentence string equals the text of the sampled span.
text_equal = []
for coded_sentence, sampled_span in zip(manually_coded.sentence, coding_sample.sentence):
    text_equal.append(coded_sentence == sampled_span.text)

In [39]:
# The row-by-row comparison is built with zip(), which stops at the shorter
# input, so the row counts are checked as well; otherwise a truncated file
# would pass on a partial comparison.
assert len(manually_coded) == len(coding_sample), \
    "sample size differs from the manually coded file"
assert all(text_equal), \
    "sampled sentences differ from the manually coded sentences"

In [35]:
# Write `doc` -- the value left over from the sentence-splitting loop, i.e. the
# parse of the last book processed -- to a scratch pickle.
# NOTE(review): only that single object is saved, not the sentence table;
# confirm this is intended.
with open('data/temp.pickle', 'wb') as pickle_file:
    pickle.dump(doc, pickle_file)

In [20]:
# Export the full sentence table (index included) as a tab-separated file.
sentences.to_csv('./data/all_sentences_oldenv.tsv', sep='\t')

In [28]:
# Inspect the type of the objects stored in the 'sentence' column.
type(sentences.iloc[0].sentence)

spacy.tokens.span.Span

In [23]:
# Display the sampled sentences (pandas abbreviates the output).
coding_sample.sentence

12493    (Now, the, fat, red, hen, with, her, thin, bro...
1350     (", I, have, secret, plans, and, clever, trick...
3656     (She, took, him, to, the, park, to, play, on, ...
11731                                        (“, There, .)
4560     (Inside, the, tower, a, windy, ,, windy, stair...
                               ...                        
3784     (Dogger, had, just, been, bought, by, a, littl...
3076     (He, found, a, space, helmet, on, the, drainin...
7799                         (Even, Biscuits, the, dog, !)
12890                 (“, You, ’re, not, the, Mouse, ., ”)
2611     (A, baby, ,, lying, in, a, manger, ;, a, baby,...
Name: sentence, Length: 2125, dtype: object

In [18]:
pip freeze

anyio==3.7.1
backcall==0.2.0
blis==0.7.11
cached-property==1.5.2
catalogue==2.0.10
certifi==2024.2.2
cffi==1.15.1
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
confection==0.1.4
cryptography==42.0.7
cycler==0.11.0
cymem==2.0.8
debugpy==1.7.0
decorator==5.1.1
distro==1.9.0
en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc
entrypoints==0.4
et-xmlfile==1.1.0
exceptiongroup==1.2.1
fonttools==4.38.0
h11==0.14.0
httpcore==0.17.3
httpx==0.24.1
idna==3.7
importlib-metadata==6.7.0
ipykernel==6.16.2
ipython==7.34.0
jedi==0.19.1
Jinja2==3.1.4
jupyter_client==7.4.9
jupyter_core==4.12.0
kiwisolver==1.4.5
langcodes==3.3.0
MarkupSafe==2.1.5
matplotlib==3.5.3
matplotlib-inline==0.1.6
murmurhash==1.0.10
nest-asyncio==1.6.0
numpy==1.21.6
openai==1.35.10
openpyxl==3.1.2
packaging==24.0
pandas==1.3.5
parso==0.8.4
pdfminer.six==20221105
