In [3]:
import pandas as pd

from pdf_scraper.doc_utils import open_exam
from pdf_scraper.block_utils import clean_blocks
from pdf_scraper.line_utils import print_line_table, get_line_df

1. Get all images.
2. Get all text lines.
3. Identify and Remove captions from text lines.
4. Identify and resort dual column text.

In [13]:
def get_doc_line_df(doc):
    dfs = []
    for i, page in enumerate(doc):
        page_blocks  = page.get_text("dict",sort=True)["blocks"]

        text_blocks  = [block for block in page_blocks if not block["type"]]
        text_blocks = clean_blocks(text_blocks)

        page_lines   = [ line for block in text_blocks for line in block["lines"]]
        page_df = get_line_df(page_lines)
        page_df["page"] = i+1
        page_df.sort_values("y0",inplace=True)
        dfs.append(page_df)
    grand_df = pd.concat(dfs,ignore_index=True)
    grand_df["dual_col"]=0

    return grand_df


In [14]:
year=2020
doc = open_exam(year,"english","al",1)
grand_df = get_doc_line_df(doc)

In [15]:
grand_df.text.values[20:40]

array(['Question B. ',
       'N.B. Candidates may NOT answer a Question A and a Question B on the same text.  ',
       'SECTION II – COMPOSING ',
       '• Candidates must write on one of the compositions 1 – 7. ',
       'SECTION I                       COMPREHENDING                    (100 marks) ',
       'TEXT 1 – FROM GENRE to GENRE ',
       'This text consists of two elements: firstly, edited extracts adapted from Alan McMonagle’s essay, ',
       'The Misadventures of a Dithering Writer in Thirteen and A Half Fragments, in which he discusses ',
       'writing in different genres.  The second element is a genre-related cartoon by Tom Gauld. ',
       'I flit anxiously and eagerly from genre to genre.  ',
       'yourself be led by the child that you were.  This ',
       'I always have a few stories on the go.  Some of ',
       'is a tendency I adhered to upon my resumption ',
       'and, indeed, return to when it all threatens to ',
       'them are like eels – they slip a

In [None]:
blocks=[]
images = []
dfs = []
for i, page in enumerate(doc):
    page_blocks  = page.get_text("dict",sort=True)["blocks"]

    text_blocks  = [block for block in page_blocks if not block["type"]]
    text_blocks = clean_blocks(text_blocks)

    image_blocks = [block for block in page_blocks if     block["type"]]
    for image_block in image_blocks:
        image_block["page"]= i+1

    page_lines   = [ line for block in text_blocks for line in block["lines"]]
    page_df = get_line_df(page_lines)
    page_df["page"] = i+1
    dfs.append(page_df)

grand_df = pd.concat(dfs,ignore_index=True)
grand_df["dual_col"]=0

# label dual column text

## Label page 2

In [16]:
grand_df[grand_df.page==2].text.head(15)

24    SECTION I                       COMPREHENDING ...
25                        TEXT 1 – FROM GENRE to GENRE 
26    This text consists of two elements: firstly, e...
27    The Misadventures of a Dithering Writer in Thi...
28    writing in different genres.  The second eleme...
29    I flit anxiously and eagerly from genre to gen...
30    yourself be led by the child that you were.  T...
31     I always have a few stories on the go.  Some of 
32       is a tendency I adhered to upon my resumption 
33     and, indeed, return to when it all threatens to 
34     them are like eels – they slip away if I do not 
35    make a fast grab.  Some are like bold children – 
36                                   get away from me. 
37      they pay absolutely no attention to anything I 
38     tell them to do.  One or two arrive unannounced 
Name: text, dtype: object

function
- takes in 4 lines: top left, top right, bottom left, bottom right
- makes copy of data frame
- identifies all lines between these. (max and min index)
- sorts this part of the data frame 
- returns tuple (sorted data frame, indices)

Then you can reassign this sorted data frame to the original data frame between
the provided indices.

Or else it can change the passed data frame in place which would be as useufl. 

In [17]:
def setDualCols(grand_df: pd.DataFrame, page_num:int, bookends: tuple[str]):
    page_df = grand_df[grand_df.page==page_num].copy()
    l1, r1, l2, r2 = bookends
    for line in bookends:
        print(page_df[page_df.text.str.contains(line)].index)
    indices = [page_df[page_df.text.str.contains(line)].index for line in bookends]
    top    = min(indices).values[0]
    bottom = max(indices).values[0]
    dual_cols = page_df[top:bottom+1].copy()
    dual_cols.sort_values(["x0","y0"],inplace=True)
    dual_cols["daul_col"]=1

    grand_df.loc[top:bottom+1] = dual_cols

    return grand_df


In [18]:
grand_df[grand_df.page==2].text.head(40)

24    SECTION I                       COMPREHENDING ...
25                        TEXT 1 – FROM GENRE to GENRE 
26    This text consists of two elements: firstly, e...
27    The Misadventures of a Dithering Writer in Thi...
28    writing in different genres.  The second eleme...
29    I flit anxiously and eagerly from genre to gen...
30    yourself be led by the child that you were.  T...
31     I always have a few stories on the go.  Some of 
32       is a tendency I adhered to upon my resumption 
33     and, indeed, return to when it all threatens to 
34     them are like eels – they slip away if I do not 
35    make a fast grab.  Some are like bold children – 
36                                   get away from me. 
37      they pay absolutely no attention to anything I 
38     tell them to do.  One or two arrive unannounced 
39      I am, at various times, a reluctant, plodding, 
40    instinctive, spontaneous writer.  At times I f...
41        from the farthest recesses of my imagi

In [19]:
line_l1 = "I flit anxious"
line_r1 = "yourself be led by the child"
line_lf = "imagination had not abandoned me"
line_rf = "lose almost all the time"

bookends = [line_l1,line_r1, line_lf, line_rf]

grand_df = setDualCols(grand_df, 2, bookends)

Index([29], dtype='int64')
Index([30], dtype='int64')
Index([106], dtype='int64')
Index([107], dtype='int64')


In [20]:
grand_df[grand_df.page==2].text.head(40)

24    SECTION I                       COMPREHENDING ...
25                        TEXT 1 – FROM GENRE to GENRE 
26    This text consists of two elements: firstly, e...
27    The Misadventures of a Dithering Writer in Thi...
28    writing in different genres.  The second eleme...
53                              immediate attention.   
54             I don’t know if my writing is in anyway 
55        I have started several novels.  There is the 
56    distinctive.  I am an aural learner as opposed...
57       edgy-existential one about the brother-sister 
58        assassination squad.  There is the comedy-of-
59            say the more common visual learning that 
60         attends so much writing.  I can hear things 
61      desperation one about the office slave finally 
62           tipped over the edge by a boss constantly 
63      before I see them.  My reasons for writing are 
64    partly intrinsic, partly spiritual, partly fan...
65       referred to as the highly evolved veget

In [75]:
page2_df = grand_df[grand_df.page==2].copy()
line_l1 = "I flit anxious"
line_r1 = "yourself be led by the child"

line_lf = "imagination had not abandoned me"
line_rf = "lose almost all the time"

bookends = [line_l1,line_r1, line_lf, line_rf]
for line in bookends:
    print(page2_df[page2_df.text.str.contains(line)].index)
indices = [page2_df[page2_df.text.str.contains(line)].index for line in bookends]
top = min(indices)
bottom = max(indices)

Index([29], dtype='int64')
Index([69], dtype='int64')
Index([68], dtype='int64')
Index([107], dtype='int64')


In [73]:
page2_text = page2_df[top.values[0]:bottom.values[0]+1].sort_values(["x0","y0"])
page2_text["dual_col"]=1

In [74]:
page2_text.text[36:50].values

array(['referred to as the highly evolved vegetable.  ',
       'There is the life-weary one about the last day ',
       'in the working life of a barber terrified beyond ',
       'measure of the imminent reunion with his ',
       'poet-activist daughter.  There is my novel ',
       'featuring an as-yet-to-be named antagonist ',
       'who is more of a genius in dreams than in life.  ',
       'I am all the time hankering to work on the very ',
       'project I am not currently tangled up inside.  ',
       'If you know what you want to be you will be it.  ',
       'If you don’t know, then you will spend your ',
       'days reinventing yourself, discovering who you ',
       'are.  I envy the former standpoint in so many ',
       'ways.  But I am an uncertain person and rather '], dtype=object)