# Refine rotated text

In [1]:
from pdfmole import read_pdf, group_blocks, align_exact_match, kdplot
import numpy as np
import pandas as pd
from collections import Counter

## Example 1

In [2]:
# Read only the first page of the sample and fetch its text records.
doc = read_pdf("../samples/rotated_text_1.pdf", page_numbers=[0])
d = doc.get_text()
# Keep only the records whose font size is positive.
d = d.loc[d['size'] > 0]

For rotated text the `block` annotation does not work, and since we only read one page, the page id is not needed either.

In [3]:
# Columns that carry no information for this analysis.
unused_columns = {"colorspace", "color", "block", "pid"}
d = d[[name for name in d.columns if name not in unused_columns]]

### Identify rotated text
**pdfminer** doesn't tell us whether a given letter is rotated. However, for most characters and fonts the character height exceeds the character width. For a rotated letter this is reversed, so we flag a letter as rotated when its bounding box is wider than it is tall.

In [4]:
# A letter is flagged as rotated when its bounding box is wider than tall.
letter_height = d['y1'] - d['y0']
letter_width = d['x1'] - d['x0']
# `d` was derived from another frame by filtering, so item assignment
# (d['rotated'] = ...) can trigger pandas' SettingWithCopyWarning.
# `assign` returns a new frame and makes the cell safe to re-run.
d = d.assign(rotated=(letter_width > letter_height).to_numpy())
d.head(3)

Unnamed: 0,text,font,size,x0,y0,x1,y1,rotated
0,1.0,BAAAAA+LiberationSans,18.0,96.605,678.813,106.613,696.813,False
2,,BAAAAA+LiberationSans,4.986,87.704,629.688,105.704,634.674,True
4,,BAAAAA+LiberationSans,4.986,87.704,624.586,105.704,629.572,True


### Split the DataFrame
We split the `DataFrame` into a rotated and not rotated part.

In [5]:
# d0: rotated letters, row ids derived from x0.
d0 = d[d["rotated"]].pipe(
    lambda part: part.assign(rowid=align_exact_match(part["x0"]))
)
# d1: upright letters, row ids derived from y0.
d1 = d[~d["rotated"]].pipe(
    lambda part: part.assign(rowid=align_exact_match(part["y0"]))
)

### Join the rows

In [6]:
def join_rows(x, rotated=False):
    """Merge the letters of every row into one text record.

    Parameters
    ----------
    x : pandas.DataFrame
        One record per letter with the columns ``text``, ``font``, ``size``,
        ``x0``, ``y0``, ``x1``, ``y1`` and ``rowid``.
    rotated : bool, default False
        If True the letters of a row are ordered by ``y0`` before joining,
        otherwise by ``x0``.

    Returns
    -------
    pandas.DataFrame
        One record per distinct ``rowid`` holding the joined text, the most
        frequent font and size, and the bounding box of the row. The columns
        ``text, font, size, x0, y0, x1, y1`` are present even when ``x`` has
        no rows, so downstream column access keeps working (e.g. for a page
        without rotated text).
    """
    columns = ['text', 'font', 'size', 'x0', 'y0', 'x1', 'y1']
    sort_key = "y0" if rotated else "x0"

    def join_row(d):
        # Reading order within a row follows the chosen coordinate.
        d = d.sort_values(sort_key)
        return {
            'text': ''.join(d['text']),
            # Most frequent value wins; ties go to the first one encountered.
            'font': Counter(d['font']).most_common(1)[0][0],
            'size': Counter(d['size']).most_common(1)[0][0],
            'x0': d['x0'].min(), 'y0': d['y0'].min(),
            'x1': d['x1'].max(), 'y1': d['y1'].max(),
        }

    x = x.astype({'rowid': int})
    records = [join_row(group) for _, group in x.groupby('rowid') if len(group) > 0]
    # Passing `columns` keeps the schema stable when there are no records.
    return pd.DataFrame(records, columns=columns)

In [7]:
# Upright text: letters are joined from left to right.
join_rows(d1, rotated=False)

Unnamed: 0,text,font,size,x0,y0,x1,y1
0,1,BAAAAA+LiberationSans,18.0,96.605,678.813,106.613,696.813
1,Mein Text,BAAAAA+LiberationSans,18.0,273.005,574.413,349.613,592.413
2,Häferl,BAAAAA+LiberationSans,18.0,114.605,480.813,162.503,498.813


In [8]:
# Rotated text: letters are joined in order of increasing y0.
join_rows(x=d0, rotated=True)

Unnamed: 0,text,font,size,x0,y0,x1,y1
0,Rotated 2,BAAAAA+LiberationSans,10.008,289.587,261.099,307.587,338.997
1,1,BAAAAA+LiberationSans,10.008,107.886,561.713,125.886,571.721
2,Rotated,BAAAAA+LiberationSans,10.008,87.704,561.713,105.704,634.674


## Example 2

In [9]:
# Read only the first page of the second sample and fetch its text records.
doc = read_pdf("../samples/rotated_text_2.pdf", page_numbers=[0])
d = doc.get_text()
# Keep only the records whose font size is positive.
d = d.loc[d['size'] > 0]

In [10]:
# Columns that carry no information for this analysis.
unused_columns = {"colorspace", "color", "block", "pid"}
d = d[[name for name in d.columns if name not in unused_columns]]

In [11]:
# A letter is flagged as rotated when its bounding box is wider than tall.
letter_height = d['y1'] - d['y0']
letter_width = d['x1'] - d['x0']
# `d` was derived from another frame by filtering, so item assignment
# (d['rotated'] = ...) can trigger pandas' SettingWithCopyWarning.
# `assign` returns a new frame and makes the cell safe to re-run.
d = d.assign(rotated=(letter_width > letter_height).to_numpy())
d.head(3)

Unnamed: 0,text,font,size,x0,y0,x1,y1,rotated
0,2,BAAAAA+LiberationSans,10.008,255.089,724.904,273.089,734.912,True
2,,BAAAAA+LiberationSans,4.986,255.089,719.887,273.089,724.873,True
4,t,BAAAAA+LiberationSans,4.986,255.089,714.898,273.089,719.884,True


In [12]:
# d0: rotated letters, row ids derived from x0.
d0 = d[d["rotated"]].pipe(
    lambda part: part.assign(rowid=align_exact_match(part["x0"]))
)
# d1: upright letters, row ids derived from y0.
d1 = d[~d["rotated"]].pipe(
    lambda part: part.assign(rowid=align_exact_match(part["y0"]))
)

In Zeile 1 and Zeile 2, Textblock 1 and 2 are connected by spaces.
In Zeile 3, Textblock 1 and 2 are two separate blocks that are not connected by spaces. `join_rows` only joins all letters of a row, so both blocks of Zeile 3 end up concatenated without a gap.

In [13]:
# Upright text: letters are joined from left to right.
join_rows(d1, rotated=False)

Unnamed: 0,text,font,size,x0,y0,x1,y1
0,Zeile 1 Textblock 1 Zeile 1 ...,BAAAAA+LiberationSans,18.0,64.205,682.413,456.497,700.413
1,Zeile 2 Textblock 1 Zeile 2 Textblock 2,BAAAAA+LiberationSans,18.0,71.405,563.613,378.323,581.613
2,Zeile 3 Textblock 1Zeile 3 Textblock 2,BAAAAA+LiberationSans,18.0,95.811,392.911,425.069,410.911


In [14]:
# Rotated text: letters are joined in order of increasing y0.
join_rows(x=d0, rotated=True)

Unnamed: 0,text,font,size,x0,y0,x1,y1
0,Rotated Text 1,BAAAAA+LiberationSans,10.008,524.211,508.904,542.211,624.502
1,Rotated Text 2,BAAAAA+LiberationSans,10.008,255.089,619.313,273.089,734.912


We see that, for rotated text, the font size reported by **pdfminer** seems to be misleading.

### Group Columns
Here, we don't assume a table structure and just write simple functions to assign the columns to blocks.

In [15]:
def align_text_blocks(x, max_char_dist=None, rotated=False):
    """Number the text blocks within every row.

    Within a row the letters are ordered along the reading axis. A new block
    starts whenever the gap between the end of one letter and the start of
    the next exceeds ``max_char_dist``.

    Parameters
    ----------
    x : pandas.DataFrame
        One record per letter with a ``rowid`` column and the bounding box
        columns of the reading axis (``x0``/``x1``, or ``y0``/``y1`` when
        ``rotated`` is True). The input is not modified.
    max_char_dist : float, optional
        Largest gap that still keeps two neighbouring letters in one block.
        Defaults to the largest letter extent along the reading axis in ``x``.
    rotated : bool, default False
        If True the reading axis is ``y`` (same convention as ``join_rows``),
        otherwise ``x``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with an integer ``block_id`` column; ids start at 0
        in every row.
    """
    start_col, end_col = ("y0", "y1") if rotated else ("x0", "x1")
    x = x.copy()
    if max_char_dist is None:
        max_char_dist = (x[end_col] - x[start_col]).max()

    start = x[start_col].to_numpy()
    end = x[end_col].to_numpy()
    block_id = np.full(len(x), -1, dtype=int)
    # Work with positions instead of index labels, so frames with duplicate
    # index labels are handled and no float round trip is needed.
    for positions in x.groupby('rowid').indices.values():
        order = positions[np.argsort(start[positions], kind="stable")]
        gaps = start[order][1:] - end[order][:-1]
        block_id[order] = np.concatenate(([0], np.cumsum(gaps > max_char_dist)))
    x['block_id'] = block_id
    return x


def join_text_blocks(x, rotated=False):
    """Merge the letters of every text block into one text record.

    Parameters
    ----------
    x : pandas.DataFrame
        One record per letter with the columns ``text``, ``font``, ``size``,
        ``x0``, ``y0``, ``x1``, ``y1``, ``rowid`` and ``block_id``.
    rotated : bool, default False
        If True the letters of a block are ordered by ``y0`` before joining,
        otherwise by ``x0``.

    Returns
    -------
    pandas.DataFrame
        One record per distinct ``(rowid, block_id)`` pair holding the joined
        text, the most frequent font and size, and the bounding box of the
        block. The columns ``text, font, size, x0, y0, x1, y1`` are present
        even when ``x`` has no rows.
    """
    columns = ['text', 'font', 'size', 'x0', 'y0', 'x1', 'y1']
    sort_key = "y0" if rotated else "x0"

    def join_text_block(d):
        # Reading order within a block follows the chosen coordinate.
        d = d.sort_values(sort_key)
        return {
            'text': ''.join(d['text']),
            # Most frequent value wins; ties go to the first one encountered.
            'font': Counter(d['font']).most_common(1)[0][0],
            'size': Counter(d['size']).most_common(1)[0][0],
            'x0': d['x0'].min(), 'y0': d['y0'].min(),
            'x1': d['x1'].max(), 'y1': d['y1'].max(),
        }

    x = x.astype({'rowid': int})
    records = [join_text_block(group)
               for _, group in x.groupby(['rowid', 'block_id'])
               if len(group) > 0]
    # Passing `columns` keeps the schema stable when there are no records.
    return pd.DataFrame(records, columns=columns)

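In Zeile 1 and Zeile 2, Textblock 1 and 2 are connected by spaces.
In Zeile 3, Textblock 1 and 2 are two separate blocks that are not connected by spaces. After `align_text_blocks` has numbered the blocks, `join_text_blocks` joins the letters per row and block, so Zeile 3 now yields two separate records.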

In [16]:
# Number the blocks on a copy of the upright letters, then join per block.
x = align_text_blocks(d1.copy())
join_text_blocks(x)

Unnamed: 0,text,font,size,x0,y0,x1,y1
0,Zeile 1 Textblock 1 Zeile 1 ...,BAAAAA+LiberationSans,18.0,64.205,682.413,456.497,700.413
1,Zeile 2 Textblock 1 Zeile 2 Textblock 2,BAAAAA+LiberationSans,18.0,71.405,563.613,378.323,581.613
2,Zeile 3 Textblock 1,BAAAAA+LiberationSans,18.0,95.811,392.911,244.275,410.911
3,Zeile 3 Textblock 2,BAAAAA+LiberationSans,18.0,276.605,392.911,425.069,410.911
