# Vistelius (1995) pdf table extraction

- Use tabula-py (or camelot) for tabular data extraction --> oxide data tables
- Use tika for text extraction --> additional information pages

In [1]:
# Python table extraction library
import camelot
# Python wrapper of Java tabula library
import tabula
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

## Workflow for tabular data
- Read in OCR'ed pdf file
- Subsets and operations:
    * index (sample lables)
        - strip dash
        - assert that there are no duplicates
        - assert that numbers are continuous
    * main data (oxides, loss on ignition (l.i.) and others)
        - Check data type of each column --> if 'float' then do not perform any of the following
        - strip any preceding or postceding character from numbers
        - remove spaces
        - replace common mistakes in OCR (e.g. 'l' for 1)
    * sum (sum of oxides, loss on ignition and others)
        - Perform same operations as for main data (if dtype != 'float')
    * hygroscopic water content (hs)
        - Perform same operations as for main data (if dtype != 'float')
- Assert that sum of 'main data' == 'sum'

In [2]:
col_headers = ["SiO2", "TiO2", "Fe2O3", "FeO", "MnO", 
               "MgO", "CaO", "Na2O", "K2O", "P2O5", 
               "l.i.", "oth", "sum", "hs"]

In [3]:
pdf = "../_DATA/Scan 2017-10-19 11.52.01.pdf"
pdf2 = "../_DATA/Scan 2019-03-11 13.38.11.pdf"

### Tabula-py

In [4]:
df_tabula = tabula.read_pdf(pdf, pandas_options={'header': None})

In [5]:
df_tabula

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,261-,76.18,0.09,12.3,0.40,1.65,0.03,0.12,0.11,4.05,4.3,0.03,0.34,,99.6,
1,262-,76.18,0.16,10.53,0.)2,2.73,0.03,0.23,0.35,3.87,5.25,0.09,0.27,,99.81,
2,263-,76.18,0.09,12.79,0.96,1. 73,0.01,0.23,0.6,3.55,3.06,,0.32,0.26,99.78,OA3
3,264-,76.17,0.15,12.39,0.89,0.49,0.1,0.19,0.81,2.79,4.97,,0.48,,99.43,
4,265-,76.17,0.15,11.64,2.27,0.94,0.12,0.21,0.91,3.02,2.89,0.11,0.9,,99.33,0.22
5,266-,76.16,0.04,13.33,0.27,l.11,0.03,0.06,0.65,3.34,5.04,0.01,0.12,,100.16,0.18
6,267-,76.16,0.07,13.41,0.43,1.34,0.03,0.16,0.48,3.31,4.55,0.03,0.34,0.11,100.42,0.19
7,268-,76.16,0.37,12.32,0.77,0.39,0.04,0.56,0.65,4.53,3.55,0.06,0.57,,99.97,0.29
8,269-,.76.15,0.15,12.02,0.66,1.56,0.01,0.12,0.81,2.75,5.58,0.03,0.63,,100.47,0.14
9,270-,76.15,0.05,12.43,1.03,1.00,,0 .12,0.52,2.33,5.16,0.03,0.8,0.36,99.98,0.48


In [126]:
data_tabula = df_tabula.iloc[:, 1:14]

In [127]:
sum_tabula = df_tabula.iloc[:, 14]

In [169]:
hs_tabula = df_tabula.iloc[:, 15]

In [129]:
data_tabula

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,76.18,0.09,12.3,0.40,1.65,0.03,0.12,0.11,4.05,4.3,0.03,0.34,
1,76.18,0.16,10.53,0.)2,2.73,0.03,0.23,0.35,3.87,5.25,0.09,0.27,
2,76.18,0.09,12.79,0.96,1. 73,0.01,0.23,0.6,3.55,3.06,,0.32,0.26
3,76.17,0.15,12.39,0.89,0.49,0.1,0.19,0.81,2.79,4.97,,0.48,
4,76.17,0.15,11.64,2.27,0.94,0.12,0.21,0.91,3.02,2.89,0.11,0.9,
5,76.16,0.04,13.33,0.27,l.11,0.03,0.06,0.65,3.34,5.04,0.01,0.12,
6,76.16,0.07,13.41,0.43,1.34,0.03,0.16,0.48,3.31,4.55,0.03,0.34,0.11
7,76.16,0.37,12.32,0.77,0.39,0.04,0.56,0.65,4.53,3.55,0.06,0.57,
8,.76.15,0.15,12.02,0.66,1.56,0.01,0.12,0.81,2.75,5.58,0.03,0.63,
9,76.15,0.05,12.43,1.03,1.00,,0 .12,0.52,2.33,5.16,0.03,0.8,0.36


In [150]:
# Get indices of columns that do not contain numeric (float) data
non_numeric_cols = []

for col_index in range(1, data_tabula.shape[1] + 1):
    if not is_numeric_dtype(data_tabula[col_index]):
        non_numeric_cols.append(col_index) 
non_numeric_cols

In [172]:
# Dictionary to use during replace operations 
# to fix common mistakes in OCR for numbers
replacements = {")": "1", 
                "l": "1",
                "A": "4",
                " ": ""}

In [135]:
# Fix common OCR mistakes for numbers
for key, value in replacements.items():
    data_tabula[4].str.replace(key, value)

0     0.40
1     0.12
2     0.96
3     0.89
4     2.27
5     0.27
6     0.43
7     0.77
8     0.66
9     1.03
10    0.80
11    0.07
12    0.93
13    0.50
14    0.64
15    0.93
16    2.20
17    0.42
18    0.25
19    0.30
Name: 4, dtype: object

0     0.40
1     0.)2
2     0.96
3     0.89
4     2.27
5     0.27
6     0.43
7     0.77
8     0.66
9     1.03
10    0.80
11    0.07
12    0.93
13    0.50
14    0.64
15    0.93
16    2.20
17    0.42
18    0.25
19    0.30
Name: 4, dtype: object

In [103]:
#  Strip specific characters
for col_index in range(1, data_tabula.shape[1] + 1):
    try:
        data_tabula[col_index] = data_tabula[col_index].str.lstrip(".")
    except:
        pass

In [105]:
# After all replace and strip operations have been performed convert the data to 'float'
data_tabula = data_tabula.astype('float')

In [115]:
# Check that sum of data equals the sum reported in the tables
assert all(np.isclose(data_tabula.sum(axis=1), sum_tabula))

___

### Camelot

In [165]:
tables = camelot.read_pdf(pdf2, flavor='stream', row_tol=50, split_text=True)

In [7]:
tables

<TableList n=1>

In [160]:
tables[0]

<Table shape=(42, 16)>

In [161]:
print(tables[0].parsing_report)

{'accuracy': 99.91, 'whitespace': 55.51, 'order': 1, 'page': 1}


In [168]:
tables[0].df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,5102.0,‚1102,A1203,Fezo3,FeO,MnO,MgO,CaO,NaZO,K\nO,P\nO\n2,,0th,"SUI""",hs
1,101-,76.86,0.04,12.36,0.61,0.82,0.05,0.14,0.44,3.48,4.55,,0.46,,99.81,
2,102-,76.86,0.11,11.03,1.53,1.15,0.02,0.62,1.50,4.80,0.93,0.02,0.99,0.23,99.79,0.10
3,103-,76.85,0.06,12.23,0.01,0.87,0.01,0.05,0.18,3.58,5.21,0.30,0.65,,100.00,
4,104-,76.84,0.09,11.56,0.19,0.90,0.03,0.25,0.90,3.00,5.00,0.07,0.68,,99.51,
5,105-,76.84,0.10,12.54,0.60,0.18,0.01,0.50,0.65,2.75,5.29,0.02,0.73,,100.21,
6,106-,76.83,0.13,14.07,0.21,0.20,0.01,0.05,1.05,3.18,3.72,0.05,0.32,,99.82,
7,107-,76.83,0.18,13.60,0.47,1.52,0.02,0.10,0.35,2.47,4.24,,0.44,0.11,100.33,0.23
8,108-,76.83,,13.70,0.40,0.61,,0.22,1.36,2.80,4.11,,0.37,,100.40,0.07
9,109-,76.82,0.09,12.73,1.36,0.56,0.07,0.05,0.79,3.45,3.98,,0.19,,100.09,0.08


In [22]:
data = tables[0].df.iloc[1:, 1:14].reset_index(drop=True)

In [23]:
data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,76.18,0.09,12.3,0.40,1.65,0.03,0.12,0.11,4.05,4.3,0.03,0.34,99.60
1,76.18,0.16,10.53,0.)2,2.73,0.03,0.23,0.35,3.87,5.25,0.09,0.27,99.81
2,76.18,0.09,12.79,0.96,1. 73,0.01,0.23,0.6,3.55,3.06,,0.32,0.26 \n99.78
3,76.17,0.15,12.39,0.89,0.49,0.1,0.19,0.81,2.79,4.97,,0.48,99.43
4,76.17,0.15,11.64,2.27,0.94,0.12,0.21,0.91,3.02,2.89,0.11,0.9,99.33
5,76.16,0.04,13.33,0.27,l.11,0.03,0.06,0.65,3.34,5.04,0.01,0.12,100.16
6,76.16,0.07,13.41,0.43,1.34,0.03,0.16,0.48,3.31,4.55,0.03,0.34,0.11 100.42
7,76.16,0.37,12.32,0.77,0.39,0.04,0.56,0.65,4.53,3.55,0.06,0.57,99.97
8,. 76.15,0.15,12.02,0.66,1.56,0.01,0.12,0.81,2.75,5.58,0.03,0.63,100.47
9,76.15,0.05,12.43,1.03,1.00,,0 .12,0.52,2.33,5.16,0.03,0.8,99.98 \n0.36


___

### Tika

In [192]:
pdf_text = "../_DATA/Scan 2019-03-12 10.07.14.pdf"
pdf_text2 = "../_DATA/Scan 2017-10-19 12.13.51.pdf"

In [174]:
from tika import parser

In [193]:
pdf_parsed = parser.from_file(pdf_text)

In [194]:
pdf_parsed2 = parser.from_file(pdf_text2)

In [191]:
print(pdf_parsed["content"].strip().replace("\n\n", "\n"))

101- (48°13’,138°41’). Granite leucocratic. Pg]. M.M.Stukalova (A.N.Pagol’skaya,l958)-
102- (63°35’,175°24’). Granite. K. G.F.M1khaylova. 0th.:Cr203-0.01, coz-o 22
(S.A.Pa1andzhyan‚1976).
103- (53°06’30",112°52’40"). Granite. Tr. V.P.Usacheva (L.F.Dekhtereva,l964).
104- (52°37',118°29’). Granite alaskitic. J2. D.M.Shuster (A.I.Shadr1n,1962).
105- (43°3a'45",135°12'47"). Granite leucocratic. KZ-Pgl. Saken massif (I.I.Antushevich,l966).
106- (52°37’50",117°36'30"). Granite leucocratic. Tr. S.T.Ba1yuk (P.0.Be11k‚194s).
107- (69°58’,171°32’). Two mica granite. K1. K.A.Baklanova. 0th.:c02-0.05, so3-o.03, BaO-0.03
(A.V.Andrianov,1940).
108- (66°11’,129°l4’). Granite medium-fine-grained. Kl. Khoboyotuu-Echiy massif
(N.A.Tseyd1er,1969).
109- (61°16’,149°ll’). Granite medium-graíned. Kl. Hest-Butugychag massif. K.A.Bak1anova
(A.F.M1khaylov,1948).
110- (46°29'30",138°13'56"). Granite. K2. 0et.:H20*-0.05 (G.A.Amel’chenko‚1976).
111- (66°24',174°53'w). Granite-porphyry. K1. Kenga massif. L.A.F1nog

In [195]:
print(pdf_parsed2["content"].strip().replace("\n\n", "\n"))

261- (53°54'50'',115°33'). Granite leucocratic alkaline . ·rr -J . Kontalakan -Marektin massif. N.P. Mel'nikova 
(V.S.Ivanov,1968). 
262- (46°37' ,104°44'). Granophire alkaline. Tr -J. Bayan -Ulan massif (V.I.Kovalenko , 1971). 
263- (61°22', 151°50 '). Granite -porphyry micropegmatitic . K1. Dneprovsky massif . V. I . Noskov . Oth .: C02-0. 26 
(R.P.Petrov,1945). 
264- (46°55' ,105°46'50"). Granite medium-grained. J3. Det .: S-0. 01 (U.A.Korchagin , 1967). 
265- (61°08',151°04'). Granite alaskitic. K2. Z.A.Lipnyagova (A . P.Osipov,1964) . 
266- (69°44 ' ,171°55'). Bt granite fine-grained. K2. Northern massif. (E.P.Fedorov ,1 952). 
267- (49°31 ' ,110°06 ' ). Two mica granite. J2. B.M.Frenkel' . Oth .: Sno2-0.10, S-0. 01 (I . F.Grigor ' ev,1957) . 
268- (50°39 ' 20",104°34'). Granite -porphyry. J2 (Ts . B.Tarchimaev , 1963). 
269- (45°44 ' 30'' , 134°46'36"). Granite porphyraceous. K2. Dintsukhin massif (V.I .Safronov , 1966) . 
270 - (44°52 ' 07",136°09'50"). Granite alaskitic. K2-Pg1