# Vistelius text digitalization

In [1]:
# Package used to work with OCR of text
from tika import parser

# Linear algebra package
import numpy as np

# Tabular data package
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype

# Operating system package
import os

# Parallelization modules and packages
from joblib import Parallel, delayed
import multiprocessing

# Regular expressions package
import re

## Reading pdf

In [2]:
# Path to the OCR'd PDF scan (only the path is stored here; the file is read when it is parsed)
pdf_text = "../_DATA/Vistelius_scans_text_OCR.pdf"

In [3]:
# Parse the PDF file with Tika; the extracted text is read from the result's 'content' key
pdf_parsed = parser.from_file(pdf_text)

2019-10-17 09:07:23,440 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to C:\Users\u0108248\AppData\Local\Temp\tika-server.jar.
2019-10-17 09:07:40,205 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar.md5 to C:\Users\u0108248\AppData\Local\Temp\tika-server.jar.md5.
2019-10-17 09:07:40,862 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-10-17 09:07:45,873 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [4]:
# Pull the extracted text out of the parsed PDF and break it into lines.
# Only pairs of newlines are collapsed, so empty strings can still remain in the list.
text_parsed = (
    pdf_parsed['content']
    .strip()
    .replace("\n\n", "\n")
    .split("\n")
)

In [5]:
# Keep only the lines that contain at least one character
text_parsed_noempty = [line for line in text_parsed if len(line) != 0]

In [6]:
%%capture
# Write the non-empty lines to a text file, one list item per line
with open("../text_parsed_noempty.txt", 'w') as out_file:
    out_file.writelines(f"{line}\n" for line in text_parsed_noempty)

___

## Data cleaning

### Split double lines

In [7]:
# Find the lines in which the coordinates pattern occurs at least twice,
# i.e. lines that contain two sampleID-coordinates pairs
regex_double = re.compile(r"[0\-\s]*[\(\{]+[0-9iIl!p&\s]{2,}°?[0-9]")
double_lines_indices = []

for index, line in enumerate(text_parsed_noempty):
    coordinate_hits = regex_double.findall(line)
    if len(coordinate_hits) < 2:
        continue
    print(index, line)
    double_lines_indices.append(index)

2000 (47°20' ,106°55'). Bt granite porphyraceous. Tr3-J1. Abdaryn m~ssif (V.I.Ushakov,1964). 0 0 + . (47 09'20",134 4A'l5"J .. , Granite -porphyry. K2. Det.:H2o -0.26 (A .G.Kandaurov,1974) . 
3486 2831 - (61°10 1 ,144°06'). Granite. K2. Vega massif. P.V.Artemenko. Det.:H2o+-0.52 (l.M.Speranskaya?l960). o' o 2832- {52 59' ,113 14'). Granosyenite. Tr {F.N.Lyudofun,1967). 


In [8]:
# Show the indices of the lines that hold two entries
double_lines_indices

[2000, 3486]

In [9]:
# Cut both double lines at the hard-coded character offset where the second entry starts
part1_2000, part2_2000 = text_parsed_noempty[2000][:95], text_parsed_noempty[2000][95:]
part1_3486, part2_3486 = text_parsed_noempty[3486][:112], text_parsed_noempty[3486][112:]

In [10]:
# Replace each double line by its two halves, working in place on the list.
# Deleting index 2000 first shifts every later element down by one, so the
# second double line (originally at index 3486) is removed at index 3485.
del text_parsed_noempty[2000]
del text_parsed_noempty[3485]
# Inserting part 2 before part 1 at the same index leaves them in the order (part 1, part 2)
text_parsed_noempty.insert(2000, part2_2000)
text_parsed_noempty.insert(2000, part1_2000)
# After two deletions and two insertions the element that originally followed
# the second double line sits at index 3487 again, so the halves are inserted there
text_parsed_noempty.insert(3487, part2_3486)
text_parsed_noempty.insert(3487, part1_3486)
# NOTE: this cell mutates the list in place and is not idempotent;
# running it a second time deletes and duplicates other lines.

In [11]:
# Inspect the lines around index 2000 after the split
text_parsed_noempty[1999:2002]

['(51°29\'30",110°10\'). Granite fine-grained. Tr. M.I.Kodachigova (V.A.Novikov,1966) . ',
 "(47°20' ,106°55'). Bt granite porphyraceous. Tr3-J1. Abdaryn m~ssif (V.I.Ushakov,1964). 0 0 + .",
 ' (47 09\'20",134 4A\'l5"J .. , Granite -porphyry. K2. Det.:H2o -0.26 (A .G.Kandaurov,1974) . ']

In [12]:
# Inspect the lines around index 3487 after the split
text_parsed_noempty[3485:3490]

['2829 - (46°22\'58",136°18\'10"). Granite. K2. Det.:H2o -0.85 (V.K.Matushkin?l974). ',
 '2830- (47°22\'25",139°00\'30"). Bt -Amf granodiorite. K2-Pg1. Oth.:S03-0.19 (V.A.Yarmolyuk?l956). ',
 "2831 - (61°10 1 ,144°06'). Granite. K2. Vega massif. P.V.Artemenko. Det.:H2o+-0.52 (l.M.Speranskaya?l960). o' o ",
 "2832- {52 59' ,113 14'). Granosyenite. Tr {F.N.Lyudofun,1967). ",
 "2833 - {52°16'20 11 ,117°36'30 11 ). Granite porphyraceous. J3. Sretensky massif. M.E.Kazakova. Oth.:Ba0-0.08? so3-0.20? "]

### Specific corrections

In [13]:
def correct_lines(lines):
    """Merge raw text lines into one string per sample entry.

    A line that starts with a sampleID (``regex``) opens a new entry; every
    other line is appended to the entry opened most recently. Lines that
    consist of a sampleID only (``regex2``) are additionally collected in
    ``missing_entries`` and completed with the lines that start with
    coordinates (``regex3``) plus the lines that follow those, up to the next
    line that starts with a sampleID. The completed strings finally replace
    the ID-only entries in the result.

    Parameters
    ----------
    lines : sequence of str
        Text lines; accessed by position.

    Returns
    -------
    list of str
        The merged entries in order of appearance.

    Raises
    ------
    IndexError
        If a non-sampleID line is met while no entry has been opened yet, or
        if a coordinate line is met while ``missing_entries`` has no element
        at position ``i``.
    AssertionError
        If the consistency check on the first four characters fails (see the
        note next to the ``assert``).
    """
    # Line starts with a sampleID followed by one or two '-' / '~' characters
    regex =  re.compile(r"^[\-\.·\s,:;\'0]*[0-9~iIl&TJ\)]+[L\s\.·]*[\-~]{1,2}")
    # Same start, but only one optional whitespace character may follow
    regex2 = re.compile(r"^[\-\.·\s,;:\'0]*[0-9~iIl&TJ\)]+[L\s\.·]*[\-~]{1,2}\s?$")
    # Line starts with an opening '(' or '{' followed by coordinate-like characters
    regex3 = re.compile(r"^[0\-\s]*[\(\{]+[0-9iIl!p&\s]{2,}°?[0-9]")

    # Merged entries, one string per line that starts with a sampleID
    actual_lines = []
    # Positions in actual_lines of the entries that consist of a sampleID only
    empty_entries_index = []
    # ID-only lines, extended below with the coordinate lines found later
    missing_entries = []
    # Copy of the ID-only lines, used by the assertion that checks for systematic errors
    hit0 = []
    # Position in missing_entries that the next coordinate line is appended to
    i = 0
    # True from a coordinate line until the next line that starts with a sampleID
    empty_entry_section = False

    for index, test in enumerate(lines):
        
        # Check if entry starts with sampleID and nothing else
        if len(regex2.findall(test)) > 0:
            missing_entries.append(lines[index])
            hit0.append(test)
            # Exception for sampleID 3061, where an empty section immediately follows another empty section.
            # Needs a rule instead of this exception.
            if test.startswith("'3061"):
                i -= 1
            if empty_entry_section:
                i += 1
            
        # Check if entry starts with sampleID and has more information behind it
        if len(regex.findall(test)) > 0:
            actual_lines.append(test)
            # NOTE: index_to_add is assigned but never read in this function
            index_to_add = index
            # A sampleID line ends any running section of coordinate lines
            empty_entry_section = False
            # If line in actual_lines only contains sampleID, keep its index for later use
            if len(regex2.findall(actual_lines[-1])) > 0:
                empty_entries_index.append(len(actual_lines) - 1)
        else:
            # Continuation line: glue it onto the most recent entry
            # (raises IndexError when no entry has been opened yet)
            actual_lines[-1] = actual_lines[-1] + lines[index]
            
        # Check if entry starts with coordinates
        if len(regex3.findall(test)) > 0:
            empty_entry_section = True
            missing_entries[i] = missing_entries[i] + lines[index]
            # Check for systematic error
            # NOTE(review): missing_entries[i] and hit0[i] are filled from the same
            # line, so this compares a string's prefix with itself; it can only fail
            # when the ID-only line is shorter than four characters.
            assert(missing_entries[i][:4] == hit0[i][:4])
            # Remember which entry the following non-coordinate lines belong to
            j = i
            
        if empty_entry_section:
            # Check again if entry starts with coordinates
            if len(regex3.findall(test)) > 0:
                # The next coordinate line belongs to the next ID-only entry
                i += 1
            else:
                # Text following a coordinate line belongs to the same entry
                missing_entries[j] = missing_entries[j] + lines[index]
    
    # Replace the former saved 'empty' lines with actual corresponding info
    # (paired by position; zip stops at the shorter of the two lists)
    for index, line in zip(empty_entries_index, missing_entries):
        actual_lines[index] = line

    return actual_lines

In [14]:
# Merge the raw lines into one string per entry
text_parsed_corr = correct_lines(text_parsed_noempty)

In [15]:
# Fix frequent misreadings of the minutes (') and seconds (") symbols:
# " 1 " -> ', " 11 " -> ", and two single quotes -> one double quote
text_parsed_corr_repl = [
    item.replace(" 1 ", "'").replace(" 11 ", "\"").replace("''", "\"")
    for item in text_parsed_corr
]

In [16]:
# Number of entries after the corrections
len(text_parsed_corr_repl)

4660

In [17]:
%%capture
# Write the corrected entries to a text file, one entry per line
with open("../text_parsed_corr_repl.txt", 'w') as out_file:
    out_file.writelines(f"{line}\n" for line in text_parsed_corr_repl)

In [18]:
# Line numbers in the saved file whose line still contains two entries.
# NOTE(review): hard-coded values, presumably found by inspecting the file by hand;
# the list is not used by any of the cells shown in this notebook.
missed_entries = [2990, 3656, 3876, 4070]

In [19]:
# List the entries that end with a hyphen; these tend to carry no actual information at their end
regex4 = re.compile(r"\-$")

for index, line in enumerate(text_parsed_corr_repl):
    if regex4.search(line) is not None:
        print(index, line)

459 460- (52°10'10",110°22'30"). Granite porphyraceous. Tr-J1 (V.I.Pelepyagin,1975). - --- - ------==-='--=------------ .,,,j, - ' -
479 480- (50°17' ,108°28'). Granite leucocratic. J 1. Otytey massif. N.P.Mel'nikova (V.V.Starchenko,1968). ----=---~-------=---=--- - ---- -- --- I• -
480 ~-·----------
500 500- (60°50',152°48'). Granite subalkaline . K2. Upper -Yam massif. Oth .:co2-0.10 (A .A.Dontsov , 1946). ----- • ~---=---==-- --- -
501 ~ -~- .. ---·~--
521 520- {45°33'50",135°25'). Granite. K2. Yamutinzin massif (N.K.Flyaga , 1964) . --- -=---=-====-==-:: .. ·~ ""-
532 531- (47°15',135°12'}. Bt granite medium-grained. K2• E.L.Trusov (N.F.Smirnov,1961}. -
562 ~---- -
563 ., - ~ -~--


___

## Regex matching

**Regular expression setup:**
- Group 01 - Sample ID
- Group 02 - Latitude degrees
- Group 03 - Latitude minutes
- Group 04 - Latitude seconds
- Group 05 - Longitude degrees
- Group 06 - Longitude minutes
- Group 07 - Longitude seconds
- Group 08 - Longitude direction (W)
- Group 09 - Rock name
- Group 10 - Whole-rock age
- Group 11 - Massif name (optional)
- Group 12 - Analyst name (optional)
- Group 13 - Interpretation of "oth." and "det." (optional)
- Group 14 - Author and year of the original report

### Test regex for sampleID and coordinates

In [20]:
# Pattern that captures the sampleID and the coordinates at the start of an entry.
# The character classes deliberately include letters and symbols that the OCR
# produces in place of digits. Groups, in order:
#   1 sampleID, 2-4 latitude degrees/minutes/seconds, 5-7 longitude degrees/minutes/seconds
regex_sampleID_coordinates = re.compile(
    r"^[\-\.·,:\s0\']*"
    r"([0-9GSO~iIlJT\)&\']+)"           # Group 1 - sampleID
    r"[\-\s\(\.\{L\'~0·]*"
    r"([0-9iIl!pS~o\-•Z\.,&]{2,})"      # Group 2 - latitude degrees
    r"[°\s90Qg]*"
    r"([0-9iIl!pS~roJ\()\.]{2,})"       # Group 3 - latitude minutes
    r"['\\ji\s1;]*"
    r"([0-9iIl!pS~norJf]{2,})?"         # Group 4 - latitude seconds (optional)
    r"[\"\'\s,~\.;\)]*"
    r"([0-9iIl!pS~of\s\$sa·t£]{1,})"    # Group 5 - longitude degrees
    r"[°\s90Qg]*"
    r"([0-9iIl!pS~\soJ\-Zj]{2,})"       # Group 6 - longitude minutes
    r"['\\\s]*"
    r"([0-9iIl!pS~o]{2,})?"             # Group 7 - longitude seconds (optional)
    r"[\"\)\}\.wW]*"
)

In [21]:
# Show the compiled pattern
regex_sampleID_coordinates

re.compile(r'^[\-\.·,:\s0\\']*([0-9GSO~iIlJT\)&\\']+)[\-\s\(\.\{L\\'~0·]*([0-9iIl!pS~o\-•Z\.,&]{2,})[°\s90Qg]*([0-9iIl!pS~roJ\()\.]{2,})[\'\\ji\s1;]*([0-9iIl!pS~norJf]{2,})?[\"\\'\s,~\.;\)]*([0-9iIl!pS~of\s\$sa·t£]{1,})[°\s90Qg]*([0-9iIl!pS~\soJ\-Zj]{2,})[\'\\\s]*([0-9iIl!pS~o]{2,})?[\"\)\}\.wW]*',
re.UNICODE)

In [22]:
# Count the entries matched by the sampleID/coordinates regex and print those that are not
counter = 0

for line in text_parsed_corr_repl:
    if regex_sampleID_coordinates.findall(line):
        counter += 1
    else:
        print(line)
counter

~-·----------
~---- -
., - ~ -~--


4657

___

### Getting sampleID and coordinates

In [24]:
# Read the checked text file into a list of lines (each line keeps its newline).
# The context manager closes the file even when reading fails, so the handle can
# no longer leak when a later cell is skipped or raises; no close() call is needed.
with open("../_TEMP/Text/text_parsed_corr_repl_checked.txt", "r") as text_parsed_corr_repl_checked:
    text_parsed_corr_repl_checked_list = list(text_parsed_corr_repl_checked)

In [27]:
# Preview a few of the checked lines
text_parsed_corr_repl_checked_list[40:55]

['41- (54°20\' ,117°57\'40"). Granite leucocratic alkaline . J3 (N.V.Kuzheleva,1959) . \n',
 "42- (61°17' ,149°23'). Bt granite (alaskite). K1. East -Butugychag massif (P.N .Sp i ridonov,1940). \n",
 "43- (67°50' ,178°50'W). Granite. K1. Iul'tin massif. Oth.:B2o3-0.0l, co2-0.08 (A . I .Kyshtymov i l959). \n",
 "44- (52°28' ,140°19'). Granite. K2-Pg. A.N .Geraskina. Det.:co2-0.14 (V.P.Polikanov , 1974). \n",
 "45- (50°12',112°39'). Granite leucocratic . J1_2. 0th . : so3-tr. (I.I.Kozyrevil968). \n",
 "46- (50°56' ,113°18'). Granite leucocratic. J2_3. V.Petrikovetz (A.I.Shevtsov,1967). \n",
 '47- (47°27\'48~,138°23\'24") . Granite. Pg1. Vayga massif (F.G.Fedchin,1975) . \n',
 "48- (60°27' ,l4p0 53'). Bt granite. K2. Vega massif (Yu.I.Korshikov,1966). \n",
 '49- (52°06\'50",115°58\') Granite leucocratic alkaline porphyraceous. J3. Zangan massif . D.M.Shuster (V.A.Ulanov,1961). \n',
 "50- (61°19' ,149°15'). Bt granite leucocratic fine -grained. K1. West-B~tugychag massif. K.A.Baklanova (M 

In [121]:
# OK Add exceptions for parentheses in Massif name
# Specify more precisely in regex how to handle "Det." and "Oth." keywords

In [42]:
# Substitutions applied to every line, in this order:
#   1-2. drop the stray dot characters '·' and '•'
#   3.   non-alphanumeric + whitespace + alphanumeric -> '|' + both characters (whitespace dropped)
#   4.   alphanumeric + whitespace + non-alphanumeric -> both characters with '|' in between
#   5.   undo the '|' markers around a parenthesised name directly followed by "massif"
# NOTE(review): the class [A-z] also covers the characters [ \ ] ^ _ ` that lie
# between 'Z' and 'a' in ASCII - confirm whether [A-Za-z] was intended.
substitutions = [
    (r"·", ""),
    (r"•", ""),
    (r"([^A-z0-9])(\s+)([A-z0-9])", r"|\1\3"),
    (r"([A-z0-9])(\s+)([^A-z0-9])", r"\1|\3"),
    (r"\|\(([A-z\s\-~,]+)\|\)(massif)", r" \1 \2"),
]

text_parsed_corr_repl_checked_list_regex_sub = []

for line in text_parsed_corr_repl_checked_list:
    for pattern, replacement in substitutions:
        line = re.sub(pattern, replacement, line)
    # Null characters are removed before the line is stored
    text_parsed_corr_repl_checked_list_regex_sub.append(line.replace("\x00", ""))

In [124]:
# Preview a few lines after the '|' substitutions
text_parsed_corr_repl_checked_list_regex_sub[1577:1580]

['1578- (44°29\'40",136°08\'30") |.Granite|(granophyre) |.Pg1_2|.Mutukhin Oprichn in massif|.N.M.Nikitina|(S|.A.Korenbaum,1973). \n',
 "1579- (50°02' ,112°35')|.Granite|-porphyry|.J3|.Kharalgin massif||.A|.I.Fedorova|(N.K.Dmitrochenko,1968). \n",
 "1580- (49°33' ,112°37')|.Two mica granite||.J2|.Khalzan mass if||.L.S.Voronova|(N|.K.Dmit rochenko|,1964) . \n"]

In [114]:
%%capture
# Write the regex-substituted lines to a text file; the lines are written as they are,
# no newline is added
with open("../_TEMP/Text/text_parsed_regex_sub.txt", 'w') as out_file:
    out_file.writelines(text_parsed_corr_repl_checked_list_regex_sub)

In [11]:
# Store the first match of every line: a tuple with the seven captured groups.
# Dropping the [0] stores the complete list of matches per line instead,
# which is what a check on the number of matches per line needs.
sampleID_coordinates = [
    regex_sampleID_coordinates.findall(line)[0]
    for line in text_parsed_corr_repl_checked_list
]

In [168]:
# Make sure that no line yields more than one sampleID/coordinates match.
# The matches are recomputed per line here: `sampleID_coordinates` stores only the
# first match of each line (a tuple of captured groups), so testing the length of
# its items would flag every single line.
for line in text_parsed_corr_repl_checked_list:
    matches = regex_sampleID_coordinates.findall(line)
    if len(matches) > 1:
        print(matches)

In [46]:
# One row per line, one column per captured regex group
df_sampleID_coordinates = pd.DataFrame(sampleID_coordinates)

In [50]:
def replace_misspells(a):
    """Return `a` with the letters 'l' and 'I' replaced by the digit '1'.

    `a` must provide the pandas `.str` accessor (e.g. a Series of strings).
    """
    return a.str.replace("l", "1").str.replace("I", "1")

In [52]:
# Replace the letters misread for the digit 1 in every column
df_sampleID_coordinates = df_sampleID_coordinates.apply(replace_misspells)

In [53]:
# Collect the index of every row that holds at least one non-empty value
# which cannot be converted to an integer
wrong_entries = []

for index, row in df_sampleID_coordinates.iterrows():
    for item in row:
        if item == "":
            # Empty string: value not captured by the regex, nothing to check
            continue
        try:
            int(item)
        except (ValueError, TypeError):
            # Only conversion failures are caught, so that e.g. a
            # KeyboardInterrupt is no longer swallowed silently
            wrong_entries.append(index)
            # One bad value is enough to flag the row
            break

# Number of flagged rows
counter = len(wrong_entries)
counter

96

In [56]:
# `wrong_entries` holds index labels (taken from iterrows), so the rows are
# selected by label with .loc instead of by position with .iloc
df_wrong_entries = df_sampleID_coordinates.loc[wrong_entries, :]

In [57]:
# Save the flagged rows to an Excel file so that they can be cleaned by hand
df_wrong_entries.to_excel("../_NEEDS_CLEANING/Vistelius_text_to_be_cleaned.xlsx")

**TODO:** Take 'W' (western) longitudes into account: store such longitudes as negative values.

___

### Old code snippets

In [None]:
^[\-\.·,:\s0\']*
([0-9GSO~iIlJT\)&\'\|]+)  # Group 1 - SampleID
[\-\s\(\.\{L\\'\'~0·\|]*
([0-9iIl!pS~o\-•Z\.,&\|]{2,})  # Group 2 - Lattitude degrees
[°\s90Qg\|]*
([0-9iIl!pS~roJ\()\.\|]{2,})  # Group 3 - Lattitude minutes
[\'\\ji\s1;\|]*
([0-9iIl!pS~norJf\|]{2,})?  # Group 4 - Lattitude seconds
[\"\\'\s,~\.;\)\|]*
([0-9iIl!pS~of\s\$sa·t£\|]{1,})  # Group 5 - Longitude degrees
[°\s90Qg\|]*
([0-9iIl!pS~\soJ\-Zj\|]{2,})  # Group 6 - Longitude minutes
[\'\\\s\|]*
([0-9iIl!pS~oq]{2,})?  # Group 7 - Longitude seconds
([Ww]?)?  # Group 8 - Longitude direction (W)
[\"\)\}\.,wWt\s\'j\|:]*
([A-z\s\-~\(\)·0\.\|\?\']{3,})  # Group 9 - Rock name
[\.\|,]{2,}
([A-z0-9\s\-~\']+)  # Group 10 - Whole rock age
[\.\|\s,]*
([A-z0-9\s\-\|\'\?]{2,})?  # Group 11 - Massif name (optional)
(?(11)[\.\|\s,]+|[\.\|\s,]*)
([A-z0-9\s\.,\'!\|]{4,})?  # Group 12 - Analyst name (optional)
(?(12)[\.\|\s,]+|[\.\|\s,]*)
([A-z0-9\s:\.,~\-+\|\?]+)?  # Group 13 - Interpretation of "oth." and "det." (optional)
(?(13)[\.\|\s,]+|[\.\|\s,]*)
[\.\|\s,\(\{]+
([A-z\.,0-9\s\|\'~\-!\?]+)  # Group 14 - Author and year of the original report
[\)\}\.\s]+