# Module Import

In [2]:

from pdf2image import convert_from_path
import pytesseract
import numpy as np
import pandas as pd
import os
from PIL import Image
from PIL import UnidentifiedImageError
from tqdm import tqdm
import re
from unidecode import unidecode

# Source PDF that is rendered to page images by extract_images().
pdf_path = './data/pdf/The Complete Star Wars Encyclopedia - Volume II.pdf'
# Folder the page images are written to and later read back from for OCR.
img_folder = './data/img/FullImg'
# File-name prefix handed to pdf2image for the rendered page images.
img_prefix_name = 'SW_II_'


# Image Extractor

In [3]:

def extract_images(pdf_path,first_page,last_page,output_folder,prefix_name):
    """Render pages ``first_page``..``last_page`` (inclusive) of a PDF to PNG files.

    Pages are converted one at a time, so ``tqdm`` reports per-page progress.
    The images are written by ``convert_from_path`` into ``output_folder``
    at 300 dpi; nothing is returned.

    Parameters
    ----------
    pdf_path : path of the PDF to render.
    first_page, last_page : page range handed to ``convert_from_path``.
    output_folder : folder receiving the PNG files (created if missing).
    prefix_name : value passed as ``output_file`` to ``convert_from_path``.
    """
    # The writer needs an existing target folder; create it up front instead
    # of failing on the first page.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for page_num in tqdm(range(first_page, last_page + 1)):
        # The returned image list is not needed: the files on disk are the result.
        convert_from_path(
            pdf_path,
            first_page=page_num,
            last_page=page_num,
            dpi=300,
            output_folder=output_folder,
            output_file=prefix_name,
            fmt="png",
        )
    

In [4]:
def validate_images(source_folder):
    """Return the paths in ``source_folder`` that PIL cannot open and verify.

    Every directory entry is joined with ``source_folder`` and checked with
    ``Image.verify()``; entries raising ``OSError`` or
    ``UnidentifiedImageError`` are collected, in listing order.
    """
    def _is_broken(path):
        # True when opening or verifying the file fails.
        try:
            with Image.open(path) as handle:
                handle.verify()
        except (OSError, UnidentifiedImageError):
            return True
        return False

    candidates = [os.path.join(source_folder, name) for name in os.listdir(source_folder)]
    return [path for path in tqdm(candidates) if _is_broken(path)]
            
    

# Image to Text Parser

In [5]:
def load_parse_images(source_folder):
    """Run Tesseract over every file in ``source_folder`` and stack the results.

    Files are processed in sorted name order. Each per-image frame returned by
    ``pytesseract.image_to_data`` gets two columns set before stacking:

    * ``file_name`` - the name of the source file;
    * ``page_num``  - the 1-based position of the file in the sorted listing.

    Returns
    -------
    pandas.DataFrame
        All per-image frames concatenated with a fresh 0..n-1 index; an empty
        frame when the folder contains no files.
    """
    image_files = sorted(os.listdir(source_folder))

    # Collect the per-page frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all previous rows each time.
    page_frames = []
    for page, img_file in enumerate(tqdm(image_files), start=1):
        # Extracts data from image and saves to data frame. The pixel data is
        # copied into an array inside the `with`, so the file handle is closed
        # before the (slow) OCR call.
        with Image.open(os.path.join(source_folder, img_file)) as image:
            image_arr = np.array(image)
        img_data = pytesseract.image_to_data(image_arr, output_type=pytesseract.Output.DATAFRAME)

        # Adds file name and a page number sequence
        img_data['file_name'] = img_file
        img_data['page_num'] = page
        page_frames.append(img_data)

    if not page_frames:
        return pd.DataFrame()

    return pd.concat(page_frames).reset_index(drop=True)
    
    

In [32]:
# NOTE(review): this preview has execution count 32 but sits above the cell
# that first assigns `ocr_data` (the read_csv under "Build Words Dataframe"),
# so it raises NameError on Restart & Run All - move it below that cell or
# remove it.
ocr_data[22:]

Unnamed: 0_level_0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,file_name
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
22,5,1,4,1,1,1,202,1783,111,25,83.714577,A-121,The Complete Star Wars Encyclopedia - Volume I...
23,5,1,4,1,1,2,336,1784,43,24,96.839432,An,The Complete Star Wars Encyclopedia - Volume I...
24,5,1,4,1,1,3,392,1782,156,34,96.458687,antiquated,The Complete Star Wars Encyclopedia - Volume I...
25,5,1,4,1,1,4,560,1784,99,32,96.654556,variety,The Complete Star Wars Encyclopedia - Volume I...
26,5,1,4,1,1,5,672,1782,30,26,93.057625,of,The Complete Star Wars Encyclopedia - Volume I...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550324,3,1155,33,1,0,0,2339,84,80,80,-1.000000,,The Complete Star Wars Encyclopedia - Volume I...
1550325,4,1155,33,1,1,0,2395,84,24,41,-1.000000,,The Complete Star Wars Encyclopedia - Volume I...
1550326,5,1155,33,1,1,1,2395,84,24,41,19.353905,4,The Complete Star Wars Encyclopedia - Volume I...
1550327,4,1155,33,1,2,0,2339,130,72,34,-1.000000,,The Complete Star Wars Encyclopedia - Volume I...


# Word Cleaning

In [6]:
def clean_ocr_data(df, column_bounds=(900, 1600)):
    """Normalise OCR tokens and put them into reading order.

    Steps, in order:

    1. Drop rows without ``text``.
    2. Transliterate each token with ``unidecode``, remove empty bracket pairs
       (``()``, ``( )``, ``[]``, ``[ ]``), strip it, and delete every character
       outside ``a-zA-Z0-9-(),.;'"``.
    3. Drop tokens that are only punctuation or empty after cleaning.
    4. Assign each OCR block to a layout column from the smallest ``left``
       value in the block, then sort by page, column, block, paragraph, line
       and word.
    5. Add ``seq_block_num`` (running block number over the whole frame) and
       renumber ``line_num`` so it restarts at 1 inside every block.

    Parameters
    ----------
    df : pandas.DataFrame
        Word-level OCR output. Must contain ``text``, ``page_num``,
        ``block_num``, ``par_num``, ``line_num``, ``word_num``, ``left``,
        ``level``, ``top``, ``width`` and ``file_name``.
    column_bounds : sequence of numbers, optional
        Ascending ``left`` thresholds separating the layout columns. A block
        whose minimum ``left`` is greater than ``k`` of the thresholds lands
        in column ``k + 1``. The default ``(900, 1600)`` reproduces the
        original three-column split.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the ``level``,
        ``par_num``, ``top``, ``width`` and ``file_name`` columns and with
        ``min_left``, ``column`` and ``seq_block_num`` added.
    """
    allowed_chars = re.compile(r'[^a-zA-Z0-9\-\(\),.;\'"]')
    noise_tokens = ['"','\'','--','.','--.',',',';','']
    reading_order = ['page_num','column','block_num','par_num','line_num','word_num']

    def _normalise(token):
        # Same sequence as before: transliterate, drop empty brackets, strip, filter.
        token = unidecode(token)
        for empty_pair in ('()', '( )', '[]', '[ ]'):
            token = token.replace(empty_pair, '')
        return allowed_chars.sub('', token.strip())

    # Explicit copies: assigning columns on the result of a filter otherwise
    # triggers pandas' SettingWithCopyWarning.
    df = df.dropna(subset=['text']).copy()
    df['text'] = df['text'].map(_normalise)
    df = df.loc[~df['text'].isin(noise_tokens)].copy()

    # Reorder lines
    df['min_left'] = df.groupby(['page_num','block_num'])['left'].transform('min')
    # Column number = 1 + number of thresholds the block lies strictly beyond.
    df['column'] = 1 + sum((df['min_left'] > bound).astype(int) for bound in column_bounds)
    df = df.sort_values(by=reading_order).reset_index(drop=True)

    # add sequence block number (ngroup numbers groups in sorted key order, so
    # no extra sort is needed here)
    df['seq_block_num'] = df.groupby(['page_num','column','block_num']).ngroup() + 1

    # reset line sequence: global line counter minus the first line of its block
    seq_line_num = df.groupby(['page_num','column','block_num','par_num','line_num']).ngroup() + 1
    first_line_in_block = seq_line_num.groupby([df['page_num'], df['seq_block_num']]).transform('min')
    df['line_num'] = seq_line_num - first_line_in_block + 1

    df = df.drop(columns=['level','par_num','top','width','file_name'])

    return df.reset_index(drop=True)

In [7]:


def add_features(df):
    """Add positional flags and block statistics to the word-level frame.

    The frame is modified in place and also returned. It must contain
    ``page_num``, ``seq_block_num``, ``line_num``, ``word_num`` and ``text``.

    Columns added (or overwritten when the function is called again):

    * ``max_line_word_num`` / ``min_line_word_num`` and the flags
      ``last_line_word`` / ``first_line_word``;
    * ``max_block_line_num`` / ``min_block_line_num`` and the flags
      ``last_block_line`` / ``first_block_line``;
    * ``last_block_word`` / ``first_block_word``;
    * ``max_page_block_num`` / ``min_page_block_num`` and the flags
      ``last_page_block`` / ``first_page_block``;
    * ``last_page_word``;
    * ``block_word_count`` - number of words in the block;
    * ``reference_term`` - True for every word of a block that contains the
      token ``See``.
    """
    def _flag_extremes(keys, value_col, max_col, last_col, min_col, first_col):
        # Per group: store max/min of `value_col` and flag the rows that hit them.
        df[max_col] = df.groupby(keys)[value_col].transform('max')
        df[last_col] = df[max_col] == df[value_col]
        df[min_col] = df.groupby(keys)[value_col].transform('min')
        df[first_col] = df[min_col] == df[value_col]

    # Last / first word of each line
    _flag_extremes(['page_num','seq_block_num','line_num'], 'word_num',
                   'max_line_word_num', 'last_line_word',
                   'min_line_word_num', 'first_line_word')

    # Last / first line of each block
    _flag_extremes(['page_num','seq_block_num'], 'line_num',
                   'max_block_line_num', 'last_block_line',
                   'min_block_line_num', 'first_block_line')

    # Boolean flags for the last / first word of a block
    df['last_block_word'] = df['last_block_line'] & df['last_line_word']
    df['first_block_word'] = df['first_block_line'] & df['first_line_word']

    # Last / first block of each page
    _flag_extremes(['page_num'], 'seq_block_num',
                   'max_page_block_num', 'last_page_block',
                   'min_page_block_num', 'first_page_block')

    # Boolean flag for the last word of a page
    df['last_page_word'] = df['last_page_block'] & df['last_block_line'] & df['last_line_word']

    # Block word count
    df['block_word_count'] = df.groupby(['page_num','seq_block_num'])['word_num'].transform('count')

    # Marks if a block is a reference term like "[term] See [another term]."
    # isin() does one hashed lookup instead of scanning the block array per row.
    see_blocks = df.loc[df['text'] == 'See', 'seq_block_num'].drop_duplicates().values
    df['reference_term'] = df['seq_block_num'].isin(see_blocks)

    return df


In [8]:
def drop_ocr_data(df):
    """Return the rows whose ``block_word_count`` is not two or fewer."""
    too_short = df['block_word_count'].le(2)
    return df.loc[~too_short]

In [9]:
def fix_break_words(df):
    """Rejoin words that were hyphenated across a line break inside a block.

    A row is treated as the first half of a split word when it is flagged
    ``last_line_word``, its ``text`` ends with ``-`` and it is not flagged
    ``last_block_word``. The trailing hyphen is removed, the text of the
    following row is appended, and the following row is dropped.

    Rows are expected in reading order with the flags produced by
    ``add_features``. Hyphens at the very end of a block are left untouched.

    The input frame is not modified; the result has a fresh 0..n-1 index.
    """
    # Work positionally on a re-indexed copy so "the next row" does not depend
    # on the caller's index labels being consecutive.
    out = df.reset_index(drop=True)
    words = out['text'].tolist()

    is_split = (
        out['last_line_word']
        & out['text'].str.endswith('-', na=False)
        & ~out['last_block_word']
    ).tolist()
    last_pos = len(words) - 1
    split_positions = [pos for pos, flag in enumerate(is_split) if flag and pos < last_pos]

    # Right to left, so a word split over several lines ("inter-", "galac-",
    # "tic") is rebuilt completely before its head picks up the tail.
    for pos in reversed(split_positions):
        words[pos] = words[pos][:-1] + words[pos + 1]

    out['text'] = words
    absorbed = [pos + 1 for pos in split_positions]
    return out.drop(index=absorbed).reset_index(drop=True)

# Build Text Blocks

In [10]:
def build_text_block(df):
    """Collapse the word-level frame into one row per text block.

    Words are grouped by page, column, block number, block word count and
    reference flag. Each group yields its first and last word, the words
    joined by single spaces, and the highest line number. ``first_letter`` is
    the upper-cased first character of ``first_word``; ``seq_block_num`` and
    ``block_word_count`` are renamed to ``block`` and ``word_count``.
    """
    block_keys = ['page_num','column','seq_block_num','block_word_count','reference_term']
    summaries = {
        'first_word': ('text', 'first'),
        'last_word': ('text', 'last'),
        'text': ('text', ' '.join),
        'lines_count': ('line_num', 'max'),
    }

    blocks = df.groupby(block_keys).agg(**summaries).reset_index()
    blocks['first_letter'] = blocks['first_word'].str[:1].str.upper()

    return blocks.rename(columns={'seq_block_num':'block','block_word_count':'word_count'})


In [11]:
def assign_alphabet(df):
    """Label every block with the encyclopedia letter section it belongs to.

    Each letter section is recognised by the ``first_word`` of its first
    entry, as OCR produced it. Rows whose ``first_word`` equals a marker get
    ``letter_change = True`` and, where the OCR spelling is known to be wrong,
    a corrected ``first_word``. Afterwards every row from a section's first
    marker row up to the row before the next section's first marker row gets
    that section's letter in ``alphabet``; section ``A`` starts at the first
    row and section ``Z`` runs to the last row.

    The frame is modified in place and also returned. Range filling uses
    index labels, so the frame is expected to carry a 0..n-1 index.

    Raises
    ------
    IndexError
        If the marker row of any section from ``B`` to ``Z`` is missing.
    """
    # (first_word as read by OCR, corrected first_word or None, section letter)
    # NOTE(review): only `first_word` is corrected; `text` and an already
    # derived `first_letter` keep the OCR spelling - confirm that is intended.
    section_markers = (
        ('A-10', None, 'A'),
        ('BI', None, 'B'),
        ('C2-R4', None, 'C'),
        ('D-127X', None, 'D'),
        ('11-17', None, 'E'),
        ('4-LOM', None, 'F'),
        ('G-003', None, 'G'),
        ('H-10', None, 'H'),
        ('12-AM3', 'I2-AM3', 'I'),
        ('jl', 'J1', 'J'),
        ('KI-IR', None, 'K'),
        ('L8-L9', None, 'L'),
        ('M-12', None, 'M'),
        ('N-0O9', 'N-09', 'N'),
        ('1Olst', '101st', 'O'),
        ('P-100', None, 'P'),
        ('Q-2', None, 'Q'),
        ('R-I', 'R-1', 'R'),
        ('2nd', None, 'S'),
        ('2-1B', None, 'T'),
        ('U2-Cl', 'U2-C1', 'U'),
        ('V-1', None, 'V'),
        ('w-165', None, 'W'),
        ('X0O-X', 'X0-X1', 'X'),
        ('Y-4', None, 'Y'),
        ('Z-18', None, 'Z'),
    )

    df['letter_change'] = False
    if 'alphabet' not in df.columns:
        # Create the column explicitly (object dtype) instead of relying on
        # .loc enlarging the frame during the first marker assignment.
        df['alphabet'] = None

    # Flag the first entry of every letter section.
    for ocr_word, corrected_word, letter in section_markers:
        is_marker = df['first_word'] == ocr_word
        df.loc[is_marker, 'letter_change'] = True
        df.loc[is_marker, 'alphabet'] = letter
        if corrected_word is not None:
            df.loc[is_marker, 'first_word'] = corrected_word

    def _section_start(letter):
        # Index label of the first row currently labelled with `letter`.
        hits = df.index[(df['alphabet'] == letter).to_numpy()]
        if len(hits) == 0:
            raise IndexError('no section marker found for letter {!r}'.format(letter))
        return hits[0]

    # Spread each letter from its marker row to the row before the next marker.
    letters = [letter for _, _, letter in section_markers]
    final = len(letters) - 1
    for position, letter in enumerate(letters):
        start = None if position == 0 else _section_start(letter)
        stop = None if position == final else _section_start(letters[position + 1]) - 1
        df.loc[start:stop, 'alphabet'] = letter

    return df


In [12]:
def clean_term_data(df):
    """Drop short blocks that do not look like encyclopedia entries.

    Removed are non-reference blocks of exactly three words whose text does
    not end with ``.``, and non-reference blocks of exactly four words whose
    text ends with neither ``.`` nor ``,``.
    """
    not_reference = ~df['reference_term']
    no_period = ~df['text'].str.endswith('.')
    no_comma = ~df['text'].str.endswith(',')

    drop_three = (df['word_count'] == 3) & not_reference & no_period
    drop_four = (df['word_count'] == 4) & not_reference & no_period & no_comma

    return df.loc[~(drop_three | drop_four)]

# Join Broken Blocks

In [13]:
def fix_broken_blocks(df):
    """Merge blocks that continue the previous block and flag term rows.

    Works on a re-indexed copy of ``df`` and walks the rows from last to
    first. A row whose text starts with a lower-case character is folded into
    the row before it when that row's text

    * ends with ``-``: the hyphen is removed and the texts are concatenated;
    * ends with neither ``-`` nor ``.``: the texts are joined with a space.

    Folded rows are removed; only ``text`` of the receiving row is updated,
    its other columns are left as they were.

    ``term`` is True for the first row and for every row whose ``alphabet``
    equals its ``first_letter``.

    Returns
    -------
    pandas.DataFrame
        The copy with the ``term`` column added. Index labels of the remaining
        rows are their positions before merging (gaps are not closed).
    """
    merged = df.reset_index(drop=True)

    # Plain Python lists avoid a per-row .loc write and an O(n) drop per merge.
    texts = merged['text'].tolist()
    is_term = [
        bool(section == initial)
        for section, initial in zip(merged['alphabet'].tolist(), merged['first_letter'].tolist())
    ]
    if is_term:
        is_term[0] = True
    keep = [True] * len(texts)

    ends_with_hyphen = re.compile(r'-$')
    ends_with_hyphen_or_period = re.compile(r'(-$)|(\.$)')

    # Bottom-up, so a block that already absorbed its successors is passed on
    # as a whole when it is itself a continuation.
    for cur in range(len(texts) - 1, 0, -1):
        prev = cur - 1
        if not texts[cur][0:1].islower():
            continue
        if ends_with_hyphen.search(texts[prev]):
            texts[prev] = texts[prev][:-1] + texts[cur]
        elif not ends_with_hyphen_or_period.search(texts[prev]):
            texts[prev] = texts[prev] + ' ' + texts[cur]
        else:
            # Previous block ends with a period: keep both blocks.
            continue
        keep[cur] = False

    merged['term'] = pd.Series(is_term, index=merged.index, dtype=bool)
    merged['text'] = texts
    return merged.loc[keep]

In [14]:
def delete_known_image_description(df, img_desc_path='./data/img_desc_to_remove.csv'):
    """Remove blocks whose text is listed in the image-description CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Block-level frame with a ``text`` column.
    img_desc_path : str, optional
        CSV file with a ``text`` column holding the exact block texts to
        remove. Defaults to the project's ``./data/img_desc_to_remove.csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``text`` is not listed in the CSV.
    """
    known_descriptions = pd.read_csv(img_desc_path)['text'].tolist()
    return df.loc[~df['text'].isin(known_descriptions)]

# Parse Data

## Read PDF Data

In [15]:
# The PDF -> image -> OCR stage is slow, so it is switched off by default and
# the cached CSV is used instead. Set to True to rebuild the cache.
RUN_OCR_EXTRACTION = False

if RUN_OCR_EXTRACTION:
    extract_images(pdf_path,first_page=9,last_page=22,output_folder=img_folder,prefix_name=img_prefix_name)

    # Stop before the expensive OCR pass if any rendered page is unreadable.
    invalid_images = validate_images(img_folder)
    if invalid_images:
        raise RuntimeError('Unreadable page images, re-run the extraction: {}'.format(invalid_images))

    ocr_data = load_parse_images(img_folder)
    # Name the index column 'idx': the cached file is read back with
    # index_col='idx', which fails when the index is written without a label.
    ocr_data.to_csv('./data/encyclopedia_data.csv', index_label='idx')


## Build Words Dataframe

In [16]:
ocr_data = pd.read_csv('./data/encyclopedia_data.csv',index_col='idx')

# Word-level pipeline. add_features runs a second time because the flags have
# to be recomputed after blocks were dropped and split words were rejoined.
clean_data = (
    clean_ocr_data(ocr_data)
    .pipe(add_features)
    .pipe(drop_ocr_data)
    .pipe(fix_break_words)
    .pipe(add_features)
)


## Build Text Data Frame

In [17]:
# Block-level pipeline: one row per text block, labelled and cleaned step by step.
encyclopedia = (
    build_text_block(clean_data)
    .pipe(assign_alphabet)
    .pipe(clean_term_data)
    .pipe(delete_known_image_description)
    .pipe(fix_broken_blocks)
    .reset_index(drop=True)
)




In [18]:
# Preview the first rows of the assembled block table.
encyclopedia.head()

Unnamed: 0,page_num,column,block,word_count,reference_term,first_word,last_word,text,lines_count,first_letter,letter_change,alphabet,term
0,1,1,2,5,True,A-10,Interceptor.,A-10 See A-9 Vigilance Interceptor.,1,A,True,A,True
1,1,1,3,27,False,A-121,trips.,A-121 An antiquated variety of astrogation plo...,4,A,False,A,True
2,1,1,4,14,False,Al4,work.,Al4 A one-being repulsor disk manufactured by ...,3,A,False,A,True
3,1,1,5,7,True,A-17,Able-1707.,A-17 See Alpha (A-17). A-1707 See Able-1707.,2,A,False,A,True
4,1,1,6,18,False,A-175,Orbit.,A-175 An Action IV transport captained by Joh ...,3,A,False,A,True


## Generate File

In [None]:

# Write the assembled block table to disk.
encyclopedia.to_csv('./data/encyclopedia_terms.csv')