## This notebook attempts to align the ground-truth text with the raw OCR output

In [1]:
import os
import subprocess
import pandas as pd
import csv
import camel_tools.utils.charsets
import pyarabic.araby as araby
import pyarabic.number as number
import difflib
from PyPDF2 import PdfFileWriter, PdfFileReader
import PyPDF2

In [2]:
# Punctuation characters whose code points fall in the range U+0600-U+06FF (1536-1791).
arabic_punctuation = [
    mark
    for mark in camel_tools.utils.charsets.UNICODE_PUNCT_CHARSET
    if 0x0600 <= ord(mark) <= 0x06FF
]

In [3]:
# Move one directory up; the relative "data/..." and "code/..." paths used later are
# presumably meant to resolve from there.
# NOTE(review): not idempotent - re-running this cell climbs one more level each time.
os.chdir("..")

In [4]:
# Display the current working directory to check where the relative paths will resolve.
os.getcwd()

'/Users/anasjawed/Documents/aocr/arafix_ocr'

In [5]:
# Book being processed and the locations derived from its name.
book_name = "cornell_aco000143"
book_format = ".pdf"
book_path = f"data/{book_name}"
ground_truth_work = f"{book_path}/{book_name}_ground_truth_work/"

# Page offset added to the OCR start/end pages of every range; it has to be set per book.
offset = -6

In [6]:
# Raw OCR output: one text file per page, named <ocr_prefix><page number><ocr_suffix>.
ocr_path = f"{book_path}/{book_name}_raw_ocr/"
ocr_prefix = "ocr_space_output_"
ocr_suffix = ".txt"

# Per-page ground-truth text files are read from the ground-truth working folder.
truth_path = ground_truth_work
truth_prefix = f"{book_name}_ground_truth_"

# Folder for the concatenated single-line files and the alignment results.
realignment_path = f"{book_path}/{book_name}_realignment/"

## Split Ground Truth PDF

In [7]:
# Create the working folder for the split ground-truth pages. exist_ok=True keeps the cell
# re-runnable: plain os.mkdir raises FileExistsError once the folder exists.
os.makedirs(ground_truth_work, exist_ok=True)

In [8]:
# Split the ground-truth PDF into one single-page PDF per page, numbered from 1.
# The source file is opened in a `with` block so its handle is closed after the loop
# (it was previously opened inline and never closed). It stays open for the whole loop
# because the pages are copied out of the reader while the outputs are written.
ground_truth_pdf = book_path + "/" + book_name + "_ground_truth" + book_format
with open(ground_truth_pdf, "rb") as pdf_stream:
    inputpdf = PdfFileReader(pdf_stream)

    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        with open(ground_truth_work + book_name + "_ground_truth" + "_%s.pdf" % (i+1), "wb") as outputStream:
            output.write(outputStream)

## Make Ground Truth Text Files
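Use Adobe Acrobat DC to convert each single-page PDF produced above into a UTF-8 `.txt` file. Save each file in the same working folder under the same base name (e.g. `<book_name>_ground_truth_9.txt`); the compiling step below reads the pages from there.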

## Make ground truth text files (deprecated)

In [14]:
from tika import parser

# NOTE(review): `ground_truth_path` is not defined anywhere in the visible notebook, so this
# deprecated cell fails on a fresh kernel - presumably `ground_truth_work` was meant; confirm.
# Collect every directory entry whose name contains ".pdf".
pdf_files = [entry for entry in os.listdir(ground_truth_path) if ".pdf" in entry]
pdf_files

In [291]:
## Extract all text from each ground-truth pdf (deprecated cell)
for p in pdf_files:
    ground_truth = parser.from_file(ground_truth_path + p)
    text = ""

    try:
        # Reverse the word order of every non-blank line, then join all lines with spaces.
        text = " ".join([" ".join(c.split(" ")[::-1]) for c in ground_truth["content"].split("\n") if c.strip() != ""])

        # Keep only spaces and non-punctuation characters in the range 1536-1791.
        text = "".join([c for c in text if (c not in arabic_punctuation and 1536 <= ord(c) <= 1791) or c == " " ])

    except Exception as err:
        # Still best-effort (the loop moves on to the next file), but the failure is now
        # reported instead of silently swallowed, and KeyboardInterrupt is no longer caught.
        print("could not extract text from", p, "-", repr(err))

    # Writing the extracted text to disk is currently disabled:
#     text_file_name = p.replace(".pdf", ".txt")
#     text_file = open(ground_truth_path + text_file_name, "w", encoding = "utf8")
#     text_file.write(text)
#     text_file.close()

## Alignment Functions

In [7]:
def parseOssamaBasic(file_name):
    """Load an alignment file into a DataFrame.

    The file is read as header-less, tab-separated text with quoting disabled, and must
    contain exactly four columns, which are named ``one``, ``op``, ``two`` and ``extra``.

    Returns a DataFrame with the columns ``operation`` (the label ``operationName``
    assigns to each row), ``one`` and ``two``.
    """
    alignment = pd.read_csv(
        file_name,
        sep="\t",
        header=None,
        engine="python",
        quoting=csv.QUOTE_NONE,
    )
    alignment.columns = ["one", "op", "two", "extra"]
    alignment["operation"] = alignment.apply(operationName, axis=1)
    wanted_columns = ["operation", "one", "two"]
    return alignment[wanted_columns]

def operationName(row):
    """Translate the alignment symbol in ``row["op"]`` into an operation label.

    ``"="`` gives ``"OK"``, ``"|"`` gives ``"SUB"``, ``"<"`` gives ``"INS"``; every other
    value gives ``"DEL"``.
    """
    labels = {"=": "OK", "|": "SUB", "<": "INS"}
    return labels.get(row["op"], "DEL")

In [8]:
# Basic alignment - Ossama's code
def alignFilesBasic(OneEncodePrefix, OneEncodeFolder, TwoEncodePrefix, TwoEncodeFolder, saveAlignmentAs, alignerLocation, results_prefix):
    """Run the external aligner script in ``basic`` mode on two text files.

    Parameters
    ----------
    OneEncodePrefix, TwoEncodePrefix : str
        File names without the ``.txt`` extension; passed as ``-s`` and ``-t``.
    OneEncodeFolder, TwoEncodeFolder : str
        Prefixes prepended verbatim to the two file names (may be ``""``).
    saveAlignmentAs, results_prefix : str
        Concatenated to form the value of the ``-o`` option.
    alignerLocation : str
        Path of the aligner script, which is run with ``python3``.

    Returns
    -------
    tuple
        ``(exit_status, output)`` as returned by ``subprocess.getstatusoutput``.
        (Nothing was returned before, so existing callers are unaffected.)
    """
    import shlex  # local import so the function does not depend on another cell

    OneName = OneEncodePrefix + ".txt"
    TwoName = TwoEncodePrefix + ".txt"

    # Every path is shell-quoted so that spaces or shell metacharacters in a folder or
    # book name cannot split or alter the command. Paths made only of letters, digits
    # and "_./-" are left untouched by shlex.quote, so the printed command is unchanged
    # for them.
    command = " ".join([
        "python3", shlex.quote(alignerLocation),
        "-s", shlex.quote(OneEncodeFolder + OneName),
        "-t", shlex.quote(TwoEncodeFolder + TwoName),
        "-m", "basic",
        "-o", shlex.quote(saveAlignmentAs + results_prefix),
    ])

    print(command)
    p = subprocess.getstatusoutput(command)
    print(p)

    if p[0] != 0:
        # Make a failed run stand out; the caller decides whether to stop.
        print("WARNING: aligner exited with status", p[0])

    return p

## Compiling Part
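Specify the OCR and ground-truth page ranges of each segment (read from `<book_name>_ranges.csv`), then concatenate the pages of each range into a single-line text file.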

In [9]:
# One row per segment: OCR start/end page and ground-truth start/end page.
ranges = pd.read_csv(f"{book_path}/{book_name}_ranges.csv")
ranges

Unnamed: 0,Ocr start page,Ocr end page,Truth start page,Truth end page,Num pages,Unnamed: 5,Unnamed: 6
0,13,31,9,24,,,
1,32,50,25,38,,,
2,51,72,39,54,,,
3,73,90,55,68,,,
4,91,113,69,86,,,
5,114,133,87,102,,,
6,134,156,103,120,,,
7,157,173,121,132,,,
8,174,194,133,147,,,


In [271]:
# os.mkdir(book_path + "/" + book_name + "_realignment/")

In [17]:
# os.mkdir(realignment_path)

In [None]:
def _keep_arabic_text(raw_text):
    """Turn newlines into spaces, then keep only spaces and non-punctuation characters in the range 1536-1791."""
    flattened = raw_text.replace("\n", " ")
    return "".join([c for c in flattened if (c not in arabic_punctuation and 1536 <= ord(c) <= 1791) or c == " "])


def _pages_to_line(folder, prefix, suffix, first_page, last_page, page_separator):
    """Concatenate the cleaned text of pages first_page..last_page (inclusive), each followed by page_separator."""
    line = ""
    for page_number in range(first_page, last_page + 1):
        # `with` closes every page file; the handles used to be left open.
        with open(folder + prefix + str(page_number) + suffix, "r", encoding="utf8") as page_file:
            line += _keep_arabic_text(page_file.read()) + page_separator
    return line


# For every segment listed in `ranges`:
#   1. concatenate its OCR pages and its ground-truth pages into two single-line files,
#   2. align the two files with the external aligner,
#   3. cut the aligned ground truth back into one file per OCR page.
for segment in range(len(ranges)):
    row = ranges.iloc[segment]

    ocr_start = int(row["Ocr start page"]) + offset
    ocr_end = int(row["Ocr end page"]) + offset  # INCLUSIVE

    truth_start = int(row["Truth start page"])
    truth_end = int(row["Truth end page"])  # INCLUSIVE

    print("\nsegment: ", segment, "ocr_start: ", ocr_start, "ocr_end: ", ocr_end, "truth_start: ", truth_start, "truth_end: ", truth_end)

    ocr_file_name = book_name + "_s" + str(segment) + "_raw_ocr_" + str(ocr_start) + "-" + str(ocr_end) + ".txt"
    print(ocr_file_name)

    truth_file_name = book_name + "_s" + str(segment) + "_ground_truth_" + str(truth_start) + "-" + str(truth_end) + ".txt"
    print(truth_file_name)

    # First do ocr: every page is closed with an " <endpage> " marker so the page breaks
    # can be found again after alignment. Page numbers in the file names are not zero-padded.
    ocr_line = _pages_to_line(ocr_path, ocr_prefix, ocr_suffix, ocr_start, ocr_end, " <endpage> ")

    # write ocr line to the realignment folder
    with open(realignment_path + ocr_file_name, "w", encoding="utf8") as ocr_file:
        ocr_file.write(ocr_line)
    print("ocr file written: ", ocr_file_name)

    # Now do ground truth: pages are simply separated by a space.
    truth_line = _pages_to_line(truth_path, truth_prefix, ".txt", truth_start, truth_end, " ")

    # Write the truth_line
    with open(realignment_path + truth_file_name, "w", encoding="utf8") as truth_file:
        truth_file.write(truth_line)
    print("truth file written: ", truth_file_name)

    # alignment part: ground truth is the first file, raw OCR the second. The folder
    # arguments are "" because both prefixes already contain the realignment path.
    ground_truth_file_prefix = realignment_path + truth_file_name.replace(".txt", "")
    raw_file_prefix = realignment_path + ocr_file_name.replace(".txt", "")
    save_alignment_as = realignment_path
    results_prefix = book_name + "_s" + str(segment) + "_ocr_" + str(ocr_start) + "-" + str(ocr_end) + "_truth_" + str(truth_start) + "-" + str(truth_end)
    alignerLocation = "code/ced_word_alignment/align_text.py"
    print("alignment started: ", results_prefix)
    alignFilesBasic(ground_truth_file_prefix, "", raw_file_prefix, "", save_alignment_as, alignerLocation, results_prefix)
    print("alignment done")

    # Make pages
    df = parseOssamaBasic(save_alignment_as + results_prefix + ".basic")
    page = ""

    # These values used to be parsed back out of results_prefix; they are exactly the
    # numbers the prefix was built from, so they are used directly.
    page_num = ocr_start
    segment_num = str(segment)

    print(page_num, segment_num)

    segment_path = book_path + "/" + book_name + "_segment_" + str(segment_num) + "/"
    # exist_ok keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
    os.makedirs(segment_path, exist_ok=True)
    realigned_prefix = "ground_truth_"

    # Column "one" holds the ground-truth token and column "two" the OCR token of each
    # aligned row; an "<endpage>" token in "two" closes the current page.
    # Ground-truth tokens that follow the last "<endpage>" row are not written to any
    # file (unchanged behaviour).
    for one, two in zip(df["one"], df["two"]):
        if two == "<endpage>":
            if pd.notna(one):
                # A ground-truth token aligned against the marker itself used to be
                # dropped; keep it on the page that is being closed.
                page += str(one) + " "

            with open(segment_path + realigned_prefix + str(page_num) + ".txt", "w", encoding="utf8") as to_write:
                to_write.write(page)
            page_num += 1
            page = ""

        elif pd.notna(one):
            page += str(one) + " "

    print("pages made")


segment:  0 ocr_start:  7 ocr_end:  25 truth_start:  9 truth_end:  24
cornell_aco000143_s0_raw_ocr_7-25.txt
cornell_aco000143_s0_ground_truth_9-24.txt
ocr file written:  cornell_aco000143_s0_raw_ocr_7-25.txt
truth file written:  cornell_aco000143_s0_ground_truth_9-24.txt
alignment started:  cornell_aco000143_s0_ocr_7-25_truth_9-24
python3 code/ced_word_alignment/align_text.py -s data/cornell_aco000143/cornell_aco000143_realignment/cornell_aco000143_s0_ground_truth_9-24.txt -t data/cornell_aco000143/cornell_aco000143_realignment/cornell_aco000143_s0_raw_ocr_7-25.txt -m basic -o data/cornell_aco000143/cornell_aco000143_realignment/cornell_aco000143_s0_ocr_7-25_truth_9-24


## OLD SEGMENTS

In [274]:
# Names of the single-line OCR and ground-truth files for one segment.
# NOTE(review): segment, ocr_start, ocr_end, truth_start and truth_end are not assigned in
# this section; the cell relies on values left in the kernel by an earlier cell.
ocr_file_name = f"{book_name}_s{segment}_raw_ocr_{ocr_start}-{ocr_end}.txt"
print(ocr_file_name)

truth_file_name = f"{book_name}_s{segment}_ground_truth_{truth_start}-{truth_end}.txt"
print(truth_file_name)

aub_aco001048_hi_s7_raw_ocr_85-101.txt
aub_aco001048_hi_s7_ground_truth_142-165.txt


In [275]:
# First do ocr: concatenate the cleaned text of every OCR page into one line, closing
# each page with an " <endpage> " marker.
ocr_line = ""
for i in range(ocr_start, ocr_end + 1):
    if i >= 1000:
        # File names only carry three digits.
        print("fix numbering issue")
        break

    # Page numbers in these file names are zero-padded to three digits.
    file_name = ocr_prefix + str(i).zfill(3) + ocr_suffix

    # `with` closes the page file; the handle used to be left open.
    with open(ocr_path + file_name, "r", encoding = "utf8") as page_file:
        content = page_file.read()

    # Flatten newlines, then keep only spaces and non-punctuation characters in 1536-1791.
    # (The bare `except: pass` around these pure string operations was dropped.)
    content = content.replace("\n", " ")
    content = "".join([c for c in content if (c not in arabic_punctuation and 1536 <= ord(c) <= 1791) or c == " " ])

    ocr_line += content + " <endpage> "

In [276]:
# write ocr line to the realignment folder; `with` closes the file even if the write fails
with open(realignment_path + ocr_file_name, "w", encoding = "utf8") as ocr_file:
    ocr_file.write(ocr_line)

In [277]:
# Now do ground truth: concatenate the cleaned text of every ground-truth page into one
# line, pages separated by a space.
truth_line = ""
for i in range(truth_start, truth_end + 1):

    file_name = truth_prefix + str(i) + ".txt"

    # `with` closes the page file; the handle used to be left open.
    with open(truth_path + file_name, "r", encoding = "utf8") as page_file:
        content = page_file.read()

    # Flatten newlines, then keep only spaces and non-punctuation characters in 1536-1791.
    # (The bare `except: pass` around these pure string operations was dropped.)
    content = content.replace("\n", " ")
    content = "".join([c for c in content if (c not in arabic_punctuation and 1536 <= ord(c) <= 1791) or c == " " ])

    truth_line += content + " "

In [278]:
# Write the truth_line; `with` closes the file even if the write fails
with open(realignment_path + truth_file_name, "w", encoding = "utf8") as truth_file:
    truth_file.write(truth_line)

## Actual Alignment Part

In [281]:
# NOTE(review): book_name is reassigned here to a different book than the one set in the
# configuration cell, while book_path and the other derived paths are not recomputed -
# confirm this is intended before re-running.
book_name = "aub_aco001048_hi"

In [282]:
# Arguments for the aligner call: the output folder, the two single-line files (with
# ".txt" removed), the prefix of the results file and the aligner script.
save_alignment_as = realignment_path
ground_truth_file_prefix = save_alignment_as + truth_file_name.replace(".txt", "")
raw_file_prefix = save_alignment_as + ocr_file_name.replace(".txt", "")
results_prefix = f"{book_name}_s{segment}_ocr_{ocr_start}-{ocr_end}_truth_{truth_start}-{truth_end}"
alignerLocation = "code/ced_word_alignment/align_text.py"

In [283]:
# Display the prefix that the alignment output file is named after.
results_prefix

'aub_aco001048_hi_s7_ocr_85-101_truth_142-165'

In [284]:
# Align the ground-truth line (first file) against the raw OCR line (second file). The
# folder arguments are "" because both prefixes already contain the realignment path.
alignFilesBasic(ground_truth_file_prefix, "", raw_file_prefix, "", save_alignment_as, alignerLocation, results_prefix)

python3 code/ced_word_alignment/align_text.py -r data/aub_aco001048_hi/aub_aco001048_hi_realignment/aub_aco001048_hi_s7_ground_truth_142-165.txt -c data/aub_aco001048_hi/aub_aco001048_hi_realignment/aub_aco001048_hi_s7_raw_ocr_85-101.txt -m basic -o data/aub_aco001048_hi/aub_aco001048_hi_realignment/aub_aco001048_hi_s7_ocr_85-101_truth_142-165
(0, 'Basic alignments are saved to: data/aub_aco001048_hi/aub_aco001048_hi_realignment/aub_aco001048_hi_s7_ocr_85-101_truth_142-165.basic')


## Make New Pages

In [285]:
# Load the alignment result (results prefix + ".basic") as a DataFrame with the columns
# operation / one / two.
df = parseOssamaBasic(save_alignment_as + results_prefix + ".basic")

In [286]:
# Peek at the first rows whose "two" column is the page-break marker.
df[df["two"] == "<endpage>"].head()

Unnamed: 0,operation,one,two
309,INS,,<endpage>
770,INS,,<endpage>
1076,INS,,<endpage>
1462,INS,,<endpage>
1814,INS,,<endpage>


In [287]:
# Start with an empty page, then recover the first OCR page number and the segment number
# from the results prefix, e.g. "<book_name>_s7_ocr_85-101_truth_142-165" -> 85 and "7".
page = ""

prefix_fields = results_prefix.replace(book_name, "").split("_")
page_num = int(prefix_fields[3].split("-")[0])
segment_num = prefix_fields[1].replace("s", "")

print(page_num, segment_num)

85 7


In [288]:
# Folder that receives one realigned ground-truth file per page of this segment.
segment_path = book_path + "/" + book_name + "_segment" + str(segment_num) + "/"
# exist_ok keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
os.makedirs(segment_path, exist_ok=True)
realigned_prefix = book_name + "_realigned_truth_"

In [289]:
# Cut the aligned ground truth back into pages. Column "one" holds the ground-truth token
# and column "two" the OCR token of each aligned row; an "<endpage>" token in "two"
# closes the current page. Ground-truth tokens that follow the last "<endpage>" row are
# not written to any file (unchanged behaviour).
for one, two in zip(df["one"], df["two"]):
    if two == "<endpage>":
        if pd.notna(one):
            # A ground-truth token aligned against the marker itself used to be dropped;
            # keep it on the page that is being closed.
            page += str(one) + " "

        # `with` closes the page file even if the write fails.
        with open(segment_path + realigned_prefix + str(page_num) + ".txt", "w", encoding = "utf8") as to_write:
            to_write.write(page)
        page_num += 1
        page = ""

    elif pd.notna(one):
        page += str(one) + " "