In [None]:
import sys
import os
from os import listdir
import pandas as pd
from statistics import mean,median
from alive_progress import alive_bar
from pdfminer.high_level import extract_text
from pdfrw import PdfReader, PdfWriter, PageMerge

In [None]:
# Function to split .pdf files vertically, based on the character count per line
# The decision to split uses the mean and median of the character count
# Run this for the .pdf files in ../data

def split_files_vertically(directory="../data", split_dir="../split"):
    """Split two-column PDFs into left/right half pages.

    Walks every sub-folder of ``directory``, extracts the text of each ``.pdf``
    and uses the line lengths to decide whether the document should be split.
    Selected documents are rewritten into ``split_dir`` under the same file
    name, with each page replaced by its left half followed by its right half.
    Documents whose name already exists in ``split_dir`` are skipped.

    Args:
        directory: Folder whose sub-folders contain the source PDFs.
        split_dir: Folder that receives the split PDFs; created if missing.

    Returns:
        None. Progress and the number of split PDFs are printed.
    """
    # Lines at or below this length are treated as table data and ignored
    # when computing the mean/median line length.
    table_line_max = 40
    # A document whose longest line fits in this many characters is split.
    narrow_line_max = 67
    # Otherwise it is split only if mean AND median are below this length.
    short_average_max = 55
    # Value used for mean/median when there are no lines to measure; it is
    # above short_average_max, so the averages alone never trigger a split.
    fallback_average = 100

    # The same folder is used for creating, listing and writing the output.
    os.makedirs(split_dir, exist_ok=True)

    def splitpage(src):
        """Split a page into two (left and right)"""
        # Yield a result for each half of the page
        for x_pos in (0, 0.5):
            yield PageMerge().add(src, viewrect=(x_pos, 0, 0.5, 1)).render()

    split = 0
    seen = 0
    already_done = set(os.listdir(split_dir))
    print (len(already_done))
    for subdir in os.listdir(directory):
        subdir_path = f"{directory}/{subdir}"
        # Stray files directly under `directory` cannot be listed; skip them.
        if not os.path.isdir(subdir_path):
            continue
        for document in os.listdir(subdir_path):
            seen += 1
            print (seen, end='\r')
            if document[-4:] != '.pdf' or document in already_done:
                continue

            source_pdf = f"{subdir_path}/{document}"
            lines = extract_text(source_pdf).splitlines()
            if not lines:
                # Nothing to measure, so there is no basis for a decision.
                print ('No extractable text in file:', document, '\n')
                continue

            # Filter out table data
            long_lengths = [len(line) for line in lines if len(line) > table_line_max]
            # Calculate the average length of sentences outside tables
            if long_lengths:
                res_mean = mean(long_lengths)
                res_median = median(long_lengths)
            else:
                res_mean = res_median = fallback_average
                print ('Error on file:', document, '\n')

            longest = max(len(line) for line in lines)
            # Equivalent to "longest <= 67 or (longest > 67 and mean < 55 and
            # median < 55)", with the precedence made explicit.
            should_split = longest <= narrow_line_max or (
                res_mean < short_average_max and res_median < short_average_max
            )
            if not should_split:
                continue

            writer = PdfWriter()
            for page in PdfReader(source_pdf).pages:
                writer.addpages(splitpage(page))
            writer.write(f"{split_dir}/{document}")
            split += 1

    print(f"Split pdfs: {split}")

In [None]:
# Function to join .pdf files after the vertical split
#
# split_files_vertically writes the two halves of every page as consecutive pages,
# so its output files alternate language from page to page
#
# This function makes 2 .pdfs from each such file, one per language

def join_files_vertically(path="../split", nl_dir="../Nederlands", fr_dir="../Francais"):
    """Separate the alternating pages of each split PDF into two PDFs.

    For every ``.pdf`` in ``path``, pages 1, 3, 5, ... are written to
    ``<nl_dir>/NL_<name>`` and pages 2, 4, 6, ... to ``<fr_dir>/FR_<name>``.

    Args:
        path: Folder containing the PDFs to separate.
        nl_dir: Output folder for the odd-numbered pages; created if missing.
        fr_dir: Output folder for the even-numbered pages; created if missing.

    Returns:
        None.
    """
    # Output is written to the same folders that are created here.
    os.makedirs(nl_dir, exist_ok=True)
    os.makedirs(fr_dir, exist_ok=True)

    for document in os.listdir(path):
        # Anything that is not a PDF (e.g. OS metadata files) cannot be read.
        if document[-4:] != '.pdf':
            continue

        writer_NL = PdfWriter()
        writer_FR = PdfWriter()
        for page_number, page in enumerate(PdfReader(f"{path}/{document}").pages, start=1):
            # NOTE(review): assumes odd pages (left halves) are Dutch and even
            # pages (right halves) are French - confirm against the documents.
            if page_number % 2 == 1:
                writer_NL.addpage(page)
            else:
                writer_FR.addpage(page)
        writer_NL.write(f"{nl_dir}/NL_{document}")
        writer_FR.write(f"{fr_dir}/FR_{document}")

In [None]:
# Execute the above functions once

#split_files_vertically()
#join_files_vertically()

In [None]:

# Language detection with fastText
import fasttext as ft

# Model file is resolved relative to the current working directory and is
# loaded once, when this cell runs.
ft_model = ft.load_model("lid.176.ftz")


def fasttext_language_predict(text, model = ft_model):
    """Run a fastText prediction on ``text`` as a single line.

    Newlines are replaced by spaces first, because fastText's ``predict``
    processes one line at a time.

    Args:
        text: The text to classify.
        model: Object exposing ``predict``; defaults to the module-level model.

    Returns:
        Whatever ``model.predict`` returns for a one-element list holding the
        flattened text.
    """
    single_line = text.replace('\n', " ")
    return model.predict([single_line])


In [None]:

def split_text_horizontally():
    """Split documents into Dutch and French text, paragraph by paragraph.

    For every ``.txt`` file in ``../new_processed_data`` the matching row
    (``document_id`` equal to ``<name>.pdf``) is looked up in
    ``../new_processed_data.csv``. Its paragraph columns ``p1``, ``p2``, ...
    are read in order until an empty cell or the last paragraph column.
    Each paragraph is assigned a language with ``fasttext_language_predict``
    and appended to ``NL/NL_<name>.txt`` or ``FR/FR_<name>.txt``.

    Language assignment rules:
      * Paragraphs of 44 characters or fewer are not classified; they inherit
        the language of the most recent paragraph detected as ``nl`` or ``fr``.
      * Paragraphs detected as any other language inherit it as well.
      * Paragraphs seen before any ``nl``/``fr`` detection are dropped.
      * Paragraphs containing ``Neerlegging-Dépôt`` go to both outputs.

    Returns:
        None. One summary line is printed per processed document.
    """
    # Paragraphs at or below this length are not sent to the classifier.
    min_detect_length = 44
    # Paragraphs containing this marker are written to both outputs.
    shared_marker = 'Neerlegging-Dépôt'

    input_csv='../new_processed_data.csv'
    df=pd.read_csv(input_csv)

    # Set the paths
    input_path=os.path.join('..','new_processed_data')
    output_path=os.path.join('..','new_processed_data')
    output_path_NL=os.path.join(output_path,'NL')
    output_path_FR=os.path.join(output_path,'FR')

    # Created independently so that an existing NL folder cannot prevent the
    # FR folder from being created.
    os.makedirs(output_path_NL, exist_ok=True)
    os.makedirs(output_path_FR, exist_ok=True)

    # Highest paragraph index that is read.
    # NOTE(review): presumably the frame has 4 non-paragraph columns - confirm.
    last_paragraph_index = len(df.columns) - 4

    for file in listdir(input_path):

        if not file.endswith('.txt'):
            continue

        document_id=file[:-4]
        document_pdf=f'{document_id}.pdf'

        # Look the document up once instead of re-filtering per paragraph.
        rows = df[df['document_id']==document_pdf]
        if rows.empty:
            print (f'{document_pdf} not found in {input_csv}, skipping')
            continue
        row = rows.iloc[0]

        paragraph_count = 0
        t_nl = 0
        t_fr = 0

        # Most recent language detected as 'nl' or 'fr' in THIS document;
        # reset per document so nothing carries over from the previous file.
        current_language = None

        # output paragraphs to text
        nl_parts = []
        fr_parts = []

        x = 1
        while True:
            column = f'p{x}'
            if column not in row.index:
                break
            value = row[column]
            # An empty cell marks the end of the document's paragraphs.
            if pd.isna(value):
                break

            paragraph = str(value).replace('\n', ' ')
            paragraph_count += 1

            if len(paragraph) > min_detect_length:
                detected = fasttext_language_predict(paragraph)[0][0][0][-2:]
                # Any other label is ignored, so the paragraph inherits the
                # last known language.
                if detected in ('nl', 'fr'):
                    current_language = detected

            if shared_marker in paragraph:
                nl_parts.append(paragraph)
                fr_parts.append(paragraph)
            elif current_language == 'nl':
                nl_parts.append(paragraph)
                t_nl += 1
            elif current_language == 'fr':
                fr_parts.append(paragraph)
                t_fr += 1

            x += 1
            # Break if we are at max paragraph column
            if x > last_paragraph_index:
                break

        # Every paragraph is terminated by a newline in the output files.
        nl_text = ''.join(f'{part}\n' for part in nl_parts)
        fr_text = ''.join(f'{part}\n' for part in fr_parts)

        fr_txt=f'FR_{document_id}.txt'
        nl_txt=f'NL_{document_id}.txt'

        output_file_FR=os.path.join(output_path_FR,fr_txt)
        output_file_NL=os.path.join(output_path_NL,nl_txt)

        with open (output_file_NL,'w',encoding="utf-8") as fp:
            fp.write(nl_text)

        with open (output_file_FR,'w',encoding="utf-8") as fp:
            fp.write(fr_text)

        print (f'{document_pdf} has {paragraph_count} paragraphs, NL : {t_nl}, FR: {t_fr}')


In [None]:
# Execute the split text horizontally function.
# It reads ../new_processed_data.csv and writes NL_/FR_ text files under
# ../new_processed_data, so running this cell performs file I/O.

split_text_horizontally()