### Iterate over the corpus and process each PDF by:
1) removing formatting
2) chunking the text into paragraphs and sentences


In [1]:
# Absolute path to your local checkout of the repository.
# MAKE SURE YOU CHANGE THIS TO BE APPROPRIATE FOR YOUR ENVIRONMENT:
# the CORPUS folder and the src/common_functions import path are derived from it below.
REPO_ROOT="C://Users//ericf//source//repos//text-inferences"


In [2]:
import sys
import os

# Add '<cwd>/../../common_functions' to the import path, i.e. the helper folder
# is looked up two levels above the kernel's current working directory.
sys.path.append(os.path.join(os.path.realpath('.'),'..','..','common_functions'))

from IPython.display import display, HTML
# Cosmetic only: injects CSS that sets the '.container' element to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))


In [3]:
### shouldn't need to modify these, but you will need to make sure the dirs exist
# CORPUS: folder of the evaluation corpus; source PDFs are read from CORPUS/staged_files
# and the generated .txt files are written directly into CORPUS.
CORPUS = os.path.join(REPO_ROOT,'corpora','ifrc_evaluations')
# INTERMEDIATES: working-file folder, resolved relative to the current working directory.
INTERMEDIATES = os.path.join('.','file_intermediates')
# Make <REPO_ROOT>/src/common_functions importable
# (presumably where the custom parse_pdf / file_utils modules live -- verify).
sys.path.append(os.path.join(REPO_ROOT,'src','common_functions'))

In [4]:
from tqdm.auto import tqdm

import re
import pandas as pd
import numpy as np
#import seaborn as sns

from gensim.utils import simple_preprocess


#from sklearn.feature_extraction.text import TfidfVectorizer
#from spacy.lang.en.stop_words import STOP_WORDS as stopwords
#from sklearn.decomposition import NMF

import spacy


#custom common functions
import parse_pdf
import file_utils

In [5]:
# Load the medium English spaCy pipeline once; the sentence-level preprocessing
# function below reuses this module-level `nlp` object.
# The model package must be installed first, e.g. `python -m spacy download en_core_web_md`.
nlp = spacy.load("en_core_web_md")

# First, process the PDFs and convert them to text files

In [6]:


def generate_text_corpus_from_pdfs(CORPUS):
    """Convert every staged PDF of a corpus into a UTF-8 ``.txt`` file.

    PDFs are looked up in ``<CORPUS>/staged_files``. Each one is read with
    ``parse_pdf.read_pdf_document`` and the returned text is written to
    ``<CORPUS>/<pdf base name>.txt``.

    The conversion is best-effort: when a PDF cannot be parsed or written,
    the error is printed together with the file name and the loop moves on,
    so a single bad document does not abort the whole run.

    Parameters
    ----------
    CORPUS : str
        Path of the corpus folder. It must contain a ``staged_files``
        sub-folder holding the source PDFs.

    Returns
    -------
    None
    """
    staged_files = os.path.join(CORPUS, 'staged_files')

    def write_text(file_content, pdf):
        """Write ``file_content`` to ``<CORPUS>/<pdf base name>.txt``."""
        # splitext swaps only the real extension; str.replace(".pdf", ".txt")
        # would also rewrite a ".pdf" occurring in the middle of the name.
        filename = os.path.splitext(pdf)[0] + ".txt"
        full_filepath = os.path.join(CORPUS, filename)

        with open(full_filepath, "w", encoding="utf-8") as f:
            f.write(file_content)

    # get available evaluation files; the extension match is case-insensitive
    # and the list is sorted so runs are reproducible across platforms
    pdf_list = sorted(
        pdf for pdf in os.listdir(staged_files) if pdf.lower().endswith('.pdf')
    )

    for pdf in tqdm(pdf_list):
        filepath = os.path.join(staged_files, pdf)

        try:
            file_content = parse_pdf.read_pdf_document(filepath)
            write_text(file_content, pdf)
        except Exception as e:
            # keep going, but report which document failed
            print(f"{pdf}: {e}")
    

# Run the PDF -> text conversion: reads CORPUS/staged_files and writes .txt files into CORPUS.
generate_text_corpus_from_pdfs(CORPUS)

  0%|          | 0/10 [00:00<?, ?it/s]

Xref table not zero-indexed. ID numbers for objects will be corrected.


## After the text files have been generated, we can preprocess them

In [7]:
# paragraph level split. This function creates the csv file needed to do
# paragraph level analysis


def generate_paragraph_level_data_from_corpus(corpus=None, output_path=None):
    """Split every ``.txt`` file of the corpus into paragraphs and save them as CSV.

    Each file is split on blank lines. A chunk longer than 2500 characters is
    assumed to be several paragraphs that lost their separators: it is split
    on ``'.'`` and regrouped into pseudo-paragraphs of 5 sentences each.
    Anything shorter than 14 characters is treated as noise and dropped.

    The result is a CSV with the columns ``file`` and ``original_text`` (plus
    the default integer index), one row per paragraph.

    Parameters
    ----------
    corpus : str, optional
        Folder containing the ``.txt`` files. Defaults to the module-level
        ``CORPUS`` setting.
    output_path : str, optional
        Destination CSV. Defaults to ``file_outputs/evals_by_paragraph.csv``
        relative to the current working directory. The parent folder is
        created when it does not exist.

    Returns
    -------
    None
    """
    if corpus is None:
        corpus = CORPUS
    using_default_output = output_path is None
    if using_default_output:
        # os.path.join instead of a hand-written "file_outputs\e..." literal:
        # the backslash form is an invalid escape sequence and only works as
        # a path separator on Windows.
        output_path = os.path.join("file_outputs", "evals_by_paragraph.csv")

    max_paragraph_chars = 2500   # longer chunks get re-split into sentences
    min_chars = 14               # shorter chunks are considered noise
    sentences_per_chunk = 5      # size of a pseudo-paragraph

    # sorted so the row order of the CSV is reproducible across platforms
    txt_list = sorted(txt for txt in os.listdir(corpus) if txt.lower().endswith('.txt'))
    print(f"about to process {len(txt_list)} files.")

    # Collect plain records and build the DataFrame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every row.
    records = []

    for file in tqdm(txt_list):

        with open(os.path.join(corpus, file), "r", encoding='utf-8') as f:
            content = f.read()

        for p in re.split(r'\n\s*\n', content):
            if len(p) > max_paragraph_chars:
                # presumed paragraph is excessively long: split into sentences
                # and regroup them into fixed-size pseudo-paragraphs
                sents = p.split('.')
                candidates = [
                    '. '.join(sents[i:i + sentences_per_chunk])
                    for i in range(0, len(sents), sentences_per_chunk)
                ]
            else:
                candidates = [p]

            records.extend(
                (file, text) for text in candidates if len(text) >= min_chars
            )

    df_paragraph = pd.DataFrame(records, columns=['file', 'original_text'])

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_paragraph.to_csv(output_path)

    if using_default_output:
        print("output has been written to 'file_outputs' to avert accidentally overwriting the working file.")
        print("if you want to use it, make sure to move it to 'file_intermediates'")
    else:
        print(f"output has been written to '{output_path}'.")



# Build the paragraph-level CSV from the .txt files found in CORPUS.
generate_paragraph_level_data_from_corpus()

about to process 10 files.


  0%|          | 0/10 [00:00<?, ?it/s]

output has been written to 'file_outputs' to avert accidentally overwriting the working file.
if you want to use it, make sure to move it to 'file_intermediates'


## Now generate the preprocessed, sentence-level pickle file

In [None]:
def generate_sentence_level_preprocessed_file_for_analysis(CORPUS):
    """Build the sentence-level, preprocessed DataFrame and pickle it.

    Every ``.txt`` file in ``CORPUS`` is run through the module-level spaCy
    pipeline ``nlp``. For each detected sentence the stop words are removed,
    the remaining tokens are lemmatised and the result is tokenised with
    gensim's ``simple_preprocess`` (accents stripped). Sentences that yield
    fewer than two tokens are discarded.

    The resulting DataFrame has one row per kept sentence with the columns
    ``file``, ``original_text`` (raw sentence text) and ``preprocessed_text``
    (list of tokens). It is pickled to the path returned by
    ``file_utils.format_outfile_name('eval_sents', extension='pkl')``.

    Processing is best-effort: a file that raises is reported with its name
    and skipped; sentences already collected from it are kept.

    Parameters
    ----------
    CORPUS : str
        Folder containing the ``.txt`` files to process.

    Returns
    -------
    None
    """
    # sorted so the row order of the output is reproducible across platforms
    txt_list = sorted(txt for txt in os.listdir(CORPUS) if txt.lower().endswith('.txt'))
    print(f"This function takes about 4 seconds per file to run. It has a list of files {len(txt_list)} items long.")
    print(f"Expect this to take about {len(txt_list) * 4 / 60} minutes to complete.")

    # Collect rows in a list and build the DataFrame once; assigning through
    # df.loc[len(df.index)] re-allocates the frame for every sentence.
    rows = []

    for txt in tqdm(txt_list):

        filepath = os.path.join(CORPUS, txt)

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                file_content = f.read()

            # the file handle is already closed while the (slow) pipeline runs
            doc = nlp(file_content)

            for sent in doc.sents:
                sent_after_stops = ' '.join(str(w.lemma_) for w in sent if not w.is_stop)
                sent_after_stops = sent_after_stops.replace("\n", " ")
                segment = simple_preprocess(sent_after_stops, deacc=True)
                if len(segment) > 1:
                    rows.append((txt, sent.text, segment))

        except Exception as e:
            # keep going, but report which document failed
            print(f"{txt}: {e}")

    df = pd.DataFrame(rows, columns=['file', 'original_text', 'preprocessed_text'])

    output_file = file_utils.format_outfile_name('eval_sents', extension='pkl')
    df.to_pickle(output_file)
    print("output has been written to 'file_outputs' to avert accidentally overwriting the working file.")
    print("if you want to use it, make sure to move it to 'file_intermediates' and rename accordingly.")
    
# Build the sentence-level pickle: runs the spaCy pipeline over every .txt file in CORPUS.
generate_sentence_level_preprocessed_file_for_analysis(CORPUS)

This function takes about 4 seconds per file to run. It has a list of files 10 items long.
Expect this to take about 0.6666666666666666 minutes to complete.


  0%|          | 0/10 [00:00<?, ?it/s]