In [None]:
# HYBRID OF TEXT EXTRACTION TOOLS WITH TESSERACT OCR. CONVERT PDF TO MARKDOWN FORMAT.

import os
import fitz
from pdf2image import convert_from_path
import pytesseract
import pypandoc


def extract_text_or_perform_ocr(pdf_path, page_number):
    """Return the text of one PDF page, falling back to Tesseract OCR.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Zero-based index of the page inside the document.

    Returns:
        The page's embedded text with surrounding whitespace stripped. When
        that text is empty, the string returned by
        ``pytesseract.image_to_string`` for the rendered page instead.
    """
    # The context manager closes the document even when indexing the page or
    # extracting its text raises (e.g. page_number out of range).
    with fitz.open(pdf_path) as doc:
        text = doc[page_number].get_text().strip()

    if not text:  # No embedded text: render the page to an image and OCR it.
        # first_page/last_page are 1-based, hence the +1 on the 0-based index.
        images = convert_from_path(
            pdf_path, first_page=page_number + 1, last_page=page_number + 1
        )
        text = pytesseract.image_to_string(images[0])

    return text

def convert_pdf_to_md(pdf_path, output_path):
    """Extract the text of every page of a PDF and write it out through Pandoc.

    Each page is read with ``extract_text_or_perform_ocr`` (embedded text
    first, OCR as a fallback). The page texts are joined with blank lines and
    handed to Pandoc, which reads them as Markdown and writes Markdown to
    ``output_path``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Path of the file Pandoc should write.

    Returns:
        The value returned by ``pypandoc.convert_text``.
    """
    # Only the page count is needed here; close the handle before the
    # per-page helper re-opens the file.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    # Extract text from each page (with OCR for image-based pages)
    texts = [extract_text_or_perform_ocr(pdf_path, pn) for pn in range(page_count)]
    full_text = '\n\n'.join(texts)

    # Feed the text to Pandoc directly. The previous fixed-name scratch file
    # in the working directory could collide between concurrent runs and was
    # left behind whenever the conversion failed.
    return pypandoc.convert_text(
        full_text, 'md', format='markdown', outputfile=output_path
    )

def batch_convert_pdf_to_md(pdf_folder, output_folder):
    """Convert every PDF directly inside ``pdf_folder``.

    Args:
        pdf_folder: Folder scanned (non-recursively) for files whose name ends
            in ``.pdf``, compared case-insensitively.
        output_folder: Folder that receives one output file per PDF; it is
            created, including missing parents, when it does not exist.

    Returns:
        None. Progress is reported with ``print``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for pdf_file in os.listdir(pdf_folder):
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_folder, pdf_file)
        # The converted file is saved with a '.txt' extension.
        md_filename = os.path.splitext(pdf_file)[0] + '.txt'
        output_path = os.path.join(output_folder, md_filename)
        convert_pdf_to_md(pdf_path, output_path)
        print(f"Converted '{pdf_file}' to Markdown format as '{md_filename}'.")

# Example usage: convert every PDF found in ./pdf and write the results to
# ./txt (both paths are relative to the current working directory).
pdf_folder = 'pdf'
output_folder = 'txt'
batch_convert_pdf_to_md(pdf_folder, output_folder)


In [None]:
# ALTERNATIVE EXTRACTION TOOLS WITH EASYOCR. CONVERT PDF TO MARKDOWN FORMAT.

import os
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import easyocr
import pypandoc
import numpy as np  # Add this import at the beginning of your script

# EasyOCR readers keyed by language tuple, so one reader is shared by every
# page instead of being rebuilt for each page that needs OCR.
_EASYOCR_READERS = {}


def _get_easyocr_reader(languages=('en',)):
    """Return the cached EasyOCR reader for *languages*, creating it on first use."""
    key = tuple(languages)
    if key not in _EASYOCR_READERS:
        _EASYOCR_READERS[key] = easyocr.Reader(list(key))
    return _EASYOCR_READERS[key]


def extract_text_or_perform_ocr(pdf_path, page_number):
    """Return the text of one PDF page, falling back to EasyOCR.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Zero-based index of the page inside the document.

    Returns:
        The page's embedded text with surrounding whitespace stripped. When
        that text is empty, the EasyOCR paragraph results for the rendered
        page joined with single spaces instead.
    """
    # The context manager closes the document even when indexing the page or
    # extracting its text raises (e.g. page_number out of range).
    with fitz.open(pdf_path) as doc:
        text = doc[page_number].get_text().strip()

    if not text:  # No embedded text: render the page to an image and OCR it.
        # first_page/last_page are 1-based, hence the +1 on the 0-based index.
        images = convert_from_path(
            pdf_path, first_page=page_number + 1, last_page=page_number + 1
        )
        reader = _get_easyocr_reader(('en',))  # Change the languages as needed
        image_np = np.array(images[0])  # EasyOCR is given a NumPy array, not a PIL image
        results = reader.readtext(image_np, paragraph=True)  # Use `paragraph` for better grouping
        # Index 1 of each result is the recognised text.
        text = ' '.join(result[1] for result in results)

    return text

def convert_pdf_to_md(pdf_path, output_path):
    """Extract the text of every page of a PDF and write it out through Pandoc.

    Each page is read with ``extract_text_or_perform_ocr`` (embedded text
    first, OCR as a fallback). The page texts are joined with blank lines and
    handed to Pandoc, which reads them as Markdown and writes Markdown to
    ``output_path``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Path of the file Pandoc should write.

    Returns:
        The value returned by ``pypandoc.convert_text``.
    """
    # Only the page count is needed here; close the handle before the
    # per-page helper re-opens the file.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    texts = [extract_text_or_perform_ocr(pdf_path, pn) for pn in range(page_count)]
    full_text = '\n\n'.join(texts)

    # Feed the text to Pandoc directly. The previous fixed-name scratch file
    # in the working directory could collide between concurrent runs and was
    # left behind whenever the conversion failed.
    return pypandoc.convert_text(
        full_text, 'md', format='markdown', outputfile=output_path
    )

def batch_convert_pdf_to_md(pdf_folder, output_folder):
    """Convert every PDF directly inside ``pdf_folder``.

    Args:
        pdf_folder: Folder scanned (non-recursively) for files whose name ends
            in ``.pdf``, compared case-insensitively.
        output_folder: Folder that receives one output file per PDF; it is
            created, including missing parents, when it does not exist.

    Returns:
        None. Progress is reported with ``print``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for pdf_file in os.listdir(pdf_folder):
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_folder, pdf_file)
        # The converted file is saved with a '.txt' extension.
        md_filename = os.path.splitext(pdf_file)[0] + '.txt'
        output_path = os.path.join(output_folder, md_filename)
        convert_pdf_to_md(pdf_path, output_path)
        print(f"Converted '{pdf_file}' to Markdown format as '{md_filename}'.")

# Example usage: convert every PDF found in ./pdf and write the results to
# ./txt (both paths are relative to the current working directory).
pdf_folder = 'pdf'
output_folder = 'txt'
batch_convert_pdf_to_md(pdf_folder, output_folder)


In [None]:
import os
import re

# A letter, a line break, then (looked ahead, not consumed) a capital letter.
# The lookahead leaves the capital available to start the next match, so
# consecutive one-letter lines are all handled the same way.
_SENTENCE_BREAK_RE = re.compile(r"([a-zA-Z])\n(?=[A-Z])")
# Any remaining line break that directly follows a letter.
_SOFT_BREAK_RE = re.compile(r"([a-zA-Z])\n")


def process_text(text):
    """Repair line breaks that split sentences or paragraphs.

    Two passes are applied in order:

    1. A line ending in a letter followed by a line starting with a capital
       letter is treated as a sentence boundary: the break becomes ". ".
    2. Every other line break that directly follows a letter becomes a space.

    Line breaks after anything that is not an ASCII letter (punctuation,
    digits, blank lines) are left untouched.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    text = _SENTENCE_BREAK_RE.sub(r"\1. ", text)  # Adds missing dot between sentences split across lines.
    text = _SOFT_BREAK_RE.sub(r"\1 ", text)  # Removes inappropriate line breaks within paragraphs.
    #text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with a single space, if needed.
    return text

def process_files(directory_path):
    """Clean every ``.txt`` file in a folder in place with ``process_text``.

    Args:
        directory_path: Folder scanned (non-recursively) for files whose name
            ends in ``.txt``.

    Returns:
        None. A message is printed for each processed file, for each file
        that failed, and when ``directory_path`` is not an existing directory.
    """
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise.
    if not os.path.isdir(directory_path):
        print("Directory does not exist:", directory_path)
        return
    for filename in os.listdir(directory_path):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory_path, filename)
        # Best effort per file: report the failure and carry on with the rest.
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()

            processed_content = process_text(content)

            # The input already has the .txt extension, so the cleaned text
            # overwrites the file it was read from.
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(processed_content)
            print(f"Processed and saved {filename}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Clean the .txt files in ./txt (relative to the current working directory).
directory_path = 'txt'
process_files(directory_path)


# Works for .docx, .docm, and .dotx

TODO: .doc, .dot: check with Cesare whether converting these manually would be faster.

In [None]:
import os
from docx import Document
import glob

def process_document(doc_path):
    """Flatten a Word document into plain text.

    The text of every body paragraph comes first, one paragraph per line.
    Every table follows, wrapped in ``[Table Start]`` / ``[Table End]``
    markers, with one line per row and the cells separated by " | ".

    Args:
        doc_path: Path of the document to open with ``Document``.

    Returns:
        The flattened text as a single string.
    """
    print(f"Processing {doc_path}...")
    document = Document(doc_path)

    # Body paragraphs, each terminated by a newline.
    pieces = [para.text + "\n" for para in document.paragraphs]

    # Tables are appended after all paragraphs.
    for tbl in document.tables:
        pieces.append("\n[Table Start]\n")
        pieces.extend(
            # Line breaks inside a cell become spaces so a row stays on one line.
            " | ".join(c.text.replace('\n', ' ').strip() for c in r.cells) + "\n"
            for r in tbl.rows
        )
        pieces.append("[Table End]\n")

    return ''.join(pieces)

def save_text(output_text, output_path):
    """Write ``output_text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with open(output_path, mode='w', encoding='utf-8') as handle:
        handle.write(output_text)

def process_folder(folder_path, output_folder):
    """Convert every ``*.docx`` file in ``folder_path`` to a text file.

    Args:
        folder_path: Folder searched (non-recursively) for ``*.docx`` files.
        output_folder: Folder that receives one ``.txt`` file per document; it
            is created, including missing parents, when it does not exist.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Without this, writing the first file fails if the folder is missing.
    os.makedirs(output_folder, exist_ok=True)

    for doc_path in glob.glob(os.path.join(folder_path, '*.docx')):
        structured_text = process_document(doc_path)

        # Swap only the final extension. str.replace('.docx', ...) would miss
        # upper-case extensions and would also rewrite '.docx' occurring
        # earlier in the file name.
        stem = os.path.splitext(os.path.basename(doc_path))[0]
        output_path = os.path.join(output_folder, stem + '.txt')

        save_text(structured_text, output_path)
        print(f"Saved processed text to {output_path}")

# Folder containing the Word documents to convert (machine-specific absolute path)
folder_path = "C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\sample_word"

# Folder that receives the generated .txt files.
# NOTE: this rebinds the `output_folder` name also used by the PDF cells.
output_folder ='C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\sample_processed_text'

# Convert every *.docx file found in folder_path
process_folder(folder_path, output_folder)


# Excel (.xlsx; legacy .xls files are not picked up by the code below)

In [5]:
import os
import glob
import openpyxl

def excel_to_txt(excel_path, txt_path):
    """Dump every worksheet of an Excel workbook into one tab-separated text file.

    Each sheet starts with a ``--- <sheet title> ---`` line, followed by one
    line per row with the cell values joined by tabs, and ends with a blank
    line.

    Args:
        excel_path: Path of the workbook to load with ``openpyxl``.
        txt_path: Path of the UTF-8 text file to write (overwritten if present).
    """
    book = openpyxl.load_workbook(excel_path)

    def as_text(value):
        # None values (presumably empty cells) are written as empty fields.
        return '' if value is None else str(value)

    with open(txt_path, 'w', encoding='utf-8') as out:
        for sheet in book.worksheets:
            out.write(f"--- {sheet.title} ---\n")  # add title. TODO: maybe add section instead of title
            out.writelines(
                '\t'.join(map(as_text, cells)) + '\n'
                for cells in sheet.iter_rows(values_only=True)
            )
            out.write('\n')  # Blank line between sheets

def process_folder(source_folder, target_folder):
    """Convert every ``*.xlsx`` file in ``source_folder`` to a text file.

    Args:
        source_folder: Folder searched (non-recursively) for ``*.xlsx`` files.
        target_folder: Folder that receives one ``.txt`` file per workbook; it
            is created, including missing parents, when it does not exist.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Without this, writing the first file fails if the folder is missing.
    os.makedirs(target_folder, exist_ok=True)

    for excel_file in glob.glob(os.path.join(source_folder, '*.xlsx')):
        # Swap only the final extension. str.replace('.xlsx', ...) would miss
        # upper-case extensions and would also rewrite '.xlsx' occurring
        # earlier in the file name.
        stem = os.path.splitext(os.path.basename(excel_file))[0]
        txt_path = os.path.join(target_folder, stem + '.txt')

        print(f"Processing {excel_file}...")
        excel_to_txt(excel_file, txt_path)
        print(f"Saved processed text to {txt_path}")

# Source folder with the .xlsx workbooks and target folder for the generated
# .txt files (machine-specific absolute paths)
source_folder = r"C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel"
target_folder = r"C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel"


# Convert every workbook found in source_folder
process_folder(source_folder, target_folder)


Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\10_10_web_Copy of AF RICE Madagascar PPR 2017-2018_final_10 June.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\10_10_web_Copy of AF RICE Madagascar PPR 2017-2018_final_10 June.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\10_10_Wor WEB_AF RICE Madagascar PPR 2017 -  Final_rev Sept 2018_3Dec.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\10_10_Wor WEB_AF RICE Madagascar PPR 2017 -  Final_rev Sept 2018_3Dec.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\12107_01. KEMITRAAN_PPR_Payo-Payo_Year II Final_Januari 2024 - for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\12107_01. KEMITRAAN_PPR_Payo-Payo_Year II Final_Januari 2024 - for web.txt
Processing C:\Users\d

  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\1429_1429_PPR 2020-2021_Profonanpe_final_rev_VF_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\1429_PPR 2021-2022_Profonanpe_VF _ web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\1429_PPR 2021-2022_Profonanpe_VF _ web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\1430_1430_2020_Report_MS_ACREI_PPPR Year 2_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\1430_1430_2020_Report_MS_ACREI_PPPR Year 2_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\1430_1430_2021_Report_WMO_ACREI_PPR Year3 Revised v5c - for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\1430_1430_2021_Report_WMO_ACREI_P

  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\16_16_For web_ no procurement PIMS 4453 Revised PPR 2017 dated 29 April 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\16_16_For web_no procurement PIMS 4453 Revised PPR 2018 dated 29 April 2019.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\16_16_For web_no procurement PIMS 4453 Revised PPR 2018 dated 29 April 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\26_26_web_4569_AF_Cook Island_PPR_Mar2017.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_processed_excel\26_26_web_4569_AF_Cook Island_PPR_Mar2017.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\sample_excel\3066_3066_web_Copy of PPR3 Colombia-Ecuador adjusted.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_d

Save different headings in different columns (Word)

//TODO: Check with Cesare whether the following approaches can work. They still need a fair amount of cleaning, but they could also be a way to split the text into sentences.

In [None]:
import pandas as pd
def process_document_to_df(doc_path):
    """Load a Word document into a DataFrame with one column per heading.

    A paragraph whose style name starts with "Heading" opens a new column
    named after the paragraph text. The paragraphs that follow are collected
    under it until the next heading. Paragraphs before the first heading, or
    after a heading whose text is empty, are skipped.

    Relies on ``Document`` (python-docx) and ``pd`` already being imported.

    Args:
        doc_path: Path of the document to open.

    Returns:
        A DataFrame whose columns are the heading texts and whose rows are the
        paragraphs found under each heading. Shorter columns are padded with
        NaN.
    """
    doc = Document(doc_path)
    data = {}
    current_heading = None

    for paragraph in doc.paragraphs:
        if paragraph.style.name.startswith('Heading'):
            current_heading = paragraph.text
            # setdefault keeps paragraphs already collected under an earlier
            # heading with the same text instead of discarding them.
            data.setdefault(current_heading, [])
        elif current_heading:
            data[current_heading].append(paragraph.text)

    # Wrapping each list in a Series lets columns of different length coexist.
    return pd.DataFrame({heading: pd.Series(texts) for heading, texts in data.items()})


# Path to the Word document to analyse (machine-specific absolute path)
doc_path = "C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\AF_docs\\50_50_MTR-AF-Uzbekistan-Report-FINAL2.docx"

# Base output location. NOTE: this is a folder, not a file, and nothing in
# this cell uses it.
output_path ='C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar'

# Parse the document; despite its name, `structured_text` holds the DataFrame
# returned by process_document_to_df, not a string.
structured_text = process_document_to_df(doc_path)




TRY TO SAVE ONLY FROM H1 TO H1

In [None]:
def process_document_to_df(doc_path):
    """Split a Word document into one row per "Heading 1" section.

    Every paragraph styled exactly "Heading 1" starts a new section. The
    paragraphs up to the next such heading, including lower-level headings,
    form that section's content, joined with newlines. Paragraphs before the
    first "Heading 1" are skipped.

    Relies on ``Document`` (python-docx) and ``pd`` already being imported.

    Args:
        doc_path: Path of the document to open.

    Returns:
        A DataFrame with the columns "Heading" and "Content", one row per
        section in document order. Sections that share a heading text are
        kept as separate rows.
    """
    doc = Document(doc_path)
    rows = []
    current_heading = None
    current_content = []

    for paragraph in doc.paragraphs:
        if paragraph.style.name == 'Heading 1':
            # Close the section that was open before starting the new one.
            if current_heading is not None:
                rows.append((current_heading, "\n".join(current_content)))
            current_heading = paragraph.text
            current_content = []
        elif current_heading is not None:
            current_content.append(paragraph.text)

    # Flush the last section unconditionally. The previous
    # "heading not in data" guard could never be true, so the final section's
    # content was always dropped.
    if current_heading is not None:
        rows.append((current_heading, "\n".join(current_content)))

    return pd.DataFrame(rows, columns=['Heading', 'Content'])

# Directory containing your Word documents (machine-specific absolute path)
doc_directory = "C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\sample_word"

# One DataFrame per document, each tagged with its source file name
dfs = []

# sorted() makes the row order of the combined CSV reproducible.
for filename in sorted(os.listdir(doc_directory)):
    if not filename.endswith(".docx"):
        continue
    doc_path = os.path.join(doc_directory, filename)

    print(f"Processing document: {filename}")
    df = process_document_to_df(doc_path)

    df['Document'] = filename
    dfs.append(df)

if dfs:
    combined_df = pd.concat(dfs, ignore_index=True, sort=False)

    # Optional: save the combined DataFrame to a CSV file
    output_csv_path = os.path.join(doc_directory, 'combined_documents.csv')
    combined_df.to_csv(output_csv_path, index=False)

    print(f"Combined DataFrame saved to {output_csv_path}")
else:
    # pd.concat raises ValueError on an empty list, so report and stop here.
    print(f"No .docx files found in {doc_directory}")