In [1]:
# HYBRID OF TEXT EXTRACTION TOOLS WITH TESSERACT OCR. CONVERT PDF TO MARKDOWN FORMAT.

import os
import fitz
from pdf2image import convert_from_path
import pytesseract
import pypandoc


def extract_text_or_perform_ocr(pdf_path, page_number):
    """Return the text of one PDF page, falling back to Tesseract OCR.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Zero-based index of the page inside the PDF.

    Returns:
        The page's embedded text with surrounding whitespace stripped. When
        that text is empty, the string returned by
        ``pytesseract.image_to_string`` for the rendered page image instead.
    """
    # The context manager closes the document even when page access or text
    # extraction raises; the previous open()/close() pair leaked the handle
    # in that case.
    with fitz.open(pdf_path) as doc:
        text = doc[page_number].get_text().strip()

    if not text:  # No embedded text: render the page to an image and OCR it.
        # pdf2image counts pages from 1, so the zero-based index is shifted by one.
        images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
        text = pytesseract.image_to_string(images[0])

    return text

def convert_pdf_to_md(pdf_path, output_path):
    """Extract the text of every page of a PDF and pass it through Pandoc.

    The text of all pages (OCR is used for pages without embedded text) is
    joined with blank lines, written to a temporary file, and converted by
    Pandoc with ``format='markdown'`` into ``output_path``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: File that Pandoc writes its output to.

    Returns:
        The value returned by ``pypandoc.convert_file``.
        NOTE(review): with ``outputfile`` set this is presumably an empty
        string rather than the converted text - confirm against pypandoc.
    """
    import tempfile  # local import: only needed by this function

    # Only the page count is needed here; each page is read by the helper.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    # Extract text from each page (with OCR for image-based pages)
    texts = [extract_text_or_perform_ocr(pdf_path, pn) for pn in range(page_count)]
    full_text = '\n\n'.join(texts)

    # A unique temporary file avoids clobbering (or being clobbered by) a
    # "temp_extracted_text.txt" in the working directory, e.g. when two
    # conversions run at the same time.
    fd, temp_txt_path = tempfile.mkstemp(prefix='temp_extracted_text_', suffix='.txt')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as temp_file:
            temp_file.write(full_text)

        # Convert the text file to Markdown using Pandoc
        output_md = pypandoc.convert_file(temp_txt_path, 'md', format='markdown', outputfile=output_path)
    finally:
        # Clean up the temporary file even when Pandoc fails.
        os.remove(temp_txt_path)

    return output_md

def batch_convert_pdf_to_md(pdf_folder, output_folder):
    """Run ``convert_pdf_to_md`` on every PDF found directly in a folder.

    Args:
        pdf_folder: Folder whose entries are scanned; only names ending in
            ``.pdf`` (case-insensitive) are converted.
        output_folder: Folder receiving the results; created if it does not
            exist. Each result is named after the PDF with a ``.txt``
            extension.
    """
    output_missing = not os.path.exists(output_folder)
    if output_missing:
        os.makedirs(output_folder)

    for pdf_file in os.listdir(pdf_folder):
        # Skip everything that is not a PDF by name.
        if not pdf_file.lower().endswith('.pdf'):
            continue

        stem, _extension = os.path.splitext(pdf_file)
        md_filename = stem + '.txt'
        convert_pdf_to_md(
            os.path.join(pdf_folder, pdf_file),
            os.path.join(output_folder, md_filename),
        )
        print(f"Converted '{pdf_file}' to Markdown format as '{md_filename}'.")

# Example usage: convert every PDF in `pdf_folder`, writing one .txt file per PDF
# into `output_folder`.
# NOTE(review): both paths are absolute paths on one specific machine - adjust
# them before running this cell elsewhere.
pdf_folder = 'C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\AF_pdf'
output_folder = 'C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\af_txt'
batch_convert_pdf_to_md(pdf_folder, output_folder)


Converted '12038_12038_2020-05-08-SierraLeone_AF-Inception Workshop Report.pdf' to Markdown format as '12038_12038_2020-05-08-SierraLeone_AF-Inception Workshop Report.txt'.
Converted '12038_12038_Project Proposal Sierra Leone Adaptation Fund IFAD_13May2019CLEAN.pdf' to Markdown format as '12038_12038_Project Proposal Sierra Leone Adaptation Fund IFAD_13May2019CLEAN.txt'.
Converted '12039_12039_Clean for posting .pdf' to Markdown format as '12039_12039_Clean for posting .txt'.
Converted '12039_12039_Inception report_AF project_Tajikistan.pdf' to Markdown format as '12039_12039_Inception report_AF project_Tajikistan.txt'.
Converted '12040_12040_PROJECT LAUNCH AND INCEPTION REPORT FOR SAINT LUCIA AF PROJECT.pdf' to Markdown format as '12040_12040_PROJECT LAUNCH AND INCEPTION REPORT FOR SAINT LUCIA AF PROJECT.txt'.
Converted '12040_12040_St Lucia for posting.pdf' to Markdown format as '12040_12040_St Lucia for posting.txt'.
Converted '12093_12093_Congo Clean.pdf' to Markdown format as '120

In [None]:
# ALTERNATIVE EXTRACTION TOOLS WITH EASYOCR. CONVERT PDF TO MARKDOWN FORMAT.

import os
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import easyocr
import pypandoc
import numpy as np  # Add this import at the beginning of your script

def _get_easyocr_reader():
    """Return one shared EasyOCR reader, creating it on first use."""
    reader = getattr(_get_easyocr_reader, "_reader", None)
    if reader is None:
        reader = easyocr.Reader(['en'])  # Specify language(s) as needed
        _get_easyocr_reader._reader = reader
    return reader


def extract_text_or_perform_ocr(pdf_path, page_number):
    """Return the text of one PDF page, falling back to EasyOCR.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Zero-based index of the page inside the PDF.

    Returns:
        The page's embedded text with surrounding whitespace stripped. When
        that text is empty, the EasyOCR results for the rendered page joined
        with single spaces instead.
    """
    # The context manager closes the document even when page access or text
    # extraction raises.
    with fitz.open(pdf_path) as doc:
        text = doc[page_number].get_text().strip()

    if not text:  # No embedded text: render the page to an image and OCR it.
        # pdf2image counts pages from 1, so the zero-based index is shifted by one.
        images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
        image_np = np.array(images[0])  # Convert PIL image to NumPy array
        # Reuse a single reader: constructing easyocr.Reader once per page
        # repeats its initialisation for every OCR'd page.
        results = _get_easyocr_reader().readtext(image_np, paragraph=True)  # `paragraph` groups nearby text
        text = ' '.join([result[1] for result in results])

    return text

def convert_pdf_to_md(pdf_path, output_path):
    """Extract the text of every page of a PDF and pass it through Pandoc.

    The text of all pages (OCR is used for pages without embedded text) is
    joined with blank lines, written to a temporary file, and converted by
    Pandoc with ``format='markdown'`` into ``output_path``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: File that Pandoc writes its output to.

    Returns:
        The value returned by ``pypandoc.convert_file``.
        NOTE(review): with ``outputfile`` set this is presumably an empty
        string rather than the converted text - confirm against pypandoc.
    """
    import tempfile  # local import: only needed by this function

    # Only the page count is needed here; each page is read by the helper.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)

    texts = [extract_text_or_perform_ocr(pdf_path, pn) for pn in range(page_count)]
    full_text = '\n\n'.join(texts)

    # A unique temporary file avoids clobbering (or being clobbered by) a
    # "temp_extracted_text.txt" in the working directory, e.g. when two
    # conversions run at the same time.
    fd, temp_txt_path = tempfile.mkstemp(prefix='temp_extracted_text_', suffix='.txt')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as temp_file:
            temp_file.write(full_text)

        output_md = pypandoc.convert_file(temp_txt_path, 'md', format='markdown', outputfile=output_path)
    finally:
        # Clean up the temporary file even when Pandoc fails.
        os.remove(temp_txt_path)

    return output_md

def batch_convert_pdf_to_md(pdf_folder, output_folder):
    """Convert each PDF in ``pdf_folder`` via ``convert_pdf_to_md``.

    Args:
        pdf_folder: Folder to scan. Entries whose lower-cased name ends in
            ``.pdf`` are converted; all other entries are ignored.
        output_folder: Destination folder, created when it does not exist.
            Output files reuse the PDF's base name with a ``.txt`` extension.
    """
    if os.path.exists(output_folder):
        pass  # Destination already present; nothing to create.
    else:
        os.makedirs(output_folder)

    for pdf_file in os.listdir(pdf_folder):
        is_pdf = pdf_file.lower().endswith('.pdf')
        if is_pdf:
            md_filename = os.path.splitext(pdf_file)[0] + '.txt'
            source_path = os.path.join(pdf_folder, pdf_file)
            target_path = os.path.join(output_folder, md_filename)
            convert_pdf_to_md(source_path, target_path)
            print(f"Converted '{pdf_file}' to Markdown format as '{md_filename}'.")

# Example usage: convert every PDF in the 'pdf' folder and write one .txt file
# per PDF into the 'txt' folder. Both are relative paths, so they are resolved
# against the current working directory.
pdf_folder = 'pdf'
output_folder = 'txt'
batch_convert_pdf_to_md(pdf_folder, output_folder)


In [None]:
import os
import re

def process_text(text):
    """Join hard-wrapped lines of extracted text.

    Two substitutions are applied in order:

    1. letter + newline + uppercase letter becomes ``letter. Uppercase``
       (a period is inserted because the line break is taken to be a
       sentence boundary);
    2. any remaining letter + newline becomes letter + space.

    A newline that follows a non-letter (digit, punctuation, another
    newline) is left untouched. Note that rule 2 also fires when the letter
    is followed by a blank line, so ``"a\\n\\nB"`` becomes ``"a \\nB"``.

    Args:
        text: The text to clean up.

    Returns:
        The text after both substitutions.
    """
    sentence_break = r"([a-zA-Z])(\n)([A-Z])"
    wrapped_line = r"([a-zA-Z])\n"

    with_periods = re.sub(sentence_break, r"\1. \3", text)
    joined = re.sub(wrapped_line, r"\1 ", with_periods)
    # Disabled step kept for reference: re.sub(r"\s{2,}", " ", text) would
    # replace runs of whitespace with a single space, if needed.
    return joined

def process_files(directory_path):
    """Apply ``process_text`` to every ``.txt`` file in a directory.

    Each matching file is read as UTF-8, cleaned, and written back under the
    same base name with a ``.txt`` extension, i.e. normally over itself. A
    failure on one file is reported with ``print`` and does not stop the
    remaining files.

    Args:
        directory_path: Folder to scan. When it does not exist a message is
            printed and nothing else happens.
    """
    if not os.path.exists(directory_path):
        print("Directory does not exist:", directory_path)
        return

    # Only names ending in ".txt" (case-sensitive) are processed.
    candidates = [name for name in os.listdir(directory_path) if name.endswith(".txt")]

    for filename in candidates:
        filepath = os.path.join(directory_path, filename)
        # Output name: same stem, ".txt" extension.
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_filepath = os.path.join(directory_path, txt_filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as source:
                content = source.read()

            processed_content = process_text(content)

            with open(txt_filepath, 'w', encoding='utf-8') as target:
                target.write(processed_content)
            print(f"Processed and saved {txt_filename}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Clean up every .txt file in the 'txt' folder (relative to the current working
# directory).
directory_path = 'txt'
process_files(directory_path)


# Works for .docx, .docm, and .dotx

Note: the code cell below only picks up files matching `*.docx`.

TODO: .doc and .dot are not handled yet. Check with Cesare whether manual conversion would be faster.

In [6]:
import os
from docx import Document
import glob

def process_document(doc_path):
    """Flatten a Word document into plain text.

    The text of every paragraph comes first, one per line. All tables follow
    after the paragraphs (not at their original position in the document),
    each wrapped in ``[Table Start]`` / ``[Table End]`` markers with one line
    per row and cells separated by `` | ``. Newlines inside a cell are
    replaced by spaces.

    Args:
        doc_path: Path of the document to open with ``Document``.

    Returns:
        The flattened text as a single string.
    """
    print(f"Processing {doc_path}...")
    doc = Document(doc_path)

    # Paragraphs: one output line each.
    output = [paragraph.text + "\n" for paragraph in doc.paragraphs]

    # Tables: appended after all paragraphs.
    for table in doc.tables:
        output.append("\n[Table Start]\n")
        for row in table.rows:
            cell_texts = (cell.text.replace('\n', ' ').strip() for cell in row.cells)
            output.append(" | ".join(cell_texts) + "\n")
        output.append("[Table End]\n")

    return ''.join(output)

def save_text(output_text, output_path):
    """Write ``output_text`` to ``output_path`` as UTF-8, replacing any existing file.

    Args:
        output_text: The string to write.
        output_path: Destination file path.
    """
    handle = open(output_path, mode='w', encoding='utf-8')
    try:
        handle.write(output_text)
    finally:
        # Always release the file, matching what a `with` block would do.
        handle.close()

def process_folder(folder_path, output_folder):
    """Convert every ``.docx`` file in a folder to a text file.

    Each document is flattened with ``process_document`` and stored with
    ``save_text`` in ``output_folder`` under the document's base name with a
    ``.txt`` extension.

    Args:
        folder_path: Folder searched (non-recursively) for ``*.docx`` files.
        output_folder: Folder receiving the ``.txt`` files; created if it
            does not exist.
    """
    # Create the destination up front; without this the first save fails
    # when the folder is missing. An empty string means "current directory".
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for doc_path in glob.glob(os.path.join(folder_path, '*.docx')):
        base_name = os.path.basename(doc_path)

        # Word creates "~$name.docx" lock files next to open documents; they
        # match the pattern but are not real documents.
        if base_name.startswith('~$'):
            continue

        structured_text = process_document(doc_path)

        # Swap only the final extension. str.replace('.docx', '.txt') missed
        # differently-cased extensions (e.g. ".DOCX" on a case-insensitive
        # filesystem) and rewrote '.docx' anywhere inside the name.
        stem, _extension = os.path.splitext(base_name)
        output_path = os.path.join(output_folder, stem + '.txt')

        save_text(structured_text, output_path)
        print(f"Saved processed text to {output_path}")

# Path to your folder containing Word documents
# NOTE(review): absolute path on one specific machine - adjust before running elsewhere.
folder_path = "C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\af_word_docs"

# Desired output folder for the text files (one .txt per Word document)
output_folder ='C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\af_word_text'

# Process all documents in the folder
process_folder(folder_path, output_folder)


Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_word_docs\10_10_Madagascar Adaptation Fund Project_MTR report AFrice Final_clean.docx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_word_text\10_10_Madagascar Adaptation Fund Project_MTR report AFrice Final_clean.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_word_docs\12094_DiMMA Gender & Youth  Strategy 2021.docx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_word_text\12094_DiMMA Gender & Youth  Strategy 2021.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_word_docs\12100_INCEPTION REPORT ENGLISH-HAI.docx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_word_text\12100_INCEPTION REPORT ENGLISH-HAI.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_word_docs\1430_1430_3 - AF Preconcept ACREI 8th February Clean 4 FEB.docx...
Saved processed text to

# Excel: .xls and .xlsx


In [3]:
import os
import glob
import pandas as pd
import xlrd 

def excel_to_txt(excel_path, txt_path):
    """Dump every sheet of an Excel workbook into one text file.

    For each sheet a ``--- <sheet name> ---`` line is written, followed by
    one line per row with the cells separated by tabs, followed by an empty
    line. Every non-empty cell gets a trailing period; empty cells are
    written as empty strings.

    Args:
        excel_path: Path of the workbook to read.
        txt_path: Path of the text file to write (UTF-8, overwritten).
    """
    with pd.ExcelFile(excel_path) as xls:
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for sheet_name in xls.sheet_names:
                txt_file.write(f"--- {sheet_name} ---\n")
                # header=None keeps the first spreadsheet row as data. With
                # the default header handling pandas turns that row into
                # column labels, which iterrows() never yields, so the first
                # row of every sheet was missing from the output.
                df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
                for _index, row in df.iterrows():
                    # Process each cell, append a period if the cell is not empty.
                    row_values = [(str(cell) + '.' if pd.notnull(cell) and str(cell).strip() != '' else '') for cell in row]
                    txt_file.write('\t'.join(row_values) + '\n')
                txt_file.write('\n')  # Blank line between sheets
                
def process_folder(source_folder, target_folder):
    """Convert every Excel workbook in a folder to a text file.

    Each file matching ``*.xls*`` is passed to ``excel_to_txt``; the result
    is stored in ``target_folder`` under the same name with the final
    extension replaced by ``.txt``.

    Args:
        source_folder: Folder searched (non-recursively) for workbooks.
        target_folder: Folder receiving the ``.txt`` files; created if it
            does not exist.
    """
    # Create the destination up front; without this the first write fails
    # when the folder is missing. An empty string means "current directory".
    if target_folder:
        os.makedirs(target_folder, exist_ok=True)

    # '*.xls*' covers both .xls and .xlsx (and any other extension that
    # starts with ".xls").
    for excel_file in glob.glob(os.path.join(source_folder, '*.xls*')):
        file_name = os.path.basename(excel_file)

        # Excel creates "~$name.xlsx" lock files next to open workbooks; they
        # match the pattern but are not real workbooks.
        if file_name.startswith('~$'):
            continue

        # Drop only the final extension and use .txt instead.
        txt_file_name = file_name.rsplit('.', 1)[0] + '.txt'
        txt_path = os.path.join(target_folder, txt_file_name)

        print(f"Processing {excel_file}...")
        excel_to_txt(excel_file, txt_path)
        print(f"Saved processed text to {txt_path}")

# Update these paths to your source and target folders
# NOTE(review): both are absolute paths on one specific machine.
source_folder = r"C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel"
target_folder = r"C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text"

# Convert every workbook found in source_folder into a .txt file in target_folder.
process_folder(source_folder, target_folder)


Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\10_10_ATTRZVIH - 1st PPR for website.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\10_10_ATTRZVIH - 1st PPR for website.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\10_10_For Website_AF RICE PPR - final for AF submission - revised 270117.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\10_10_For Website_AF RICE PPR - final for AF submission - revised 270117.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\10_10_For Website_Madagascar PPRA Oct 2013-oCT 2014 REVISED-Oct  2015. FINAL for AF.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\10_10_For Website_Madagascar PPRA Oct 2013-oCT 2014 REVISED-Oct  2015. FINAL for AF.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\10_10_PPR Mada

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\12_12_For web PPR_PACV_AF_2019 060120_Final.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\12_12_For Website_4789_AF_Mali_PPR 2_revised June 2018.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\12_12_For Website_4789_AF_Mali_PPR 2_revised June 2018.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\12_12_For Website_Revised PPR _ Mali AF Project PIMS 4789_18 Oct 2017.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\12_12_For Website_Revised PPR _ Mali AF Project PIMS 4789_18 Oct 2017.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\12_12_web_Final PPR PIMS_4789_du 9 mars 2021.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\12_12_web_Final PPR PIMS_4789_du 9 mars 2021.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1417_1417_PPR II-Ayninacuy_Project_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1417_1417_PPR II-Ayninacuy_Project_for_web.txt
Proces

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1418_1418_PPR 1 AF KPC YI Apr17-Mar18_31 January 2019_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1418_1418_Revised PPR_AF_KPC_Apr18-Mar19_Final_For web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1418_1418_Revised PPR_AF_KPC_Apr18-Mar19_Final_For web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1418_1418_Revised PPR_AF_KPC_YIII_Apr 19 - Mar 20_ Final_for web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1418_1418_Revised PPR_AF_KPC_YIII_Apr 19 - Mar 20_ Final_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1419_1419_PPR-CSE Senegal Project_2021_For_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1419_1419_PPR-CSE Senegal Project_2021_For_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1419_1419_PPR1-CSE Senegal Project_2019_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1419_1419_PPR1-CSE Senegal Project_2019_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1419_1419_PPR2-CSE Senegal Project_2020_VF_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1419_1419_PPR2-CSE Senegal Project_2020_VF_for_web.txt
Processing C:\Users\

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1424_1424_CLEARED WEB_PPR1 13 November 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1424_1424_PPR3_FINAL_WEB.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1424_1424_PPR3_FINAL_WEB.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1424_1424_web_PPR2 of 23 October 2020.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1424_1424_web_PPR2 of 23 October 2020.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1429_1429_PPR 2020-2021_Profonanpe_final_rev_VF_for_web.xlsx...


  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1429_1429_PPR 2020-2021_Profonanpe_final_rev_VF_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1429_1429_PPR1_2019-2020_Profonanpe_for_web.xlsx...


  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1429_1429_PPR1_2019-2020_Profonanpe_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1429_PPR 2021-2022_Profonanpe_VF _ web.xlsx...


  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1429_PPR 2021-2022_Profonanpe_VF _ web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1430_1430_2020_Report_MS_ACREI_PPPR Year 2_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1430_1430_2020_Report_MS_ACREI_PPPR Year 2_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1430_1430_2021_Report_WMO_ACREI_PPR Year3 Revised v5c - for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1430_1430_2021_Report_WMO_ACREI_PPR Year3 Revised v5c - for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1430_1430_ACREI PPR Year 1_Oct 2019_Final_Public.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1430_1430_ACREI PPR Year 1_Oct 2019_Final_Public.txt
Processing

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1434_1434_web_PPR_Adapting_to_Climate_Change_Jan 2020.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\1434_1434_WEB_Project 2019-2020_ppr.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\1434_1434_WEB_Project 2019-2020_ppr.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\14500_PPR-AF Syria-2022-final- IE responses update- finance adjusted_for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\14500_PPR-AF Syria-2022-final- IE responses update- finance adjusted_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\14_14_For Webiste_PPR3-AF-Mauritania_Sept2016-Aug2017.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\14_14_For Webiste_PPR3-AF-Mauritania_Sept2016-A

  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\16_16_For web_ no procurement PIMS 4453 Revised PPR 2017 dated 29 April 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\16_16_For web_no procurement PIMS 4453 Revised PPR 2018 dated 29 April 2019.xlsx...


  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\16_16_For web_no procurement PIMS 4453 Revised PPR 2018 dated 29 April 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\18_18_1st PPR for website.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\18_18_1st PPR for website.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\18_18_Copy of RWANDA-PPR-FINAL_19-August-2017.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\18_18_Copy of RWANDA-PPR-FINAL_19-August-2017.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\18_18_RWANDA PROJECT - PROJECT PERFORMANCE REPORT - FINAL - for website.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\18_18_RWANDA PROJECT - PROJECT PERFORMANCE REPORT - FINAL - for website.txt
Processing C:\

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\19_19_For Website_4775_AF_Seychelles EBA PPR_15 Dec 15.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\19_19_For Website_4775_AF_Seychelles EBA PPR_Jul 2016.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\19_19_For Website_4775_AF_Seychelles EBA PPR_Jul 2016.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\19_19_For Website_4775_AF_Seychelles-EBA-PPR_Oct-2017.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\19_19_For Website_4775_AF_Seychelles-EBA-PPR_Oct-2017.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\19_19_WEB_Revised PPR 2018 EBA project_17 Sep 2018.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\19_19_WEB_Revised PPR 2018 EBA project_17 Sep 2018.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\20_20_For Website_2nd half year report_CSE_17Febr2012.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\20_20_For Website_2nd half year report_CSE_17Febr2012.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\20_20_For website_CSE_4th -Half Year Report_March2014_revised.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\20_20_For website_CSE_4th -Half Year Report_March2014_revised.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\20_20_For Website_CSE_5th-Report-20_Sept-2013_v2.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\20_20_For Website_CSE_5th-Report-20_Sept-2013_v2.txt

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\24_24_SANBI_URP For Web_PPR Y2_180328_update_180531.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\24_24_SANBI_URP_Y5 PPR_update 210526_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\24_24_SANBI_URP_Y5 PPR_update 210526_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\25_25_FOR WEBSITE SANBI_SGF_Y2 PPR_180312.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\25_25_FOR WEBSITE SANBI_SGF_Y2 PPR_180312.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\25_25_For Website_SANBI_Small-Grants-Facility_Year-1-PPR_25-11-2016_revised-13-02-2017.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\25_25_For Website_SANBI_Small-Grants-Facility_Year-1-PPR_25-11-2016_revised-13-02-2017.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\25_25_SANBI_Report_Year 4 PPR_SGF_Final_for_web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\25_25_SANBI_Report_Year 4 PPR_SGF_Final_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\25_25_SANBI_SGF_5_PPR_For_web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\25_25_SANBI_SGF_5_PPR_For_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\25_25_web_SGF Year 3_PPR - Revised Final_Submitted to AF_190514.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\25_25_web_SGF Year 3_PPR - Revised Final_Submitted to AF_190514.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\25_SANBI SGF_Project Completion Summary_Expenditure Report_220923.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\25_SANBI SGF_Project Completion Summary_Expenditure Report_220923.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\26_26_4569_AF_PPR.SRIC.CC.Cook.Islands_26nov2015_no_procurement.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\26_26_4569_AF_PPR.SRIC.CC.Co

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3062_2022-Feb_Y5PPR_Laos Y5_revised_20230428 - for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3062_3062_For Website_PPR-2017-2018 - LAO_PDR submission 022218.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3062_3062_For Website_PPR-2017-2018 - LAO_PDR submission 022218.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3062_3062_For web_PPR_LAO_MIE_DRR_2016_1_03.30.20.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3062_3062_For web_PPR_LAO_MIE_DRR_2016_1_03.30.20.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3062_3062_Y3PPR_Laos  RevV2 For web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3062_3062_Y3PPR_Laos  RevV2 For web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3062_Copy of 2021-Feb_Y4PPR_Laos Y4_Revised_13 August 2021_Final_For web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3062_Copy of 2021-Feb_Y4PPR_Laos Y4_Revised_13 August 2021_Final_For web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3062_Y6PPR_Laos_20240228sent - for web.xlsx...


  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3062_Y6PPR_Laos_20240228sent - for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3066_3066_web_Copy of PPR1_WFP Colombia-Ecuador_2018-2019_15October2019.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3066_3066_web_Copy of PPR1_WFP Colombia-Ecuador_2018-2019_15October2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3066_3066_web_Copy of PPR3 Colombia-Ecuador adjusted.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3066_3066_web_Copy of PPR3 Colombia-Ecuador adjusted.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3066_3066_WEB_PPR2_WFP Colombia-Ecuador_2019-2020 1 October 2020.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3066_3066_WEB_PPR2_WFP Colombia-Ecuador_2019-2020 1 October 2020.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\3066_web_Copy of PPR4 Regional project Colombia-Ecuador Feb2023.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3066_web_

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\3070_3070_Uzbekistan_PPR-Template_Amended-October-2019-GLOFCA-CLEAN_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\31_31_Climate Proofing PPR-1_9 May 2018.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\31_31_Climate Proofing PPR-1_9 May 2018.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\31_31_PPR 2 for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\31_31_PPR 2 for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\32_32_WEB_Revised TAAL PPR 1 May 2018.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\32_32_WEB_Revised TAAL PPR 1 May 2018.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\32_32_WEB_Revised TAAL PPR 2 May

  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\37_37_web_UNEP_Cambodia AF_PPR_Year 6.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\37_37_web_UNEP_Cambodia AF_PPR_Year 7_revised_8Dec2020.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\37_37_web_UNEP_Cambodia AF_PPR_Year 7_revised_8Dec2020.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\38_38_AgriCal IFAD_AF_revised_PPR_2020_public.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\38_38_AgriCal IFAD_AF_revised_PPR_2020_public.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\38_38_IFAD-AF-Lebanon-AgriCAL-PPR2-2020-2021_07192021_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\38_38_IFAD-AF-Lebanon-AgriCAL-PPR2-2020-2021_07192021_for_web.txt
Processing C:\

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\39_39_Copy of PPR6 AF Sri Lanka June 2021_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\39_39_For web PPR3 AF Sri Lanka _Sept2016-Aug2017_resubmitted_July 2019.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\39_39_For web PPR3 AF Sri Lanka _Sept2016-Aug2017_resubmitted_July 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\39_39_For website_Sri Lanka Year 1 PPR - 12 Nov 2015 .xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\39_39_For website_Sri Lanka Year 1 PPR - 12 Nov 2015 .txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\39_39_For Website_Sri Lanka Year 2 PPR -  2016  - 20 Jan 2017.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\39_39_For Website_

  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\40_40_WEB_4703_AF_Myanmar_Final PPR_revised 6 April 2020.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4131_4131_PPR1 Artik report_For_Web.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4131_4131_PPR1 Artik report_For_Web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4131_4131_PPR2-Artik-report_cleared_for web.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4131_4131_PPR2-Artik-report_cleared_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4131_Artik city closed stonepit wastes and flood management pilot project_PPR3_for_web.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4131_Artik city closed stonepit wastes and flood management pilot project_PPR3_f

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4133_4133_Fund PPR-2nd Year 25nov.2021_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4133_4133_web_PPR-1 IDDI Resilience Project 24.7.2020.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4133_4133_web_PPR-1 IDDI Resilience Project 24.7.2020.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4133_WEB_Adapt. Fund PPR-Year3. 14.12.2022.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4133_WEB_Adapt. Fund PPR-Year3. 14.12.2022.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4135_4135_Copy of PPR 2 Resilient Honiara Project for web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4135_4135_Copy of PPR 2 Resilient Honiara Project for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4135_4135_PPR 1 Resilient Honiara Project 2019  rev Feb 2020 for web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4135_4135_PPR 1 Resilient Honiara Project 2019  rev Feb 2020 for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4135_Revised PPR 3 for web Resilient Honiara Project 2021 UN-Habitat.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4135_Revised PPR 3 for web Resilient Honiara Project 2021 UN-Habitat.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4136_4136_Copy of PPR 1st report Fiji Resilient 2018-19 for web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4136_4136_Copy of PPR 1st report Fiji Resilient 2018-19 for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4136_4136_PPR 2nd report Fiji Resilient 2019-2020 comments UN-H for web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4136_4136_PPR 2nd report Fiji Resilient 2019-2020 comments UN-H for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4136_Revised PPR 3rd for web Fiji Resilient 2020-2021 comments UN-Habitat.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4136_Revised PPR 3rd for web Fiji Resilient 2020-2021 comments UN-Habitat.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4137_4137_web_5839 PPR2 ADAPTARC 24jun21.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4137_4137_web_5839 PPR2 ADAPTARC 24jun21.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\4137_4137_web_5839_AF_Honduras_PPR_final_rvChecklist17May2020.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\4137_4137_web_583

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\42_42_4582_AF_Maldives_IWRM - PPR_final_2015_November_final_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\42_42_For Website_4582_AF_Maldives_IWRM_PPR_ 2013_resubmission_MI-v01-18Oct2013.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\42_42_For Website_4582_AF_Maldives_IWRM_PPR_ 2013_resubmission_MI-v01-18Oct2013.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\43_43_Nepal 3rd PPR revised_9Jun2022 - for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\43_43_Nepal 3rd PPR revised_9Jun2022 - for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\43_43_Nepal PPR 2 17-12-2020_final_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\43_43_Nepal PPR 2

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\45_45_For Website_PPR 3_PNG-AF_2015-PPR_05nov2015_final_for-web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\46_46_For Website_Pakistan_PPR_Final_May_2013.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\46_46_For Website_Pakistan_PPR_Final_May_2013.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\46_46_For Website_PPR3-AF-Pakistan.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\46_46_For Website_PPR3-AF-Pakistan.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\46_46_For website_RESUBMISSION_Revised 2013 PPR Pakistan GLOF Project 26 June.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\46_46_For website_RESUBMISSION_Revised 2013 PPR Pakistan GLOF Project 26 June.

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\50_50_WEB_5002_AF_Uzbekistan_PPR_revised_Dec 17_2018.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\50_50_web_PPR 5002_ AP _PPR_2020_updated_051120.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\50_50_web_PPR 5002_ AP _PPR_2020_updated_051120.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\50_50_WEB__Uzbekistan_PPR_2019_revised.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\50_50_WEB__Uzbekistan_PPR_2019_revised.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5169_2020 report- revised version_September 21 2023 - for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5169_2020 report- revised version_September 21 2023 - for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5169_5169_AF 2019 report_Nov2020_for_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5169_5169_AF 2019 report_Nov2020_for_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5169_5169_Climate Smart Integrated Rural Development Project PPR1_web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5169_5169_Climate Smart Integrated Rural Development Project PPR1_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5192_5192_Antigua and Barbuda January 2019 - June 2020_Revised PPR Feb 2021_For web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5192_5192_Antigua and Barbuda January 2019 - June 2020_Revised PPR Feb 2021_For web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5192_5192_PPR 3 for Antigua and Barbuda.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5192_5192_PPR 3 for Antigua and Barbuda.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5192_5192_WEB_Revised PPR Adaptation Fund.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5192_5192_WEB_Revised PPR Adaptation Fund.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5192_Antigua and Barbuda January 2022 - December  2022_Final vr_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5192_Antigua and Barbuda January 2022 - December  2022_Final vr_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5194_5194_web_210726_FSM AF 2020 Annual Report.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5194_5194_web_210726_FSM AF 2020 Annual Report.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5194_5194_web_Annual Progress Report 2018_FSM_FSM_RIE.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5194_5194_web_Annual Progress Report 2018_FSM_FSM_RIE.t

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\51_51_For Website_4667 AF Samoa_ PPR FINAL submission Apr 2015.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\51_51_For Website_4667_AF_Samoa_PPR 29Aug2016.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\51_51_For Website_4667_AF_Samoa_PPR 29Aug2016.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\51_51_For Website_RESUBMISSION_PPR AF Samoa - 28May2014.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\51_51_For Website_RESUBMISSION_PPR AF Samoa - 28May2014.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5281_5281_Revised PPR 1 March 2022 for web.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5281_5281_Revised PPR 1 March 2022 for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\5281_PPR2-ADAPT-WAP_Year2_April21-March22_OSS_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\5281_PPR2-ADAPT-WAP_Year2_April21-March22_OSS_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\52_52_4583_Georgia_AF_PPR_ Sep2015_revised_no_procurement.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\52_52_4583_Georgia_AF_PPR_ Sep2015_revised_no_procurement.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\52_52_for web_4583_Georgia_AF_PPR_May 2017.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\52_52_for web_4583_Georgia_AF_PPR_May 2017.txt
Processing C:\Users\david\My Driv

  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\53_53_1 PPR web_ARG-MIE-Rural-2011-1-P125804.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\53_53_Argentina-PPR-Feb-2018-Rev-ARG-MIE-Rural-2011-1-World-Bank-.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\53_53_Argentina-PPR-Feb-2018-Rev-ARG-MIE-Rural-2011-1-World-Bank-.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\53_53_WEB_AF PPRT Argentina World Bank July 2017 June 2018 final rev WB final.xlsx...


  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\53_53_WEB_AF PPRT Argentina World Bank July 2017 June 2018 final rev WB final.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\54_54_For web_NorthEast ARGENTINA - Annual Report Year 1-v3.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\54_54_For web_NorthEast ARGENTINA - Annual Report Year 1-v3.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\54_54_For web_PPR 2016 NorthEast ARGENTINA 2016.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\54_54_For web_PPR 2016 NorthEast ARGENTINA 2016.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\54_54_NorthEast ARGENTINA - Annual Report Year2_final_for web.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\54_54_NorthEast ARGENTINA - 

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\55_55_For web_PPR_MCCAP 2017 BLZMIECoastal2018-3.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\55_55_MCCAP Adaptation Fund Progress Report 2019_FOR WEB.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\55_55_MCCAP Adaptation Fund Progress Report 2019_FOR WEB.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\55_PPR5_Amended_MCCAP2020ext_Final_web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\55_PPR5_Amended_MCCAP2020ext_Final_web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\56_56_Chile AGCID PPR 3 final_for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\56_56_Chile AGCID PPR 3 final_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cg

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\56_56_No procurement_1er PPR  Proyecto CHL-NIE-Agri-2013-1_27112018.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\57_57_1st and 2nd PPR Colombia for website.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\57_57_1st and 2nd PPR Colombia for website.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\57_57_4805_AF_Colombia_PPR_resubmission - for website.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\57_57_4805_AF_Colombia_PPR_resubmission - for website.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\57_57_4805_AF_Colombia_revised PPR_Jul 17_for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\57_57_4805_AF_Colombia_revised PPR_Jul 17_for web.txt
Processing C:\Us

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\57_57_AF_Colombia_PPR Jun 2018 for website resub 26Jul18.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\57_57_Copy of 4805_AF_Colombia_PPR_Jun 2018_for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\57_57_Copy of 4805_AF_Colombia_PPR_Jun 2018_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\57_57_web_Copy of PPR_AF_Colombia_4805 05July19 for resub.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\57_57_web_Copy of PPR_AF_Colombia_4805 05July19 for resub.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\58_58_CLEARED WEB_PPRTemplate_1-Fundecooperacion-3Year Final-19-12-18.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\58_58_CLEARED WEB_PPRTemplate_1-Fu

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\61_61_4386_AF_Guatemala PPR 2016 15DEC2016 - for website.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\61_61_4386_AF_Guatemala_PPR 8 May 18_final_for web_.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\61_61_4386_AF_Guatemala_PPR 8 May 18_final_for web_.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\61_61_For WEB_4386_AF_Guatemala_PPR_24 Jul 17.xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\61_61_For WEB_4386_AF_Guatemala_PPR_24 Jul 17.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\61_61_web_PPR Guatemala 4386 -final report - 27 March 2019 resubmission.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\61_61_web_PPR Guatemala 4386 -final report - 27 March 2019 resubmission.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\62_62_AF_ ProjectPerformanceReport_2012 Honduras_October2012_no procurement.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\62_62_AF_ ProjectPerformanceReport_2012 Honduras_October2012_no procurement.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\62_62_AF_ ProjectPerformanceReport_2013 Honduras_Final_no procurement.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_

  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\63_63_for web_AF_PPR_Year 4_Jan172017.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\63_63_Forweb_PPR7.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\63_63_Forweb_PPR7.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\63_63_GOJ-AF_PPR_October-2021 PPR 9 for web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\63_63_GOJ-AF_PPR_October-2021 PPR 9 for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\63_AF_PPR_Year 5_Nov2017_Amended final_for web.xlsx...


  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\63_AF_PPR_Year 5_Nov2017_Amended final_for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\63_AF_PPR_Year 6_2018_final - for web.xlsx...


  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\63_AF_PPR_Year 6_2018_final - for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\64_64_4448 Nicaragua AF PPR 2014 submission_18March2015rev_no procurement.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\64_64_4448 Nicaragua AF PPR 2014 submission_18March2015rev_no procurement.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\64_64_PPR- ESTERO REAL -JULIO 2014 A JUNIO 2015 -  NUEVAS  INCORPORACIONES - 30SEP2_noprocurement.xls...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\64_64_PPR- ESTERO REAL -JULIO 2014 A JUNIO 2015 -  NUEVAS  INCORPORACIONES - 30SEP2_noprocurement.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\64_64_PPR_Nicaragua_2013_AF_Final_no procurement.xls...
Saved processed text to C:\Users\david\My Dr

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\69_69_For website_ Project Performance Report 2_22102013_revised.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\69_69_For Website_PPR 3_Template 2015_9 agosto.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\69_69_For Website_PPR 3_Template 2015_9 agosto.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\69_69_For Website_PPR 4_Template 2016 - 2.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\69_69_For Website_PPR 4_Template 2016 - 2.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\69_69_For website_Uruguay PPR 1 final .xlsx...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\69_69_For website_Uruguay PPR 1 final .txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\69_69_web_Copy of PPRTemplate 2017_26 march 2019.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\69_69_web_Copy of PPRTemplate 2017_26 march 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\69_69_web_PPR Template 2018.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\69_69_web_PPR Template 2018.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\69_69_web_PPRTemplate 2019.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\69_69_web_PPRTemplate 2019.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\6_6_CLEARED FOR WEB_Copy of 4952_AF_Ghana_PPR_Aug 2019_re

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\9969_30012023PPR_Year 1_SWAHAT revised 22 Feb 2023 (Rev4) - for web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\9_9_ADA-PPR_2019_For_Web.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\9_9_ADA-PPR_2019_For_Web.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\9_9_For Website_170412 PPR_PACCZO-2016.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\9_9_For Website_170412 PPR_PACCZO-2016.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\9_9_web 280119_ADA-Morocco_PPR FINAL.xlsx...
Saved processed text to C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel_text\9_9_web 280119_ADA-Morocco_PPR FINAL.txt
Processing C:\Users\david\My Drive\data\analysis_git_data\cgiar\af_excel\9_9_web_ADA-PPR_2020_V reviwed_cle

Save each heading of a Word document in its own column

TODO: Check with Cesare whether the following approaches can work. They still need a fair amount of cleaning, but could also serve as a way to split sentences.

In [2]:
import pandas as pd
def process_document_to_df(doc_path):
    """Collect the paragraphs of a Word document into one column per heading.

    Every paragraph whose style name starts with 'Heading' opens a column named
    after the heading's text; the paragraphs that follow it, up to the next
    heading, become that column's values. Paragraphs that appear before the
    first heading are skipped.

    Args:
        doc_path: Path handed to ``Document``.

    Returns:
        pandas.DataFrame with one column per distinct heading text. Columns
        shorter than the longest one are padded with NaN.

    Note:
        ``Document`` must already be in scope (presumably python-docx's
        ``docx.Document``). NOTE(review): the recorded run of this cell fails
        with ``NameError: name 'Document' is not defined`` -- confirm the import.
    """
    doc = Document(doc_path)
    data = {}
    current_heading = None

    for paragraph in doc.paragraphs:
        if paragraph.style.name.startswith('Heading'):
            current_heading = paragraph.text
            # setdefault: a heading text that occurs again keeps accumulating
            # into its existing column instead of wiping what was collected.
            data.setdefault(current_heading, [])
        elif current_heading is not None:
            # `is not None` rather than truthiness, so text that follows a
            # blank heading paragraph is kept instead of silently dropped.
            data[current_heading].append(paragraph.text)

    # Wrapping each list in a Series lets columns of different lengths coexist.
    return pd.DataFrame({heading: pd.Series(values) for heading, values in data.items()})


# Path to the Word document to process (absolute, machine-specific path).
doc_path = "C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\AF_docs\\50_50_MTR-AF-Uzbekistan-Report-FINAL2.docx"

# Output location (a directory, not a text file). NOTE(review): this variable is
# not used anywhere in this cell -- confirm whether a later cell relies on it.
output_path ='C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar'

# Build the heading-per-column DataFrame for the document. Despite the name,
# `structured_text` holds the DataFrame returned by process_document_to_df.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'Document' is not defined", so `Document` has to be imported
# before this call can succeed.
structured_text = process_document_to_df(doc_path)




NameError: name 'Document' is not defined

Try saving only the text between consecutive Heading 1 (H1) paragraphs

In [None]:

def process_document_to_df(doc_path):
    """Split a Word document into one row per 'Heading 1' section.

    A section starts at a paragraph whose style name is exactly 'Heading 1'
    and runs until the next such paragraph (or the end of the document). All
    other paragraphs inside a section -- including lower-level headings -- are
    joined with newlines into the section's content. Paragraphs before the
    first 'Heading 1' are skipped.

    This redefines the `process_document_to_df` from the earlier cell, which
    returns one column per heading instead.

    Args:
        doc_path: Path handed to ``Document``.

    Returns:
        pandas.DataFrame with columns ['Heading', 'Content'], one row per
        section in document order. Sections that share the same heading text
        are kept as separate rows. The frame is empty (but has both columns)
        when the document contains no 'Heading 1' paragraph.

    Note:
        ``Document`` must already be in scope (presumably python-docx's
        ``docx.Document``) -- NOTE(review): confirm it is imported before use.
    """
    doc = Document(doc_path)
    sections = []  # (heading, content) records in document order
    current_heading = None
    current_content = []

    for paragraph in doc.paragraphs:
        if paragraph.style.name == 'Heading 1':
            # Close the section that was open before starting the new one.
            if current_heading is not None:
                sections.append((current_heading, "\n".join(current_content)))
            current_heading = paragraph.text
            current_content = []
        elif current_heading is not None:
            current_content.append(paragraph.text)

    # Flush the final section: nothing follows it to trigger the flush above.
    if current_heading is not None:
        sections.append((current_heading, "\n".join(current_content)))

    return pd.DataFrame(sections, columns=['Heading', 'Content'])

# Directory containing your Word documents
doc_directory = "C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\sample_word"

# One DataFrame per document, each tagged with the file it came from.
dfs = []

# sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
# the row order of the combined frame reproducible between runs.
for filename in sorted(os.listdir(doc_directory)):
    # Case-insensitive extension match (same convention as the PDF batch
    # converter). Skip the "~$..." owner files Word leaves next to a document
    # while it is open; they are not real documents.
    if not filename.lower().endswith(".docx") or filename.startswith("~$"):
        continue

    doc_path = os.path.join(doc_directory, filename)

    print(f"Processing document: {filename}")
    df = process_document_to_df(doc_path)

    df['Document'] = filename
    dfs.append(df)

# The combined CSV is written next to the source documents.
output_csv_path = os.path.join(doc_directory, 'combined_documents.csv')

if dfs:
    combined_df = pd.concat(dfs, ignore_index=True, sort=False)
    combined_df.to_csv(output_csv_path, index=False)
    print(f"Combined DataFrame saved to {output_csv_path}")
else:
    # pd.concat raises ValueError on an empty list; keep `combined_df` defined
    # (as an empty frame) and report instead of crashing the cell.
    combined_df = pd.DataFrame(columns=['Heading', 'Content', 'Document'])
    print(f"No .docx files found in {doc_directory}; nothing saved.")