# Notebook: Convert the CSVs with the corrected OCR to TXT files
Each CSV file is converted into exactly one TXT file.

In [None]:
# Connect to Google Drive when the notebook runs on Google Colab.
# The google.colab package only exists inside the Colab runtime, so the import
# is guarded: on any other kernel the mount is skipped instead of aborting
# "Restart & Run All" with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


## 1. Import libraries

In [None]:
import os
import pandas as pd

## 2. Function that takes the path of a folder of CSVs and writes one TXT file per CSV

In [None]:
def convert_csv_to_txt(folder_path):
    """Convert every CSV file in ``folder_path`` into one UTF-8 TXT file.

    For each ``*.csv`` file the columns ``corrected_ocr_text`` and
    ``page_text`` are read. Every row in which both values are present is
    written as: corrected text, newline, page text, blank line. Rows with a
    missing value in either column are skipped.

    The TXT files go into a ``TXT_Files`` folder that is created next to
    ``folder_path`` (in its parent directory). Each file is named
    ``<folder name>_<last two characters of the CSV stem>.txt``, e.g.
    ``1935_36/df_Heft_21.csv`` -> ``TXT_Files/1935_36_21.txt``.
    A "Converted ..." line is printed for every file written.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Folder that contains the CSV files. A trailing path separator is
        accepted.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If ``folder_path`` cannot be listed (e.g. it does not exist). In that
        case no output folder is created.
    ValueError
        Propagated from ``pd.read_csv`` when a CSV does not contain both
        required columns.
    """
    text_columns = ['corrected_ocr_text', 'page_text']

    # Normalise first: with a trailing separator, os.path.basename() would
    # return '' and os.path.dirname() the folder itself, which puts the
    # output in the wrong place under the wrong name.
    folder_path = os.path.abspath(folder_path)
    base_name = os.path.basename(folder_path)

    # List the input before creating anything, so an invalid path does not
    # leave a stray TXT_Files directory behind. Sorting makes the processing
    # order reproducible (os.listdir order is arbitrary); directories that
    # merely end in '.csv' are ignored.
    csv_files = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith('.csv')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    output_folder = os.path.join(os.path.dirname(folder_path), "TXT_Files")
    os.makedirs(output_folder, exist_ok=True)

    for csv_file in csv_files:
        csv_path = os.path.join(folder_path, csv_file)
        # NOTE: only the two characters in front of '.csv' identify the
        # output file, so CSVs whose names end in the same two characters
        # overwrite each other's TXT file.
        txt_file_name = f"{base_name}_{csv_file[-6:-4]}.txt"
        txt_path = os.path.join(output_folder, txt_file_name)

        df = pd.read_csv(csv_path, usecols=text_columns)
        # Keep only rows where both text columns are present.
        complete_rows = df.dropna(subset=text_columns)

        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the OCR text (e.g. umlauts on a non-UTF-8 locale).
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for corrected_value, page_value in zip(
                complete_rows['corrected_ocr_text'],
                complete_rows['page_text'],
            ):
                try:  # best effort: a bad row must not abort the whole file
                    # Build the full record before writing so a failure
                    # cannot leave half a record in the output.
                    record = str(corrected_value) + "\n" + str(page_value) + "\n\n"
                    txt_file.write(record)
                except TypeError as e:
                    print(f"Skipping a row in {csv_file} due to a TypeError: {e}")
                except Exception as e:
                    print(f"Skipping a row in {csv_file} due to an unexpected error: {e}")

        print(f"Converted {csv_file} to {txt_file_name}")

## 3. Use it

In [None]:
# Folder with the corrected CSVs to convert; the TXT files are written to a
# "TXT_Files" folder next to it.
folder_path = '../data/source_documents_CSV_corrected/1935_36'
convert_csv_to_txt(folder_path=folder_path)

Converted df_Heft_21.csv to 1935_36_21.txt
Converted df_Heft_22.csv to 1935_36_22.txt
Converted df_Heft_23.csv to 1935_36_23.txt
Converted df_Heft_24.csv to 1935_36_24.txt
Converted df_Heft_25.csv to 1935_36_25.txt
Converted df_Heft_10.csv to 1935_36_10.txt
Converted df_Heft_11.csv to 1935_36_11.txt
Converted df_Heft_12.csv to 1935_36_12.txt
Converted df_Heft_13.csv to 1935_36_13.txt
Converted df_Heft_14.csv to 1935_36_14.txt
Converted df_Heft_15.csv to 1935_36_15.txt
Converted df_Heft_16.csv to 1935_36_16.txt
Converted df_Heft_17.csv to 1935_36_17.txt
Converted df_Heft_18.csv to 1935_36_18.txt
Converted df_Heft_19.csv to 1935_36_19.txt
Converted df_Heft_08.csv to 1935_36_08.txt
Converted df_Heft_05.csv to 1935_36_05.txt
Converted df_Heft_04.csv to 1935_36_04.txt
Converted df_Heft_03.csv to 1935_36_03.txt
Converted df_Heft_02.csv to 1935_36_02.txt
