In [5]:
import pandas as pd
import os

# --- Configuration ---

# 1. INPUT FILE: path to the Excel workbook that holds the essays.
# It is loaded with pandas.read_excel(), so it must be an Excel file (e.g. .xlsx).
INPUT_FILENAME = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\raw_data_essay_set1.xlsx" 

# 2. TARGET DIRECTORY: folder that receives one .txt file per essay.
# It is created automatically if it does not exist yet.
OUTPUT_DIR = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\Repo"

# 3. COLUMN NAMES: must match the header row of the workbook exactly.
DOC_NUM_COL = 'document_number'  # value used as the output file name (<value>.txt)
ESSAY_COL = 'essay'              # text written into that file

# --- Main Execution Function ---

def _safe_file_stem(value):
    """Return a filesystem-safe file stem for a document number ('' if unusable)."""
    # A column of integers that also contains blank cells is loaded as float,
    # which would otherwise turn document 12 into "12.0.txt".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    # Path separators and characters rejected by Windows file names are
    # replaced so an ID can never escape the output directory.
    unsafe_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})
    return str(value).strip().translate(unsafe_chars)


def export_essays_to_txt_from_excel(input_file: str, output_directory: str) -> None:
    """Read an Excel dataset and save each essay into its own UTF-8 text file.

    Each row produces ``<document number>.txt`` inside ``output_directory``,
    containing the text of that row's essay. The column names are taken from
    the module-level ``DOC_NUM_COL`` and ``ESSAY_COL`` settings.

    Progress and problems are reported with ``print``; the function never
    raises for a bad input file, a missing column, or a row that cannot be
    written. Rows with a missing document number or essay are skipped rather
    than being exported as the literal text "nan".

    Args:
        input_file: Path to the Excel workbook to read.
        output_directory: Directory that receives the .txt files. It is
            created if it does not exist.

    Returns:
        None.
    """
    # --- File loading ---
    try:
        df = pd.read_excel(input_file)
    except FileNotFoundError:
        print(f"❌ Error: The input file '{input_file}' was not found.")
        print("Please check the INPUT_FILENAME configuration.")
        return
    except Exception as e:
        # read_excel can fail in many ways (bad format, missing engine, ...);
        # report the reason instead of crashing the notebook cell.
        print(f"❌ Error reading file: {e}")
        return
    print(f"Successfully loaded data from: {input_file}")

    # 1. Validate the configured columns once, before touching the disk, so a
    #    wrong header is reported even when the sheet has no rows.
    missing_columns = [
        column for column in (DOC_NUM_COL, ESSAY_COL) if column not in df.columns
    ]
    if missing_columns:
        print(
            f"❌ Error: Column(s) {missing_columns} not found. "
            "Check DOC_NUM_COL and ESSAY_COL configuration."
        )
        return

    # 2. Ensure the output directory exists
    try:
        os.makedirs(output_directory, exist_ok=True)
    except OSError as e:
        print(f"❌ Error creating directory: {e}")
        return
    print(f"Output directory ensured: {output_directory}")

    # 3. Walk the two relevant columns and write one file per usable row
    files_created_count = 0
    skipped_count = 0

    for index, doc_value, essay_value in zip(
        df.index, df[DOC_NUM_COL], df[ESSAY_COL]
    ):
        # Blank cells arrive as NaN/None; str() would silently export "nan".
        file_stem = "" if pd.isna(doc_value) else _safe_file_stem(doc_value)
        if not file_stem or pd.isna(essay_value):
            print(
                f"⚠️ Skipping row at index {index}: "
                "missing document number or essay."
            )
            skipped_count += 1
            continue

        file_path = os.path.join(output_directory, f"{file_stem}.txt")
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(str(essay_value))
        except (OSError, UnicodeError) as e:
            # Best effort: one unwritable essay must not abort the whole export.
            print(f"❌ Failed to process essay at index {index}: {e}")
            continue

        files_created_count += 1

    print("\n✅ Execution Complete.")
    print(f"Total files successfully created: {files_created_count}")
    if skipped_count:
        print(f"Rows skipped (missing document number or essay): {skipped_count}")
    print(f"Files saved in: {output_directory}")


# Run the export using the configured input workbook and output directory.
# The function reports progress and any errors via print() and returns nothing.
export_essays_to_txt_from_excel(INPUT_FILENAME, OUTPUT_DIR)

Successfully loaded data from: C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\raw_data_essay_set1.xlsx
Output directory ensured: C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\Repo

✅ Execution Complete.
Total files successfully created: 141
Files saved in: C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\Repo
