In [4]:
import pandas as pd
import os
import math

In [5]:
# --- Configuration ---
# Folder holding the merged project data. The input CSV is read from here and
# the Excel workbook is written back into the same folder.
file_directory = r"C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/"

# File names: the source CSV and the multi-sheet workbook produced from it.
csv_filename = "PA_PROCESSED_corrected.csv"
excel_filename = "PA_PROCESSED_corrected_multi_sheet.xlsx"

# Full paths, each formed by joining the folder with the matching file name.
csv_filepath, excel_filepath = (
    os.path.join(file_directory, name)
    for name in (csv_filename, excel_filename)
)

# Row cap per worksheet. An .xlsx sheet holds at most 1,048,576 rows
# (header row included), so this stays a little under that for safety.
max_rows_per_sheet = 1_000_000

# Echo the effective settings so the run is self-describing.
print(f"Input CSV file: {csv_filepath}")
print(f"Output Excel file (multi-sheet): {excel_filepath}")
print(f"Max rows per sheet: {max_rows_per_sheet:,}")

Input CSV file: C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/PA_PROCESSED_corrected.csv
Output Excel file (multi-sheet): C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/PA_PROCESSED_corrected_multi_sheet.xlsx
Max rows per sheet: 1,000,000


In [6]:
# --- Read CSV ---
# Load the entire CSV into memory. If loading fails, or the file has no data
# rows, the cell stops here so the export step below never runs against a
# missing or stale `df`.
#
# NOTE: aborting is done with `raise SystemExit(...)`, not `exit()`. In a
# Jupyter/IPython kernel `exit()` does not raise: it only asks the kernel to
# shut down once the cell has finished, so the rest of the cell would still
# execute (and the kernel would then die). `SystemExit` is not a subclass of
# `Exception`, so the generic handlers in this cell cannot swallow it, and in
# a plain script it terminates the interpreter just as `exit()` would.
try:
    print(f"\nReading '{csv_filename}'...")
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, which avoids mixed-type columns.
    # Reading in chunks could be considered if memory becomes a problem.
    df = pd.read_csv(csv_filepath, low_memory=False)
except FileNotFoundError:
    print(f"\nERROR: Input file not found at '{csv_filepath}'. Please check the path and filename.")
    raise SystemExit(1)
except Exception as e:
    print(f"\nERROR: An error occurred while reading the CSV file: {e}")
    raise SystemExit(1)

total_rows = len(df)
print(f"Successfully read {total_rows:,} rows and {len(df.columns)} columns.")

if total_rows == 0:
    # Nothing to export; stop cleanly (status 0) before creating a workbook.
    print("CSV file is empty. No Excel file will be created.")
    raise SystemExit(0)

# --- Write to Excel (Multiple Sheets) ---
# Split `df` into consecutive slices of at most `max_rows_per_sheet` rows and
# write each slice to its own worksheet of a single workbook.
# Failures here are reported but deliberately not re-raised.
try:
    print(f"\nWriting data to multiple sheets in '{excel_filename}'...")
    # Number of sheets needed; the last one may be only partially filled.
    num_sheets = math.ceil(total_rows / max_rows_per_sheet)
    print(f"Data will be split into {num_sheets} sheet(s).")

    # One ExcelWriter for the whole loop so every sheet lands in the same
    # file; the `with` block saves and closes the workbook on exit.
    with pd.ExcelWriter(excel_filepath, engine='openpyxl') as writer:
        for i in range(num_sheets):
            # Half-open row range [start_row, end_row) for this sheet,
            # clamped so the final sheet stops at the last data row.
            start_row = i * max_rows_per_sheet
            end_row = min((i + 1) * max_rows_per_sheet, total_rows)
            sheet_name = f'Data_Part_{i+1}'

            # Progress message uses 1-based, inclusive row numbers.
            print(f"  Writing sheet '{sheet_name}' (rows {start_row+1:,} to {end_row:,})...")

            # Positional slice, independent of the DataFrame's index labels.
            df_chunk = df.iloc[start_row:end_row]

            # index=False: write only the data columns (plus a header row).
            df_chunk.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"\nSuccessfully wrote {num_sheets} sheets to '{excel_filepath}'.")

except ImportError:
    # Raised when the requested 'openpyxl' engine is not installed.
    print("\nERROR: The 'openpyxl' library is required to write Excel files.")
    print("Please install it using: pip install openpyxl")
except Exception as e:
    print(f"\nERROR: An error occurred while writing the Excel file: {e}")

print("\nScript finished.")


Reading 'PA_PROCESSED_corrected.csv'...
Successfully read 5,538,978 rows and 15 columns.

Writing data to multiple sheets in 'PA_PROCESSED_corrected_multi_sheet.xlsx'...
Data will be split into 6 sheet(s).
  Writing sheet 'Data_Part_1' (rows 1 to 1,000,000)...
  Writing sheet 'Data_Part_2' (rows 1,000,001 to 2,000,000)...
  Writing sheet 'Data_Part_3' (rows 2,000,001 to 3,000,000)...
  Writing sheet 'Data_Part_4' (rows 3,000,001 to 4,000,000)...
  Writing sheet 'Data_Part_5' (rows 4,000,001 to 5,000,000)...
  Writing sheet 'Data_Part_6' (rows 5,000,001 to 5,538,978)...

Successfully wrote 6 sheets to 'C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/PA_PROCESSED_corrected_multi_sheet.xlsx'.

Script finished.
