In [1]:
import pandas as pd
import numpy as np
import os
import math

In [2]:
# --- Configuration ---
# Folder that holds the source CSV (the split parts are written to the same folder).
file_directory = r"C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/"

# Source file to split, and the stem shared by every output part
# (parts are named "<stem>_p<N>.csv").
input_csv_filename = "TN_PROCESSED_corrected_v2.csv"
output_csv_basename = "TN_PROCESSED_corrected_v2"

# How many output files to produce.
num_parts = 4

# Full path of the source file.
input_filepath = os.path.join(file_directory, input_csv_filename)

# Echo the effective settings so the run is self-describing.
print(
    f"Input CSV file: {input_filepath}",
    f"Number of output parts: {num_parts}",
    sep="\n",
)

Input CSV file: C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/TN_PROCESSED_corrected_v2.csv
Number of output parts: 4


In [3]:
# --- Read CSV ---
# Load the whole input file into `df` and record its row count in `total_rows`.
#
# Failures are reported and then re-raised so that execution stops here.
# In a Jupyter kernel `exit()` requests a kernel shutdown instead of raising,
# so it cannot be relied on to halt the cell before the later steps run
# against a missing or empty `df`.
try:
    print(f"\nReading '{input_csv_filename}'...")
    # low_memory=False makes pandas parse each column in one pass rather than
    # in internal chunks, which avoids mixed-type inference on large files.
    df = pd.read_csv(input_filepath, low_memory=False)
except FileNotFoundError:
    print(f"\nERROR: Input file not found at '{input_filepath}'. Please check the path and filename.")
    raise
except Exception as e:
    print(f"\nERROR: An error occurred while reading the CSV file: {e}")
    raise

total_rows = len(df)
print(f"Successfully read {total_rows:,} rows and {len(df.columns)} columns.")

if total_rows == 0:
    # A header-only file has nothing to split; stop with a real exception so
    # the notebook shows the failure and no downstream cell runs on bad state.
    raise ValueError("Input CSV file is empty. Cannot split.")

if total_rows < num_parts:
    # Not fatal: the split still works, but some parts will contain no rows.
    print(f"Warning: Total rows ({total_rows}) is less than the number of parts ({num_parts}).")

# --- Split DataFrame ---
# Build `df_chunks`: a list of `num_parts` consecutive row slices of `df`.
print(f"\nSplitting data into {num_parts} parts...")

if num_parts < 1:
    # np.array_split rejected non-positive section counts with ValueError too.
    raise ValueError(f"num_parts must be at least 1, got {num_parts}.")

# Slice by row position instead of handing the DataFrame to np.array_split:
# that route goes through DataFrame.swapaxes, which pandas has deprecated and
# which emits a FutureWarning. The sizes below match array_split's rule: every
# part gets `base_rows` rows and the first `extra_rows` parts get one more.
base_rows, extra_rows = divmod(len(df), num_parts)

df_chunks = []
start = 0
for part_index in range(num_parts):
    stop = start + base_rows + (1 if part_index < extra_rows else 0)
    df_chunks.append(df.iloc[start:stop])
    start = stop

print("Data split complete.")

# --- Write Output CSV Files ---
# Save each chunk as "<basename>_p<N>.csv" (N starting at 1) inside
# `file_directory`. A failure on any part is reported and ends the loop; the
# closing message is printed either way.
print("\nWriting output CSV files...")
try:
    for part_number, chunk in enumerate(df_chunks, start=1):
        output_filename = f"{output_csv_basename}_p{part_number}.csv"
        output_filepath = os.path.join(file_directory, output_filename)

        print(f"  Writing '{output_filename}' ({len(chunk):,} rows)...")

        # Chunks can be empty when there are fewer rows than parts; no file
        # is produced for those.
        if chunk.empty:
            print(f"  Skipping empty chunk for '{output_filename}'.")
            continue

        # index=False keeps the DataFrame's row index out of the file.
        chunk.to_csv(output_filepath, index=False)
        print(f"  Successfully wrote '{output_filename}'.")
except Exception as write_error:
    print(f"\nERROR: An error occurred while writing one of the output CSV files: {write_error}")

print("\nScript finished.")


Reading 'TN_PROCESSED_corrected_v2.csv'...
Successfully read 9,502,214 rows and 15 columns.

Splitting data into 4 parts...


  return bound(*args, **kwds)


Data split complete.

Writing output CSV files...
  Writing 'TN_PROCESSED_corrected_v2_p1.csv' (2,375,554 rows)...
  Successfully wrote 'TN_PROCESSED_corrected_v2_p1.csv'.
  Writing 'TN_PROCESSED_corrected_v2_p2.csv' (2,375,554 rows)...
  Successfully wrote 'TN_PROCESSED_corrected_v2_p2.csv'.
  Writing 'TN_PROCESSED_corrected_v2_p3.csv' (2,375,553 rows)...
  Successfully wrote 'TN_PROCESSED_corrected_v2_p3.csv'.
  Writing 'TN_PROCESSED_corrected_v2_p4.csv' (2,375,553 rows)...
  Successfully wrote 'TN_PROCESSED_corrected_v2_p4.csv'.

Script finished.
