In [1]:
import pandas as pd
import os
import math

In [2]:
# --- Configuration ---
# Directory that holds the input CSV; both output halves are written next to it.
# Set the PA_DATA_DIR environment variable to run this notebook on another
# machine. When it is unset, the original hard-coded folder is used, so existing
# behaviour is unchanged.
file_directory = os.environ.get(
    "PA_DATA_DIR",
    r"C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/",
)

# Name of the CSV file to split
input_csv_filename = "PA_PROCESSED_corrected.csv"

# Output names are derived from the input name ("<stem>_p1<ext>", "<stem>_p2<ext>")
# so the three filenames cannot drift apart if the input is renamed.
_stem, _ext = os.path.splitext(input_csv_filename)
output_csv_filename_p1 = f"{_stem}_p1{_ext}"
output_csv_filename_p2 = f"{_stem}_p2{_ext}"

# Full paths used by the read/split/write cell
input_filepath = os.path.join(file_directory, input_csv_filename)
output_filepath_p1 = os.path.join(file_directory, output_csv_filename_p1)
output_filepath_p2 = os.path.join(file_directory, output_csv_filename_p2)

# Echo the resolved paths so a wrong directory is spotted before the slow read
print(f"Input CSV file: {input_filepath}")
print(f"Output Part 1: {output_filepath_p1}")
print(f"Output Part 2: {output_filepath_p2}")

Input CSV file: C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/PA_PROCESSED_corrected.csv
Output Part 1: C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/PA_PROCESSED_corrected_p1.csv
Output Part 2: C:/Users/jason/OneDrive - The Pennsylvania State University/DAAN 881, Data Drive Decision Making/Project/Auto Data/Merged/PA_PROCESSED_corrected_p2.csv


In [3]:
# --- Read CSV ---
# Errors are reported with a readable message and then re-raised so the cell
# stops right there. exit() is deliberately not used: inside a notebook it is
# aimed at the kernel rather than at this cell, and the statements below must
# never run without a loaded DataFrame.
try:
    print(f"\nReading '{input_csv_filename}'...")
    # low_memory=False makes pandas infer each column's dtype from the whole
    # column instead of chunk by chunk (avoids mixed-type columns on big files).
    # NOTE(review): dtypes are still inferred, so values may be re-serialised
    # differently on write (e.g. leading zeros) - confirm this is acceptable.
    df = pd.read_csv(input_filepath, low_memory=False)
except FileNotFoundError:
    print(f"\nERROR: Input file not found at '{input_filepath}'. Please check the path and filename.")
    raise
except Exception as e:
    print(f"\nERROR: An error occurred while reading the CSV file: {e}")
    raise

total_rows = len(df)
print(f"Successfully read {total_rows:,} rows and {len(df.columns)} columns.")

# Checked outside the try block so it is not reported as a read error.
if total_rows == 0:
    print("Input CSV file is empty. Cannot split.")
    raise ValueError(f"'{input_filepath}' contains no data rows; nothing to split.")

# --- Split DataFrame ---
# Floor division: with an odd row count the extra row ends up in part 2.
midpoint = total_rows // 2
print(f"\nSplitting data at row index {midpoint:,}...")

# First half: rows 0 .. midpoint-1
df_part1 = df.iloc[:midpoint]

# Second half: rows midpoint .. end
df_part2 = df.iloc[midpoint:]

print(f"Part 1 contains {len(df_part1):,} rows.")
print(f"Part 2 contains {len(df_part2):,} rows.")

# The two halves must account for every input row exactly once.
assert len(df_part1) + len(df_part2) == total_rows, "split lost or duplicated rows"

# --- Write Output CSV Files ---
# index=False keeps the pandas row index out of the files, so each half has the
# same columns as the input.
try:
    print(f"\nWriting '{output_csv_filename_p1}'...")
    df_part1.to_csv(output_filepath_p1, index=False)
    print(f"Successfully wrote {output_csv_filename_p1}.")

    print(f"\nWriting '{output_csv_filename_p2}'...")
    df_part2.to_csv(output_filepath_p2, index=False)
    print(f"Successfully wrote {output_csv_filename_p2}.")

except Exception as e:
    print(f"\nERROR: An error occurred while writing one of the output CSV files: {e}")
    # Re-raise so a failed or partial write is not followed by "Script finished."
    raise

print("\nScript finished.")


Reading 'PA_PROCESSED_corrected.csv'...
Successfully read 5,538,978 rows and 15 columns.

Splitting data at row index 2,769,489...
Part 1 contains 2,769,489 rows.
Part 2 contains 2,769,489 rows.

Writing 'PA_PROCESSED_corrected_p1.csv'...
Successfully wrote PA_PROCESSED_corrected_p1.csv.

Writing 'PA_PROCESSED_corrected_p2.csv'...
Successfully wrote PA_PROCESSED_corrected_p2.csv.

Script finished.
