In [1]:
import pandas as pd
import os
import time
import glob
from tqdm import tqdm

# Build one master CSV from the first `line_reading_limit` data rows of every
# CSV matched by `input_files`. The header row is written once, taken from the
# first file that is processed successfully.

# Input and output paths
input_files = '/home/charlie/Desktop/Data_Subset/Subset_4/*.csv'
output_dir = '/home/charlie/Desktop/Data_Subset/Masterfile/'
output_file = os.path.join(output_dir, 'Masterfile.csv')
line_reading_limit = 800  # Maximum number of data rows taken from each input file

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

start_time = time.time()
# glob returns matches in arbitrary order; sorting makes the row order (and
# which file supplies the header) reproducible between runs.
files = sorted(glob.glob(input_files))

try:
    # newline='' stops Python from translating the line terminators pandas
    # already writes, which would otherwise yield '\r\r\n' on platforms where
    # os.linesep is '\r\n'.
    with open(output_file, 'w', newline='') as f_out:
        header_written = False
        failed_count = 0

        with tqdm(total=len(files), desc='Processing Files') as pbar:
            for file in files:
                try:
                    # Read at most `line_reading_limit` rows from the file
                    df = pd.read_csv(file, nrows=line_reading_limit)

                    # Emit the header only until one file has been written
                    df.to_csv(f_out, index=False, header=not header_written)
                    header_written = True

                except Exception as e:
                    # Best effort: report the bad file and keep going
                    failed_count += 1
                    print(f"Error Processing: {file}: {e}")

                finally:
                    # Advance for failed files too, so the bar reaches 100%
                    pbar.update(1)

    print(f"Master CSV created at: {output_file}")
    if failed_count:
        print(f"Skipped {failed_count} of {len(files)} files due to errors")

except Exception as e:
    # Failures outside per-file handling (e.g. the output file cannot be opened)
    print(f"CRITICAL ERROR: {e}")

end_time = time.time()
print(f"PROCESSING COMPLETE in {end_time - start_time:.2f} seconds")


Processing Files: 100%|██████████| 15000/15000 [02:05<00:00, 119.69it/s]

Master CSV created at: /home/charlie/Desktop/Data_Subset/Masterfile/Masterfile.csv
PROCESSING COMPLETE in 125.36 seconds



