In [None]:

import os
import pickle

def merge_large_dat_files(input_dir, output_file):
    """Merge the pickled sample lists of every ``.dat`` file in a directory.

    Each ``.dat`` file may contain one or more pickled objects written back
    to back. Every object that is a ``list`` whose items are all ``tuple``
    instances is appended to the merged result; any other object is reported
    and skipped. The merged list is written to ``output_file`` as one pickle.

    Progress and problems are reported with ``print``. No exception is
    propagated to the caller: unexpected failures are printed together with
    a traceback.

    Args:
        input_dir: Directory that is scanned (non-recursively) for files
            whose name ends with ``.dat``.
        output_file: Path of the merged pickle file. Missing parent
            directories are created. If this path lies inside ``input_dir``
            it is excluded from the inputs, so re-running the merge does not
            fold an earlier result into itself.

    Returns:
        None.

    Warning:
        ``pickle.load`` can execute arbitrary code while loading. Only run
        this on ``.dat`` files from a trusted source.
    """
    try:
        output_path = os.path.abspath(output_file)

        # Sorted for a deterministic merge order. A previous merge result
        # stored inside input_dir must not be treated as an input.
        files = sorted(
            name for name in os.listdir(input_dir)
            if name.endswith('.dat')
            and os.path.abspath(os.path.join(input_dir, name)) != output_path
        )
        print(f"Found .dat files: {files}")

        if not files:
            raise ValueError("No .dat files found in the directory.")

        merged_data = []  # All samples, in file order

        for filename in files:
            file_path = os.path.join(input_dir, filename)
            try:
                with open(file_path, 'rb') as f:
                    # peek() returns b'' only at a clean end of file, so an
                    # EOFError raised by pickle.load below means the pickle
                    # itself is truncated rather than "no more objects".
                    while f.peek(1):
                        # SECURITY: unpickling untrusted data is unsafe.
                        data = pickle.load(f)
                        if isinstance(data, list) and all(isinstance(item, tuple) for item in data):
                            merged_data.extend(data)
                            print(f"✔ Merged: {filename} ({len(data)} samples)")
                        else:
                            print(f"❌ Skipped {filename} (unexpected format)")
            except Exception as e:
                # A damaged pickle can raise far more than UnpicklingError
                # (EOFError, ImportError, AttributeError, IndexError, ...).
                # One bad file must not abort the whole merge. Objects read
                # from this file before the failure stay merged.
                print(f"❌ Corrupted file: {filename}, skipping. Error: {e}")

        # Make sure the destination directory exists before writing.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save merged data
        with open(output_file, 'wb') as f_out:
            pickle.dump(merged_data, f_out)

        print(f"✅ Merged data saved to: {output_file} (Total: {len(merged_data)} samples)")

    except Exception as e:
        # Deliberate best-effort: report instead of raising so the notebook
        # keeps running, but show the full traceback for diagnosis.
        print(f"Unexpected error: {e}")
        import traceback
        traceback.print_exc()  # Print full error details



In [None]:
# Example usage: merge every .dat file found in the `test_dats` folder into
# a single pickle file under `merged_dats`.
# NOTE(review): the /content/drive/... paths presumably require Google Drive
# to be mounted in this runtime — confirm before running.
input_directory = '/content/drive/MyDrive/new_dats_val/test_dats'
output_file = "/content/drive/MyDrive/merged_dats/merged_test_data.dat"

# Progress and any errors are printed by the function itself.
merge_large_dat_files(input_directory, output_file)

In [None]:
# Merge every .dat file found in the `val_dats` folder into
# `merged_val_data.dat`. This rebinds the notebook-level `input_directory`
# and `output_file` names that the previous cell also assigns.
input_directory = '/content/drive/MyDrive/new_dats_val/val_dats'
output_file = "/content/drive/MyDrive/merged_dats/merged_val_data.dat"
merge_large_dat_files(input_directory, output_file)

Found .dat files: ['val_set_1val.dat', 'val_set_2val.dat']
✔ Merged: val_set_1val.dat (400 samples)
✔ Merged: val_set_2val.dat (400 samples)
✅ Merged data saved to: /content/drive/MyDrive/merged_dats/merged_val_data.dat (Total: 800 samples)


In [None]:
# Merge every .dat file found directly in the `new_dats` folder into
# `merged_train_data4.dat`. This rebinds the notebook-level `input_directory`
# and `output_file` names that earlier cells also assign.
# NOTE(review): the recorded output of this cell lists
# 'Copy of training_set_6train.dat' and no 'training_set_1train.dat' —
# confirm the folder holds the intended set of training files.
input_directory = '/content/drive/MyDrive/new_dats'
output_file = "/content/drive/MyDrive/merged_dats/merged_train_data4.dat"
merge_large_dat_files(input_directory, output_file)

Found .dat files: ['Copy of training_set_6train.dat', 'training_set_2train.dat', 'training_set_3train.dat', 'training_set_4train.dat', 'training_set_5train.dat']
✔ Merged: Copy of training_set_6train.dat (400 samples)
✔ Merged: training_set_2train.dat (400 samples)
✔ Merged: training_set_3train.dat (400 samples)
✔ Merged: training_set_4train.dat (400 samples)
✔ Merged: training_set_5train.dat (400 samples)
✅ Merged data saved to: /content/drive/MyDrive/merged_dats/merged_train_data4.dat (Total: 2000 samples)
