In [1]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, which one is active.
if torch.cuda.is_available():
    # Ask PyTorch which device is currently selected instead of assuming
    # index 0, so the "Current GPU" label stays truthful when another device
    # has been made current.
    current_gpu_index = torch.cuda.current_device()
    print("CUDA is available. PyTorch can use the GPU.")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.get_device_name(current_gpu_index)}")
else:
    print("CUDA is not available. PyTorch will use the CPU.")

CUDA is available. PyTorch can use the GPU.
Number of GPUs: 1
Current GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [3]:
import os
import re

# Define the path to the root folder
root_folder_path = r'phh-dataset'

# Variant code to keep; files that declare any other variant are deleted.
VARIANT_TO_KEEP = 'NT'

# Set to True to report what would be deleted without touching the disk.
DRY_RUN = False

# How many leading lines of each file are searched for the `variant` key.
MAX_HEADER_LINES = 20

# Matches `variant = 'NT'`, `variant="NT"`, etc.: optional surrounding
# whitespace and either quote style. Group 2 is the unquoted value.
VARIANT_PATTERN = re.compile(r"""^\s*variant\s*=\s*(['"])(.*?)\1""")


def read_phh_variant(file_path, max_lines=MAX_HEADER_LINES):
    """Return the `variant` value declared near the top of a .phh file.

    Args:
        file_path: Path of the file to inspect.
        max_lines: Number of leading lines to search before giving up.

    Returns:
        The unquoted variant string from the first matching line, or None if
        no `variant = '<value>'` assignment is found within `max_lines` lines.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' strips a leading byte-order mark, which would otherwise be
    # glued to the first line and make a valid file look unrecognised.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line_number, line in enumerate(file):
            if line_number >= max_lines:
                break
            match = VARIANT_PATTERN.match(line)
            if match:
                return match.group(2)
    return None


def prune_phh_files(root_folder, keep_variant='NT', dry_run=False):
    """Delete every .phh file under `root_folder` that declares another variant.

    Files whose variant equals `keep_variant` are kept. Files whose variant
    cannot be determined are left untouched and reported as skipped, because a
    deletion cannot be undone and an unparseable header is not proof that the
    file belongs to a different variant.

    Args:
        root_folder: Directory that is walked recursively.
        keep_variant: Variant value of the files to keep.
        dry_run: When True, nothing is removed; the files that would be
            deleted are still reported and counted under 'deleted'.

    Returns:
        A dict with the integer counters 'folders' (directories containing at
        least one .phh file), 'total', 'deleted', 'skipped' and 'errors'.
    """
    stats = {'folders': 0, 'total': 0, 'deleted': 0, 'skipped': 0, 'errors': 0}

    # os.walk yields nothing for a missing directory, which would otherwise
    # look exactly like an empty dataset.
    if not os.path.isdir(root_folder):
        print(f"  ! Root folder not found: {os.path.abspath(root_folder)}")
        return stats

    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(root_folder):
        # Sorting in place makes the traversal (and the log) deterministic.
        dirs.sort()

        # Skip if no .phh files in current directory
        phh_files = sorted(f for f in files if f.endswith('.phh'))
        if not phh_files:
            continue

        stats['folders'] += 1
        print(f"\nProcessing folder: {root}")
        print(f"Found {len(phh_files)} .phh files")

        # Process each .phh file in current directory
        for filename in phh_files:
            stats['total'] += 1
            file_path = os.path.join(root, filename)

            try:
                variant_value = read_phh_variant(file_path)

                if variant_value == keep_variant:
                    print(f"  ✓ Keeping: {filename} (variant = '{variant_value}')")
                elif variant_value is None:
                    stats['skipped'] += 1
                    print(f"  ? Skipped: {filename} (variant not found, file left in place)")
                else:
                    if not dry_run:
                        os.remove(file_path)
                    stats['deleted'] += 1
                    action = "Would delete" if dry_run else "Deleted"
                    print(f"  ✗ {action}: {filename} (variant = '{variant_value}')")

            except (OSError, UnicodeDecodeError) as e:
                # Best effort: report the failure and continue with the next file.
                stats['errors'] += 1
                print(f"  ! Error processing {filename}: {e}")

    return stats


print(f"Starting recursive processing of all .phh files in {root_folder_path}...")
print("-" * 60)

phh_stats = prune_phh_files(root_folder_path, keep_variant=VARIANT_TO_KEEP, dry_run=DRY_RUN)

# Module-level counters, kept under their original names for later cells.
folders_processed = phh_stats['folders']
total_count = phh_stats['total']
deleted_count = phh_stats['deleted']
skipped_count = phh_stats['skipped']
error_count = phh_stats['errors']

print("\n" + "=" * 60)
print("PROCESSING COMPLETE!")
if DRY_RUN:
    print("(dry run - no files were removed; counts show what would happen)")
print(f"Folders processed: {folders_processed}")
print(f"Total files processed: {total_count}")
print(f"Files deleted: {deleted_count}")
print(f"Files kept: {total_count - deleted_count}")
print(f"Files skipped (variant not found): {skipped_count}")
print(f"Files with errors: {error_count}")
if total_count > 0:
    print(f"Deletion rate: {deleted_count/total_count*100:.1f}%")

Starting recursive processing of all .phh files in phh-dataset...
------------------------------------------------------------

Processing folder: phh-dataset\data\pluribus\100
Found 71 .phh files
  ✓ Keeping: 0.phh (variant = 'NT')
  ✓ Keeping: 1.phh (variant = 'NT')
  ✓ Keeping: 10.phh (variant = 'NT')
  ✓ Keeping: 11.phh (variant = 'NT')
  ✓ Keeping: 12.phh (variant = 'NT')
  ✓ Keeping: 13.phh (variant = 'NT')
  ✓ Keeping: 14.phh (variant = 'NT')

Processing folder: phh-dataset\data\pluribus\100
Found 71 .phh files
  ✓ Keeping: 0.phh (variant = 'NT')
  ✓ Keeping: 1.phh (variant = 'NT')
  ✓ Keeping: 10.phh (variant = 'NT')
  ✓ Keeping: 11.phh (variant = 'NT')
  ✓ Keeping: 12.phh (variant = 'NT')
  ✓ Keeping: 13.phh (variant = 'NT')
  ✓ Keeping: 14.phh (variant = 'NT')
  ✓ Keeping: 15.phh (variant = 'NT')
  ✓ Keeping: 16.phh (variant = 'NT')
  ✓ Keeping: 17.phh (variant = 'NT')
  ✓ Keeping: 18.phh (variant = 'NT')
  ✓ Keeping: 19.phh (variant = 'NT')
  ✓ Keeping: 2.phh (variant = 'NT'