In [None]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
import sys

# ===============================
# USER SETTINGS
# ===============================
# Source directory: the FSC22 audio clips are copied from here.
NOISE_SOURCE_FOLDER = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/Audio Wise V1.0"

# FSC22 metadata CSV; the script reads its 'Class Name' and
# 'Dataset File Name' columns.
METADATA_PATH = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/Metadata V1.0 FSC22.csv"

# Destination directory: the already-existing segmented-audio output folder,
# which also holds the manifest CSV that gets extended.
EXISTING_OUTPUT_FOLDER = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/3_SegmentedAudios"

# Maximum number of clips to copy in one run.
TARGET_DEFICIT = 385

# Only metadata rows whose 'Class Name' is in this list are eligible.
ALLOWED_CLASSES = [
    "Rain",
    "Thunderstorm",
    "WaterDrops",
    "Wind",
    "Silence",
    "TreeFalling",
    "Whistling",
    "Insect",
    "Frog",
    "BirdChirping",
    "WingFlapping",
    "Squirrel",
]

# ===============================
# STEP 1: LOAD EXISTING MANIFEST
# ===============================
# Reads the manifest that will later be extended. Its column list is kept so
# that the new rows can be built with exactly the same structure.
manifest_path = os.path.join(EXISTING_OUTPUT_FOLDER, "master_manifest_NoEagle.csv")

print("Loading existing manifest structure...")

if not os.path.exists(manifest_path):
    # Report the path that was actually checked; the file is named
    # master_manifest_NoEagle.csv, not master_manifest.csv.
    print(f"‚ùå No existing manifest found. Please ensure this file exists: {manifest_path}")
    sys.exit(1)

try:
    existing_manifest = pd.read_csv(manifest_path)
    manifest_columns = existing_manifest.columns.tolist()
    print(f"‚úÖ Found existing manifest with {len(manifest_columns)} columns")
except Exception as e:
    # Script-level boundary: any read/parse failure ends the run.
    print(f"‚ùå Error reading existing manifest: {e}")
    sys.exit(1)

# ===============================
# STEP 2: LOAD & FILTER METADATA
# ===============================
print("\nLoading and filtering metadata...")
try:
    meta_df = pd.read_csv(METADATA_PATH)

    # Keep only rows of an allowed class, then shuffle them with a fixed
    # seed (reproducible order) and renumber the index from zero.
    filtered_df = (
        meta_df.loc[meta_df['Class Name'].isin(ALLOWED_CLASSES)]
        .copy()
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    print(f"‚úÖ Found {len(filtered_df)} valid noise files.")

except Exception as exc:
    # Any failure while reading or filtering ends the run.
    print(f"‚ùå Error reading metadata: {exc}")
    sys.exit(1)

# ===============================
# STEP 3: PREPARE FOR COPYING
# ===============================
# Choose the first group_id for the new rows: one past the largest numeric
# group_id already in the manifest, or 10000 when no usable id exists.
next_group_id = 10000

if 'group_id' in manifest_columns:
    # Coerce to numbers and drop blanks/non-numeric cells so that a single
    # bad cell does not force the 10000 fallback (which could collide with
    # ids already present in the manifest).
    existing_ids = pd.to_numeric(existing_manifest['group_id'], errors='coerce').dropna()
    if not existing_ids.empty:
        try:
            next_group_id = int(existing_ids.max()) + 1
        except (OverflowError, ValueError):
            # The maximum is not representable as an int (e.g. infinity).
            next_group_id = 10000

print(f"Starting group_id from: {next_group_id}")

# ===============================
# STEP 4: COPY FILES
# ===============================
# Copies up to TARGET_DEFICIT clips listed in filtered_df into
# <output>/GeneralForest/NoEagleSound and builds one manifest row per copy.
created_count = 0
skipped_missing = 0
segments_data = []

print(f"\nüìâ Target to fill: {TARGET_DEFICIT} files")
print(f"üìÇ Copying from: {NOISE_SOURCE_FOLDER}")

# Every copied clip shares the same location/label, so the destination
# folder is resolved and created once rather than on every iteration.
location = "GeneralForest"
label = "NoEagleSound"
save_folder = os.path.join(EXISTING_OUTPUT_FOLDER, location, label)
os.makedirs(save_folder, exist_ok=True)

# The bar counts successful copies (not metadata rows visited), so skipped
# rows can no longer push it past its total.
with tqdm(total=min(len(filtered_df), TARGET_DEFICIT)) as progress:
    for raw_filename, raw_class in zip(filtered_df['Dataset File Name'],
                                       filtered_df['Class Name']):
        if created_count >= TARGET_DEFICIT:
            break

        original_filename = str(raw_filename).strip()
        noise_class = str(raw_class).strip()

        source_path = os.path.join(NOISE_SOURCE_FOLDER, original_filename)

        # Metadata rows without a matching audio file are skipped, but
        # counted so the shortfall can be reported afterwards.
        if not os.path.isfile(source_path):
            skipped_missing += 1
            continue

        new_filename = f"GeneralForest_{noise_class}_{original_filename}"
        dest_path = os.path.join(save_folder, new_filename)

        try:
            # copy2 is the only step that can fail here.
            shutil.copy2(source_path, dest_path)
        except OSError as e:
            # tqdm.write prints without breaking the progress bar.
            tqdm.write(f"‚ö†Ô∏è Error copying {original_filename}: {e}")
            continue

        # Values known for a copied clip; any other manifest column is
        # left blank.
        column_mapping = {
            'label': label,
            'label_base': label,
            'label_full': label,
            'group_id': next_group_id + created_count,
            'location_id': location,
            'segment_filename': new_filename,
            'output_folder': os.path.join(location, label),
            'source_audio': original_filename,
            'start': '',
            'end': '',
            'quality': '',
            'selection_numbers': '',
        }

        # Build the row in the existing manifest's column structure.
        segments_data.append(
            {col: column_mapping.get(col, '') for col in manifest_columns}
        )
        created_count += 1
        progress.update(1)

print(f"\n‚ú® Finished! Copied {created_count} noise files.")
if skipped_missing:
    print(f"   Skipped {skipped_missing} metadata rows whose audio file was not found in the source folder.")
if created_count < TARGET_DEFICIT:
    print(f"   Still short of the target by {TARGET_DEFICIT - created_count} files.")

# ===============================
# STEP 5: UPDATE MASTER MANIFEST
# ===============================
# Appends the rows collected in segments_data to the existing manifest and
# writes the result back to manifest_path.
if segments_data:
    print(f"\nUpdating manifest: {manifest_path}")
    # Write to a temporary file first and swap it in afterwards, so that a
    # failure in the middle of writing cannot truncate the existing manifest.
    temp_manifest_path = manifest_path + ".tmp"
    try:
        new_df = pd.DataFrame(segments_data)

        # Ensure column order matches existing manifest
        new_df = new_df[manifest_columns]

        # Append to existing manifest
        combined_df = pd.concat([existing_manifest, new_df], ignore_index=True)

        # Save updated manifest
        combined_df.to_csv(temp_manifest_path, index=False)
        os.replace(temp_manifest_path, manifest_path)
        print(f"‚úÖ Master manifest updated successfully. Added {created_count} new rows.")

    except Exception as e:
        print(f"‚ùå Error updating manifest: {e}")
        # Best-effort removal of a partially written temporary file; a
        # failure here must not prevent the backup save below.
        try:
            if os.path.exists(temp_manifest_path):
                os.remove(temp_manifest_path)
        except OSError:
            pass
        # Backup save: keep the new rows so the copy work is not lost.
        pd.DataFrame(segments_data).to_csv(
            os.path.join(EXISTING_OUTPUT_FOLDER, "added_noise_manifest_backup.csv"),
            index=False
        )
        print("   Saved new data to 'added_noise_manifest_backup.csv' instead.")
else:
    print("\n‚ö†Ô∏è No files were copied. Manifest not updated.")

Loading existing manifest structure...
❌ No existing manifest found. Please ensure master_manifest.csv exists.


SystemExit: 1