In [None]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
import sys

# ===============================
# USER SETTINGS
# ===============================
# Where the raw noise clips live, the CSV describing them, and the folder that
# already holds the segmented audio together with its manifest.
NOISE_SOURCE_FOLDER = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/Audio Wise V1.0"
METADATA_PATH = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/Metadata V1.0 FSC22.csv"
EXISTING_OUTPUT_FOLDER = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/3_SegmentedAudios"

# Upper bound on how many noise files this cell is allowed to add.
TARGET_DEFICIT = 2969

# Only metadata rows whose 'Class Name' is listed here are used as noise.
ALLOWED_CLASSES = [
    # weather / ambience
    'Rain',
    'Thunderstorm',
    'WaterDrops',
    'Wind',
    'Silence',
    # vegetation and animals
    'TreeFalling',
    'Whistling',
    'Insect',
    'Frog',
    'BirdChirping',
    'WingFlapping',
    'Squirrel',
    # human activity and others
    'Footsteps',
    'Clapping',
    'WolfHowl',
    'Speaking',
]

# ===============================
# STEP 1: LOAD EXISTING MANIFEST
# ===============================
# The rows added later must follow the schema of the manifest that already
# sits in the output folder, so it is loaded first and its column order is
# remembered in `manifest_columns`.
manifest_path = os.path.join(EXISTING_OUTPUT_FOLDER, "master_manifest_NoEagle.csv")

print("Loading existing manifest structure...")

if not os.path.exists(manifest_path):
    # Report the file that is actually looked up; the message previously
    # named "master_manifest.csv", which this script never reads.
    print(f"‚ùå No existing manifest found. Please ensure {os.path.basename(manifest_path)} exists.")
    sys.exit(1)

try:
    existing_manifest = pd.read_csv(manifest_path)
except Exception as e:
    # Top-level boundary of the script: any read/parse failure ends the run
    # with a readable message instead of a traceback.
    print(f"‚ùå Error reading existing manifest: {e}")
    sys.exit(1)

manifest_columns = existing_manifest.columns.tolist()
print(f"‚úÖ Found existing manifest with {len(manifest_columns)} columns")

# ===============================
# LOAD & FILTER METADATA
# ===============================
print("\nLoading and filtering metadata...")
try:
    meta_df = pd.read_csv(METADATA_PATH)

    # Keep only the allowed classes, then shuffle with a fixed seed so the
    # selection is a random but repeatable mix; the index is rebuilt 0..n-1.
    filtered_df = (
        meta_df.loc[meta_df['Class Name'].isin(ALLOWED_CLASSES)]
        .copy()
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
except Exception as e:
    print(f"‚ùå Error reading metadata: {e}")
    sys.exit(1)
else:
    print(f"‚úÖ Found {len(filtered_df)} valid noise files.")

# ===============================
# PREPARE FOR COPYING
# ===============================
# New rows continue numbering after the largest group_id already present in
# the manifest so they cannot collide with existing groups. 10000 is the
# fallback when the manifest offers no usable group_id.
next_group_id = 10000
if 'group_id' in manifest_columns and not existing_manifest.empty:
    try:
        next_group_id = int(existing_manifest['group_id'].max()) + 1
    except (TypeError, ValueError, OverflowError):
        # Blank (NaN), non-numeric, mixed-type or infinite group_id values
        # cannot be converted; keep the fallback. Only conversion errors are
        # caught so Ctrl-C / SystemExit are no longer swallowed.
        next_group_id = 10000

print(f"Starting group_id from: {next_group_id}")

# ===============================
# COPY FILES
# ===============================
# Copies up to TARGET_DEFICIT noise clips into
# <output>/GeneralForest/NoEagleSound and records one manifest row per copied
# file in `segments_data` (consumed by the manifest update that follows).
created_count = 0   # files copied successfully
missing_count = 0   # metadata rows whose audio file does not exist on disk
segments_data = []

# Every file goes to the same destination, so it is resolved and created once
# rather than on each iteration.
location = "GeneralForest"
label = "NoEagleSound"
save_folder = os.path.join(EXISTING_OUTPUT_FOLDER, location, label)
os.makedirs(save_folder, exist_ok=True)

print(f"\nüìâ Target to fill: {TARGET_DEFICIT} files")
print(f"üìÇ Copying from: {NOISE_SOURCE_FOLDER}")

# Only two columns are needed per row; zipping them avoids building a Series
# for every row as iterrows() does.
source_rows = zip(filtered_df['Dataset File Name'], filtered_df['Class Name'])

# The bar advances per copied file, so its total stays meaningful even when
# some metadata rows are skipped.
with tqdm(total=min(len(filtered_df), TARGET_DEFICIT)) as pbar:
    for raw_filename, raw_class in source_rows:
        if created_count >= TARGET_DEFICIT:
            break

        original_filename = str(raw_filename).strip()
        noise_class = str(raw_class).strip()
        source_path = os.path.join(NOISE_SOURCE_FOLDER, original_filename)

        if not os.path.exists(source_path):
            missing_count += 1
            continue

        # Prefix with location and class so copied names stay recognisable
        # next to the segments already in the folder.
        new_filename = f"GeneralForest_{noise_class}_{original_filename}"
        dest_path = os.path.join(save_folder, new_filename)

        try:
            shutil.copy2(source_path, dest_path)
        except OSError as e:
            # tqdm.write keeps the message from breaking the progress bar.
            tqdm.write(f"‚ö†Ô∏è Error copying {original_filename}: {e}")
            continue

        # Values known for a copied noise file; any other manifest column is
        # left blank so the row matches the existing manifest structure.
        known_values = {
            'label': label,
            'label_base': label,
            'label_full': label,
            'group_id': next_group_id + created_count,
            'location_id': location,
            'segment_filename': new_filename,
            'output_folder': os.path.join(location, label),
            'source_audio': original_filename,
            'start': '',
            'end': '',
            'quality': '',
            'selection_numbers': '',
        }
        segments_data.append({col: known_values.get(col, '') for col in manifest_columns})
        created_count += 1
        pbar.update(1)

print(f"\n‚ú® Finished! Copied {created_count} noise files.")
if missing_count:
    print(f"‚ö†Ô∏è Skipped {missing_count} metadata entries whose audio file was not found.")
if created_count < TARGET_DEFICIT:
    print(f"‚ö†Ô∏è Target not reached: still short by {TARGET_DEFICIT - created_count} files.")

# ===============================
# UPDATE MASTER MANIFEST
# ===============================
# Appends the rows collected in `segments_data` to the manifest on disk.
if segments_data:
    # Name the file that is really being written (it is not
    # "master_manifest.csv").
    print(f"\nUpdating {os.path.basename(manifest_path)}...")

    # The combined manifest is written to a temporary sibling file and then
    # swapped in with os.replace, so a failed write cannot leave the master
    # manifest truncated.
    temp_path = manifest_path + ".tmp"
    try:
        # Align column order with the existing manifest before appending.
        new_df = pd.DataFrame(segments_data)[manifest_columns]
        combined_df = pd.concat([existing_manifest, new_df], ignore_index=True)

        combined_df.to_csv(temp_path, index=False)
        os.replace(temp_path, manifest_path)
        print(f"‚úÖ Master manifest updated successfully. Added {created_count} new rows.")

    except Exception as e:
        print(f"‚ùå Error updating manifest: {e}")
        # Backup save: keep the new rows so the copied files are not orphaned.
        pd.DataFrame(segments_data).to_csv(
            os.path.join(EXISTING_OUTPUT_FOLDER, "added_noise_manifest_backup.csv"),
            index=False
        )
        print("   Saved new data to 'added_noise_manifest_backup.csv' instead.")
        # Remove a half-written temp file; the real manifest was not touched.
        if os.path.exists(temp_path):
            os.remove(temp_path)
else:
    print("\n‚ö†Ô∏è No files were copied. Manifest not updated.")

Loading existing manifest structure...
‚úÖ Found existing manifest with 12 columns

Loading and filtering metadata...
‚úÖ Found 1125 valid noise files.
Starting group_id from: 1813

üìâ Target to fill: 2969 files
üìÇ Copying from: D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/Audio Wise V1.0


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1125/1125 [00:01<00:00, 626.88it/s]


‚ú® Finished! Copied 1125 noise files.

Updating master_manifest.csv...
‚úÖ Master manifest updated successfully. Added 1125 new rows.





In [None]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
import sys
import numpy as np
import librosa
import soundfile as sf

# ===============================
# USER SETTINGS
# ===============================
# Source audio folder, its metadata CSV, and the folder that already holds the
# segmented audio together with its manifest.
NOISE_SOURCE_FOLDER = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/archive2/audio/audio"
METADATA_PATH = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/archive2/esc50.csv"
EXISTING_OUTPUT_FOLDER = r"D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/3_SegmentedAudios"

# Number of files this cell must create (direct copies first, then augmented).
TARGET_DEFICIT = 2229

# Only metadata rows whose 'category' is listed here are used.
ALLOWED_CLASSES = [
    'chirping_birds',
    'thunderstorm',
    'crow',
    'door_wood_knock',
    'pouring_water',
    'clapping',
    'church_bells',
    'water_drops',
    'wind',
    'sheep',
    'frog',
    'fireworks',
    'cow',
    'crackling_fire',
    'hen',
    'insects',
    'hand_saw',
    'pig',
    'rooster',
    'sea_waves',
    'dog',
    'breathing',
    'siren',
    'snoring',
    'airplane',
    'cat',
    'door_wood_creaking',
    'crickets',
    'coughing',
    'chainsaw',
    'drinking_sipping',
    'laughing',
    'glass_breaking',
    'engine',
    'footsteps',
    'crying_baby',
    'can_opening',
]

# ===============================
# LOAD MANIFESTS
# ===============================
# Loads the manifest that the new rows will be appended to and remembers its
# column order for later alignment.
manifest_path = os.path.join(EXISTING_OUTPUT_FOLDER, "master_manifest_NoEagle.csv")
if not os.path.exists(manifest_path):
    print("‚ùå No existing manifest found.")
    sys.exit(1)

try:
    existing_manifest = pd.read_csv(manifest_path)
except Exception as e:
    # Top-level boundary of the script: end with a readable message rather
    # than a traceback when the file cannot be read or parsed.
    print(f"‚ùå Error reading existing manifest: {e}")
    sys.exit(1)
manifest_columns = existing_manifest.columns.tolist()

# Determine next group ID: continue after the largest existing id, or start at
# 10000 when the manifest has no usable group_id.
next_group_id = 10000
if 'group_id' in existing_manifest.columns and not existing_manifest.empty:
    try:
        next_group_id = int(existing_manifest['group_id'].max()) + 1
    except (TypeError, ValueError, OverflowError):
        # A blank (NaN), non-numeric, mixed-type or infinite column makes the
        # conversion fail; fall back instead of aborting the whole cell.
        print(f"‚ö†Ô∏è Could not read a numeric group_id; starting from {next_group_id}.")

# ===============================
# PREPARE SOURCE LIST
# ===============================
print("Loading metadata...")
meta_df = pd.read_csv(METADATA_PATH)

# Keep the allowed categories, then shuffle once with a fixed seed and rebuild
# the index as 0..n-1 so rows can be addressed by position.
filtered_df = (
    meta_df.loc[meta_df['category'].isin(ALLOWED_CLASSES)]
    .copy()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"‚úÖ Found {len(filtered_df)} unique source files.")
print(f"üìâ Target needed: {TARGET_DEFICIT}")

# ===============================
# FILL LOOP (COPY -> AUGMENT)
# ===============================
# Creates TARGET_DEFICIT files in <output>/GeneralForest/NoEagleSound.
# The first pass over the shuffled source list copies files unchanged; every
# later pass writes a pitch-shifted variant. One manifest row per created file
# is collected in `segments_data` (consumed by the manifest save that follows).
created_count = 0     # files created in total
copied_count = 0      # direct copies (first pass)
augmented_count = 0   # pitch-shifted variants (later passes)
missing_count = 0     # source files not found on disk (counted once)
failed_count = 0      # copy/augment attempts that raised
segments_data = []
source_index = 0
total_sources = len(filtered_df)

# Destination is the same for every file, so it is set up once.
location = "GeneralForest"
label = "NoEagleSound"
save_folder = os.path.join(EXISTING_OUTPUT_FOLDER, location, label)
os.makedirs(save_folder, exist_ok=True)

# Seeded generator so the random pitch shifts are repeatable between runs,
# in line with the seeded shuffle of the source list.
rng = np.random.default_rng(42)

if total_sources == 0:
    # Without this guard the modulo below would divide by zero.
    print("‚ö†Ô∏è No source files matched ALLOWED_CLASSES; nothing to create.")

stalled = False
created_before_pass = 0

with tqdm(total=TARGET_DEFICIT) as pbar:
    while total_sources and created_count < TARGET_DEFICIT:
        position = source_index % total_sources

        # At the start of every pass after the first, stop if the previous
        # full pass created nothing (all sources missing or failing);
        # otherwise the loop would never terminate.
        if position == 0 and source_index:
            if created_count == created_before_pass:
                stalled = True
                break
            created_before_pass = created_count

        # Cycle through the list repeatedly if needed
        row = filtered_df.iloc[position]
        source_index += 1

        # Copy (Round 1) or Augment (Round 2+)?
        is_augmentation = source_index > total_sources

        orig_filename = str(row['filename']).strip()
        noise_class = str(row['category']).strip()
        source_path = os.path.join(NOISE_SOURCE_FOLDER, orig_filename)

        if not os.path.exists(source_path):
            if not is_augmentation:
                missing_count += 1
            continue

        try:
            if not is_augmentation:
                # --- METHOD A: DIRECT COPY (Fast) ---
                new_filename = f"GeneralForest_{noise_class}_{orig_filename}"
                dest_path = os.path.join(save_folder, new_filename)
                shutil.copy2(source_path, dest_path)
            else:
                # --- METHOD B: AUGMENTATION (Fill the rest) ---
                # Load at the file's native sample rate.
                y, sr = librosa.load(source_path, sr=None)

                # Pitch shift by a random amount within +/-1.5 semitones.
                steps = rng.uniform(-1.5, 1.5)
                y_aug = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)

                # source_index is unique per attempt, so augmented names
                # never collide with each other or with the direct copies.
                new_filename = f"GeneralForest_{noise_class}_aug{source_index}_{orig_filename}"
                dest_path = os.path.join(save_folder, new_filename)
                sf.write(dest_path, y_aug, sr)
        except Exception as e:
            # Best effort: one bad file must not abort the run, but the
            # failure is reported instead of being silently dropped.
            failed_count += 1
            tqdm.write(f"‚ö†Ô∏è Error processing {orig_filename}: {e}")
            continue

        # Add to Manifest Data: start blank for every manifest column, then
        # fill in the values known for a noise file.
        new_row = {col: '' for col in manifest_columns}
        new_row.update({
            'label': label,
            'label_base': label,
            'label_full': label,
            'group_id': next_group_id + created_count,
            'location_id': location,
            'segment_filename': new_filename,
            'output_folder': os.path.join(location, label),
            'source_audio': orig_filename,
            # NOTE(review): fixed 0-5 s span, presumably because the source
            # clips are all 5 s long -- confirm against the dataset.
            'start': 0.0,
            'end': 5.0,
            'quality': 'N/A',
            'selection_numbers': 'None'
        })
        segments_data.append(new_row)

        created_count += 1
        if is_augmentation:
            augmented_count += 1
        else:
            copied_count += 1
        pbar.update(1)

# Report the counts that were actually produced rather than deriving them
# from the size of the source list.
print(f"\n‚ú® DONE! Created {created_count} files ({copied_count} originals + {augmented_count} augmented).")
if missing_count:
    print(f"‚ö†Ô∏è {missing_count} source files listed in the metadata were not found.")
if failed_count:
    print(f"‚ö†Ô∏è {failed_count} files could not be copied or augmented.")
if stalled:
    print(f"‚ö†Ô∏è Stopped early: a full pass created no files. Short by {TARGET_DEFICIT - created_count}.")

# ===============================
# SAVE MANIFEST
# ===============================
# Nothing is written when no rows were collected.
if segments_data:
    print("Updating manifest...")

    # Build the new rows in the manifest's own column order, append them to
    # the rows loaded earlier, and overwrite the manifest file.
    new_df = pd.DataFrame(segments_data).loc[:, manifest_columns]
    combined = pd.concat(
        [existing_manifest, new_df],
        ignore_index=True,
    )
    combined.to_csv(manifest_path, index=False)

    print("‚úÖ Manifest updated.")

Loading metadata...
‚úÖ Found 1440 unique source files.
üìâ Target needed: 2229


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2229/2229 [00:49<00:00, 44.95it/s] 


‚ú® DONE! Created 2229 files (1440 originals + 789 augmented).
Updating manifest...
‚úÖ Manifest updated.



