### **Imports**

In [1]:
import os
import pandas as pd
import librosa
import soundfile as sf
import numpy as np

ModuleNotFoundError: No module named 'pandas'

### **Folder Paths**

In [3]:
# Folder that is scanned for the source recordings (only "*.wav" entries are used).
AUDIO_FOLDER = "D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/PHEA" 
# Folder that is scanned for the annotation tables ("*.txt", read as tab-separated text).
ANNOTATION_FOLDER = "D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/SelectionTables"
# Destination root: created if missing; receives one sub-folder per label plus the manifest CSVs.
# NOTE(review): these are absolute, machine-specific paths — adjust before running elsewhere.
OUTPUT_FOLDER = "D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet/SegmentedAudios"

### **Segmentation Settings**

In [4]:
# Length of each segmentation window, in seconds. The last window of a
# recording is clipped to the end of the audio and may therefore be shorter.
SEGMENT_LENGTH = 5 

# Name of the annotation-table column that holds each annotation's type/label.
COL_TYPE = "Type"

### **Setup and Main Loop**

In [None]:
# Make sure the destination root exists before anything is written into it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Collect the candidate inputs: recordings end in ".wav", annotation tables in ".txt".
audio_files = list(
    filter(lambda entry: entry.endswith(".wav"), os.listdir(AUDIO_FOLDER))
)
annotation_files = list(
    filter(lambda entry: entry.endswith(".txt"), os.listdir(ANNOTATION_FOLDER))
)

def get_base(filename):
    """Return the part of ``filename`` used to pair a recording with its table.

    The name is cut at the first occurrence of ".Table", then of ".txt",
    then of ".wav" (in that order); markers that are absent leave the
    name unchanged.
    """
    stem = filename
    for marker in (".Table", ".txt", ".wav"):
        # Keep only the text that precedes the first occurrence of the marker.
        stem = stem.partition(marker)[0]
    return stem

# Index both listings by base name so every recording can be matched to its
# annotation table (when two files share a base name, the later one wins).
audio_bases = dict((get_base(wav_name), wav_name) for wav_name in audio_files)
annot_bases = dict((get_base(table_name), table_name) for table_name in annotation_files)

# One record per saved segment, across all recordings; feeds the master manifest.
all_segments = []

# MAIN LOOP
#
# For every recording that has a matching annotation table:
#   1. walk the audio in SEGMENT_LENGTH-second windows,
#   2. drop windows that touch an "unnecessary" region or are shorter than
#      MIN_SEGMENT_SECONDS,
#   3. label each remaining window from the annotations it overlaps,
#   4. keep every eagle window plus an equally sized random sample of
#      no-eagle windows, write them to OUTPUT_FOLDER/<label>/ and record them
#      in a per-file manifest and in `all_segments`.

BEGIN_COL = "Begin Time (s)"
END_COL = "End Time (s)"
NO_EAGLE_LABEL = "NoEagleSound"
MIN_SEGMENT_SECONDS = 0.5  # windows shorter than this are discarded

# Seeded generator + sorted iteration below: re-running the cell on the same
# inputs selects the same no-eagle windows instead of a fresh random subset.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)


def overlaps(a_start, a_end, b_start, b_end):
    """Return True when intervals (a_start, a_end) and (b_start, b_end) intersect.

    Intervals that merely touch at an endpoint do not count as overlapping.
    """
    return a_start < b_end and a_end > b_start


for base_name, audio_file in sorted(audio_bases.items()):
    if base_name not in annot_bases:
        print(f"No annotation file for {audio_file}, skipping...")
        continue

    annotation_file = annot_bases[base_name]
    audio_path = os.path.join(AUDIO_FOLDER, audio_file)
    annot_path = os.path.join(ANNOTATION_FOLDER, annotation_file)

    print(f"\nProcessing: {audio_file}")
    print(f"   ↳ Using annotations from: {annotation_file}")

    # Load and validate the annotation table first, so a recording is only
    # decoded when its table is actually usable.
    annotations = pd.read_csv(
        annot_path,
        sep="\t",
        engine="python",
        comment="#",
        skip_blank_lines=True
    )
    annotations.columns = [c.strip() for c in annotations.columns]

    missing_cols = [
        c for c in (COL_TYPE, BEGIN_COL, END_COL) if c not in annotations.columns
    ]
    if missing_cols:
        for missing in missing_cols:
            print(f"No '{missing}' column found in {annotation_file}, skipping...")
        continue

    # Load audio at its native sampling rate
    y, sr = librosa.load(audio_path, sr=None)
    total_duration = librosa.get_duration(y=y, sr=sr)

    # DEBUG: Print unique types to verify
    print(f"Annotation types found: {annotations[COL_TYPE].unique()}")

    # Rows whose type contains "necessar" (matches both the "unecessary" and
    # "unnecessary" spellings, case-insensitive) mark regions to exclude;
    # every other row is treated as an eagle annotation.
    is_unnecessary = (
        annotations[COL_TYPE].astype(str).str.lower().str.contains("necessar", na=False)
    )
    unnecessary = annotations[is_unnecessary].copy()
    valid_annots = annotations[~is_unnecessary].copy()

    print(f"Unnecessary regions: {len(unnecessary)}")
    print(f"Valid eagle annotations: {len(valid_annots)}")

    # Convert times to float
    for col in (BEGIN_COL, END_COL):
        valid_annots[col] = valid_annots[col].astype(float)
        unnecessary[col] = unnecessary[col].astype(float)

    # (begin, end) pairs of the excluded regions, materialised once per file
    # instead of iterating the frame again for every window.
    unnecessary_spans = list(zip(unnecessary[BEGIN_COL], unnecessary[END_COL]))

    seg_idx = 0
    # "EagleSound" is the fallback label for a single type that is neither
    # "long" nor "short"; it is tracked here so the totals below include it.
    stats = {
        "Eagle_long": 0,
        "Eagle_short": 0,
        "EagleMixed": 0,
        "EagleSound": 0,
        NO_EAGLE_LABEL: 0,
    }

    # List to store manifest data for this audio file
    file_segments = []

    # FIRST PASS: Collect all potential segments
    eagle_segments = []
    no_eagle_segments = []

    for start in np.arange(0, total_duration, SEGMENT_LENGTH):
        start = float(start)
        end = min(start + SEGMENT_LENGTH, total_duration)

        # Skip if overlaps unnecessary region
        if any(overlaps(start, end, b, e) for b, e in unnecessary_spans):
            continue

        # Skip if segment is too short (less than MIN_SEGMENT_SECONDS)
        start_sample = int(start * sr)
        end_sample = int(end * sr)
        if len(y[start_sample:end_sample]) < MIN_SEGMENT_SECONDS * sr:
            continue

        # Annotations that intersect this window
        overlapped = valid_annots[
            (valid_annots[BEGIN_COL] < end) &
            (valid_annots[END_COL] > start)
        ]

        if overlapped.empty:
            no_eagle_segments.append({
                'start': start,
                'end': end,
                'label': NO_EAGLE_LABEL,
                'annotation_details': []
            })
            continue

        # Determine the label type
        types = overlapped[COL_TYPE].unique()
        if len(types) > 1:
            label_text = "EagleMixed"
        else:
            type_str = str(types[0]).lower()
            if "long" in type_str:
                label_text = "Eagle_long"
            elif "short" in type_str:
                label_text = "Eagle_short"
            else:
                label_text = "EagleSound"

        # Collect annotation details
        annotation_details = [
            {'type': a_type, 'begin': a_begin, 'end': a_end}
            for a_type, a_begin, a_end in zip(
                overlapped[COL_TYPE], overlapped[BEGIN_COL], overlapped[END_COL]
            )
        ]

        eagle_segments.append({
            'start': start,
            'end': end,
            'label': label_text,
            'annotation_details': annotation_details
        })

    # Calculate how many NoEagleSound segments we need (same as eagle segments)
    num_eagle = len(eagle_segments)
    num_no_eagle_needed = num_eagle

    print(f"Found {num_eagle} eagle segments")
    print(f"Found {len(no_eagle_segments)} potential no-eagle segments")
    print(f"Selecting {num_no_eagle_needed} no-eagle segments for balance")

    # Randomly sample NoEagleSound segments if we have more than needed
    if len(no_eagle_segments) > num_no_eagle_needed:
        selected_no_eagle = sorted(
            rng.choice(
                len(no_eagle_segments),
                size=num_no_eagle_needed,
                replace=False
            )
        )
        no_eagle_segments = [no_eagle_segments[i] for i in selected_no_eagle]
    elif len(no_eagle_segments) < num_no_eagle_needed:
        print(f"   ⚠️  Warning: Only {len(no_eagle_segments)} no-eagle segments available, need {num_no_eagle_needed}")

    # SECOND PASS: Save all selected segments
    all_selected_segments = eagle_segments + no_eagle_segments

    for segment_data in all_selected_segments:
        start = segment_data['start']
        end = segment_data['end']
        label_text = segment_data['label']
        annotation_details = segment_data['annotation_details']

        # Prepare subfolder for label
        label_folder = os.path.join(OUTPUT_FOLDER, label_text)
        os.makedirs(label_folder, exist_ok=True)

        # Extract segment
        start_sample = int(start * sr)
        end_sample = int(end * sr)
        segment = y[start_sample:end_sample]

        # Save segment to label-specific folder
        filename = f"{base_name}_seg{seg_idx:04d}.wav"
        save_path = os.path.join(label_folder, filename)
        sf.write(save_path, segment, sr)

        stats[label_text] = stats.get(label_text, 0) + 1

        # Record segment info for manifest.
        # str() guards the join against non-string type cells (e.g. NaN).
        segment_info = {
            'source_audio': audio_file,
            'segment_filename': filename,
            'label': label_text,
            'segment_start_time': round(start, 3),
            'segment_end_time': round(end, 3),
            'segment_duration': round(end - start, 3),
            'num_annotations': len(annotation_details),
            'annotation_types': '; '.join(str(a['type']) for a in annotation_details) if annotation_details else 'None',
            'annotation_times': '; '.join(f"{a['begin']:.3f}-{a['end']:.3f}" for a in annotation_details) if annotation_details else 'None'
        }

        file_segments.append(segment_info)
        all_segments.append(segment_info)

        seg_idx += 1

        print(f" Saved: {label_text}/{filename}")

    # Print summary for this audio file
    print(f"\n Summary for {audio_file}:")
    for label, count in stats.items():
        if count > 0:
            print(f"   • {label}: {count} segments")

    # Every label except NoEagleSound counts as an eagle segment.
    no_eagle_total = stats.get(NO_EAGLE_LABEL, 0)
    eagle_total = sum(n for name, n in stats.items() if name != NO_EAGLE_LABEL)
    print(f"   • Total Eagle: {eagle_total} segments")
    print(f"   • Total No Eagle: {no_eagle_total} segments")
    print(f"   • Balance: {eagle_total}/{no_eagle_total} (Eagle/NoEagle)")

    # Save per-file manifest
    if file_segments:
        file_manifest_df = pd.DataFrame(file_segments)
        manifest_filename = f"{base_name}_manifest.csv"
        manifest_path = os.path.join(OUTPUT_FOLDER, manifest_filename)
        file_manifest_df.to_csv(manifest_path, index=False)
        print(f" Manifest saved: {manifest_filename}")

# Save master manifest with all segments
# Write the combined manifest (one row per saved segment, all recordings)
# and report the overall label distribution.
if all_segments:
    master_manifest_df = pd.DataFrame(all_segments)
    master_manifest_path = os.path.join(OUTPUT_FOLDER, "master_manifest.csv")
    master_manifest_df.to_csv(master_manifest_path, index=False)
    print(f"\n Master manifest saved: master_manifest.csv")

    # Print overall statistics
    print(f"\n Overall Statistics:")
    print(f"   • Total segments created: {len(all_segments)}")
    label_counts = master_manifest_df['label'].value_counts()
    for label, count in label_counts.items():
        print(f"   • {label}: {count} segments")

    # Every label other than "NoEagleSound" is an eagle label; this includes
    # the fallback "EagleSound", which a fixed list of label names would miss.
    no_eagle_total = int(label_counts.get("NoEagleSound", 0))
    eagle_total = int(label_counts.sum()) - no_eagle_total
    print(f"   • Total Eagle: {eagle_total} segments")
    print(f"   • Total No Eagle: {no_eagle_total} segments")
    if eagle_total + no_eagle_total > 0:
        eagle_pct = (eagle_total / (eagle_total + no_eagle_total)) * 100
        print(f"   • Balance: {eagle_pct:.1f}% Eagle / {100-eagle_pct:.1f}% No Eagle")

print("\n All audio and annotation files segmented successfully!")


🎧 Processing: converted_18537.wav
   ↳ Using annotations from: converted_18537.txt
   📋 Annotation types found: ['Unecessary' 'PhilEagle_long' 'PhilEagle_short']
   🚫 Unnecessary regions: 2
   ✅ Valid eagle annotations: 30
   📊 Found 16 eagle segments
   📊 Found 76 potential no-eagle segments
   ⚖️  Selecting 16 no-eagle segments for balance
✅ Saved: EagleMixed/converted_18537_seg0000.wav
✅ Saved: Eagle_short/converted_18537_seg0001.wav
✅ Saved: Eagle_long/converted_18537_seg0002.wav
✅ Saved: EagleMixed/converted_18537_seg0003.wav
✅ Saved: EagleMixed/converted_18537_seg0004.wav
✅ Saved: EagleMixed/converted_18537_seg0005.wav
✅ Saved: EagleMixed/converted_18537_seg0006.wav
✅ Saved: Eagle_long/converted_18537_seg0007.wav
✅ Saved: Eagle_long/converted_18537_seg0008.wav
✅ Saved: Eagle_long/converted_18537_seg0009.wav
✅ Saved: Eagle_long/converted_18537_seg0010.wav
✅ Saved: Eagle_long/converted_18537_seg0011.wav
‚úÖ Saved: Eagle_short/converte