### **Imports**

In [None]:
# Install the third-party packages imported by the next cell into the kernel's environment.
%pip install pandas librosa soundfile numpy

In [None]:
import os
import pandas as pd
import librosa
import soundfile as sf
import numpy as np

### **Folder Paths**

In [3]:
# All three working folders live under the same dataset directory, so the
# shared prefix is spelled out once and the folders are derived from it.
DATASET_ROOT = "D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet"

AUDIO_FOLDER = f"{DATASET_ROOT}/PHEA"                    # input .wav recordings
ANNOTATION_FOLDER = f"{DATASET_ROOT}/SelectionTables"    # tab-separated selection tables (.txt)
OUTPUT_FOLDER = f"{DATASET_ROOT}/SegmentedAudios"        # segmented clips and manifests are written here

### **Segmentation Settings**

In [None]:
# Length of every exported clip, in seconds.
SEGMENT_LENGTH = 5

# Column names expected in the annotation (selection) tables.
COL_TYPE = "Type"        # call type / label of each annotation
COL_QUALITY = "Quality"  # optional quality rating of each annotation

# How to position the SEGMENT_LENGTH-second window relative to the annotation.
# Options: "start", "center", "end"
WINDOW_POSITION = "start"  # Start the window at the annotation's Begin Time

# Fail fast on a bad configuration: the segmentation loop's final `else`
# branch handles every value other than "start"/"center" as "end", so a typo
# here would otherwise silently change how every clip is cut.
if WINDOW_POSITION not in ("start", "center", "end"):
    raise ValueError(
        f"WINDOW_POSITION must be 'start', 'center' or 'end', got {WINDOW_POSITION!r}"
    )
if SEGMENT_LENGTH <= 0:
    raise ValueError(f"SEGMENT_LENGTH must be positive, got {SEGMENT_LENGTH!r}")

### **Setup and Main Loop**

In [None]:
# Make sure the destination exists before anything is written to it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Candidate inputs: recordings end in ".wav", annotation tables end in ".txt"
# (the suffix match is case-sensitive).
audio_files = list(filter(lambda name: name.endswith(".wav"), os.listdir(AUDIO_FOLDER)))
annotation_files = list(filter(lambda name: name.endswith(".txt"), os.listdir(ANNOTATION_FOLDER)))

def get_base(filename):
    """Return the part of `filename` used to pair a recording with its annotation table.

    The name is cut at the first ".Table", then the remainder at the first
    ".txt", then at the first ".wav"; whatever precedes those markers is
    returned. A name containing none of the markers is returned unchanged.
    """
    stem = filename
    for marker in (".Table", ".txt", ".wav"):
        # partition() keeps everything before the first occurrence of marker
        stem = stem.partition(marker)[0]
    return stem

# Index both file lists by base name so each recording can be matched to its
# annotation table; if two files share a base name, the later one wins.
audio_bases = dict((get_base(name), name) for name in audio_files)
annot_bases = dict((get_base(name), name) for name in annotation_files)

# Manifest rows for every saved segment, accumulated across all recordings.
all_segments = list()

# ===============================
# MAIN LOOP
# ===============================
for base_name, audio_file in audio_bases.items():
    # For each recording that has an annotation table:
    #   1. load the audio and the table,
    #   2. cut one fixed-length window per usable annotation (eagle segments),
    #   3. draw the same number of random windows that touch no annotation
    #      (no-eagle segments),
    #   4. write every window as a .wav into a per-label folder and record it
    #      in the per-file and master manifests.
    if base_name not in annot_bases:
        print(f"‚ö†Ô∏è No annotation file for {audio_file}, skipping...")
        continue

    annotation_file = annot_bases[base_name]
    audio_path = os.path.join(AUDIO_FOLDER, audio_file)
    annot_path = os.path.join(ANNOTATION_FOLDER, annotation_file)

    print(f"\nüéß Processing: {audio_file}")
    print(f"   ‚Ü≥ Using annotations from: {annotation_file}")

    # Load audio (sr=None keeps the file's own sampling rate)
    y, sr = librosa.load(audio_path, sr=None)
    total_duration = librosa.get_duration(y=y, sr=sr)

    # Load annotation table (tab-separated; lines starting with '#' are ignored)
    annotations = pd.read_csv(
        annot_path,
        sep="\t",
        engine="python",
        comment="#",
        skip_blank_lines=True
    )
    annotations.columns = [c.strip() for c in annotations.columns]

    if COL_TYPE not in annotations.columns:
        print(f"‚ö†Ô∏è No '{COL_TYPE}' column found in {annotation_file}, skipping...")
        continue

    # Check if Quality column exists
    has_quality = COL_QUALITY in annotations.columns
    has_selection = 'Selection' in annotations.columns

    print(f"   üìã Annotation types found: {annotations[COL_TYPE].unique()}")
    if has_quality:
        print(f"   üìã Quality levels found: {annotations[COL_QUALITY].unique()}")

    # Split off ambiguous labels. They never produce an eagle segment, but they
    # are kept as exclusion zones so that a random "NoEagleSound" window cannot
    # be drawn on top of audio that might contain an eagle call.
    is_ambiguous = annotations[COL_TYPE].astype(str).str.contains(
        r'\?|ambiguous', case=False, na=False
    )
    ambiguous = annotations[is_ambiguous].copy()
    annotations = annotations[~is_ambiguous].copy()

    # Separate unnecessary parts ("necessar" matches "Unnecessary" as well as
    # the misspelling "Unecessary")
    is_unnecessary = annotations[COL_TYPE].astype(str).str.lower().str.contains(
        "necessar", na=False
    )
    unnecessary = annotations[is_unnecessary].copy()
    valid_annots = annotations[~is_unnecessary].copy()

    print(f"   üö´ Unnecessary regions: {len(unnecessary)}")
    print(f"   ‚úÖ Valid eagle annotations: {len(valid_annots)}")

    # Convert times to float
    time_cols = ["Begin Time (s)", "End Time (s)"]
    for frame in (valid_annots, unnecessary):
        if not frame.empty:
            for col in time_cols:
                frame[col] = frame[col].astype(float)
    if not ambiguous.empty:
        # Best effort for rows that are only used as exclusion zones: an
        # unparsable time becomes NaN, which never compares as overlapping.
        for col in time_cols:
            ambiguous[col] = pd.to_numeric(ambiguous[col], errors="coerce")

    if has_selection:
        valid_annots['Selection'] = valid_annots['Selection'].astype(int)

    # (begin, end) pairs of the regions that eagle windows must not touch
    unnecessary_spans = (
        list(zip(unnecessary["Begin Time (s)"], unnecessary["End Time (s)"]))
        if not unnecessary.empty else []
    )

    # Ranking used to pick the best quality among the annotations in a window
    quality_order = {"High": 3, "Medium": 2, "Low": 1}

    seg_idx = 0
    stats = {}
    file_segments = []

    # Process each eagle annotation: place one window per annotation according
    # to WINDOW_POSITION
    eagle_segments = []
    used_regions = []  # Track which time regions we've already used

    for row_idx, annot in valid_annots.iterrows():
        annot_start = annot["Begin Time (s)"]
        annot_end = annot["End Time (s)"]
        annot_type = str(annot[COL_TYPE])

        # Calculate segment window based on position preference
        if WINDOW_POSITION == "start":
            seg_start = annot_start
            seg_end = min(annot_start + SEGMENT_LENGTH, total_duration)
        elif WINDOW_POSITION == "center":
            annot_mid = (annot_start + annot_end) / 2
            seg_start = max(0, annot_mid - SEGMENT_LENGTH / 2)
            seg_end = min(seg_start + SEGMENT_LENGTH, total_duration)
        else:  # "end"
            seg_end = annot_end
            seg_start = max(0, seg_end - SEGMENT_LENGTH)

        # Check if this segment overlaps with unnecessary regions
        if any(
            seg_start < unn_end and seg_end > unn_begin
            for unn_begin, unn_end in unnecessary_spans
        ):
            print(f"   ‚è≠Ô∏è  Skipping annotation at {annot_start:.2f}s (overlaps unnecessary region)")
            continue

        # Check if this segment significantly overlaps with already used
        # regions (more than 50% of SEGMENT_LENGTH shared with an earlier window)
        overlaps_used = any(
            max(0, min(seg_end, used_end) - max(seg_start, used_start)) > SEGMENT_LENGTH * 0.5
            for used_start, used_end in used_regions
        )
        if overlaps_used:
            print(f"   ‚è≠Ô∏è  Skipping annotation at {annot_start:.2f}s (overlaps with existing segment)")
            continue

        # Mark this region as used
        used_regions.append((seg_start, seg_end))

        # Find all annotations that overlap with this segment window
        overlapped = valid_annots[
            (valid_annots["Begin Time (s)"] < seg_end) &
            (valid_annots["End Time (s)"] > seg_start)
        ]
        if overlapped.empty:
            # A zero-length annotation, or one starting at/after the end of the
            # audio, does not satisfy the strict overlap test above. Fall back
            # to the triggering row so the label and quality are still derived
            # from it instead of failing on an empty list further down.
            overlapped = valid_annots.loc[[row_idx]]

        # Collect annotation details
        annotation_details = []
        types = []
        qualities = []
        selection_numbers = []

        for _, overlap_annot in overlapped.iterrows():
            overlap_type = str(overlap_annot[COL_TYPE]).lower()

            if "long" in overlap_type:
                call_type = "long"
            elif "short" in overlap_type:
                call_type = "short"
            else:
                call_type = "unknown"

            types.append(call_type)

            # Get quality, normalised to "High"/"Medium"/"Low" so that values
            # such as "high" or "HIGH" rank correctly and share one folder;
            # anything else (including a missing value) counts as "Medium".
            if has_quality:
                quality = str(overlap_annot[COL_QUALITY]).strip().capitalize()
                if quality not in quality_order:
                    quality = "Medium"
            else:
                quality = "Medium"

            qualities.append(quality)

            selection = int(overlap_annot['Selection']) if has_selection else None
            if has_selection:
                selection_numbers.append(selection)

            annotation_details.append({
                'selection': selection,
                'type': overlap_annot[COL_TYPE],
                'call_type': call_type,
                'quality': quality,
                'begin': overlap_annot["Begin Time (s)"],
                'end': overlap_annot["End Time (s)"]
            })

        # Determine label
        unique_types = set(types)
        if len(unique_types) > 1:
            label_base = "EagleMixed"
        elif "long" in unique_types:
            label_base = "Eagle_long"
        elif "short" in unique_types:
            label_base = "Eagle_short"
        else:
            label_base = "EagleSound"

        # Determine quality (use highest)
        segment_quality = max(qualities, key=lambda q: quality_order.get(q, 0))

        label_text = f"{label_base}_{segment_quality}"

        eagle_segments.append({
            'start': seg_start,
            'end': seg_end,
            'label_base': label_base,
            'label_full': label_text,
            'quality': segment_quality,
            'selection_numbers': selection_numbers,
            'annotation_details': annotation_details,
            'trigger_annotation': {
                'start': annot_start,
                'end': annot_end,
                'type': annot_type
            }
        })

    print(f"   üìä Eagle segments created: {len(eagle_segments)}")

    # Create equal number of NoEagleSound segments
    no_eagle_segments = []
    num_no_eagle_needed = len(eagle_segments)

    if num_no_eagle_needed > 0:
        # Every annotated interval (eagle, unnecessary, ambiguous) is off
        # limits for a negative window. Built once, outside the retry loop.
        excluded_spans = [
            span
            for frame in (valid_annots, unnecessary, ambiguous)
            if not frame.empty
            for span in zip(frame["Begin Time (s)"], frame["End Time (s)"])
        ]

        # Generate random no-eagle segments (rejection sampling, bounded retries)
        attempts = 0
        max_attempts = num_no_eagle_needed * 10
        latest_start = max(0, total_duration - SEGMENT_LENGTH)

        while len(no_eagle_segments) < num_no_eagle_needed and attempts < max_attempts:
            attempts += 1

            # Random start time
            random_start = np.random.uniform(0, latest_start)
            random_end = min(random_start + SEGMENT_LENGTH, total_duration)

            # Reject if it overlaps any annotated interval
            if any(
                random_start < span_end and random_end > span_begin
                for span_begin, span_end in excluded_spans
            ):
                continue

            # Reject if it overlaps an already accepted no-eagle segment
            if any(
                random_start < existing['end'] and random_end > existing['start']
                for existing in no_eagle_segments
            ):
                continue

            # Valid no-eagle segment
            no_eagle_segments.append({
                'start': random_start,
                'end': random_end,
                'label': "NoEagleSound"
            })

        print(f"   üìä No-eagle segments created: {len(no_eagle_segments)}")

    # Save all segments
    all_selected_segments = eagle_segments + no_eagle_segments

    for segment_data in all_selected_segments:
        start = segment_data['start']
        end = segment_data['end']

        if 'label_full' in segment_data:
            # Eagle segment
            label_text = segment_data['label_full']
            label_base = segment_data['label_base']
            quality = segment_data['quality']
            selection_numbers = segment_data.get('selection_numbers', [])
            annotation_details = segment_data['annotation_details']
            trigger = segment_data['trigger_annotation']
        else:
            # No eagle segment
            label_text = segment_data['label']
            label_base = "NoEagleSound"
            quality = None
            selection_numbers = []
            annotation_details = []
            trigger = None

        # Extract segment
        start_sample = int(start * sr)
        end_sample = int(end * sr)
        segment = y[start_sample:end_sample]

        # Skip if too short (under half a second). Checked before the label
        # folder is created so a skipped clip does not leave an empty folder.
        if len(segment) < 0.5 * sr:
            continue

        # Prepare subfolder
        label_folder = os.path.join(OUTPUT_FOLDER, label_text)
        os.makedirs(label_folder, exist_ok=True)

        # Save segment
        filename = f"{base_name}_seg{seg_idx:04d}.wav"
        save_path = os.path.join(label_folder, filename)
        sf.write(save_path, segment, sr)

        stats[label_text] = stats.get(label_text, 0) + 1

        # Create manifest entry
        selection_list_str = ', '.join(map(str, selection_numbers)) if selection_numbers else 'None'

        # Per-call timing is only spelled out for mixed segments
        individual_times = []
        if label_base == "EagleMixed" and annotation_details:
            for detail in annotation_details:
                sel_num = detail.get('selection', '?')
                individual_times.append(
                    f"Sel{sel_num}({detail['call_type']}): {detail['begin']:.3f}-{detail['end']:.3f}s"
                )
        individual_times_str = '; '.join(individual_times) if individual_times else 'N/A'

        segment_info = {
            'source_audio': audio_file,
            'segment_filename': filename,
            'label': label_text,
            'label_category': label_base,
            'quality': quality if quality else 'N/A',
            'output_folder': label_text,
            'segment_start_time': round(start, 3),
            'segment_end_time': round(end, 3),
            'segment_duration': round(end - start, 3),
            'trigger_annotation_start': round(trigger['start'], 3) if trigger else 'N/A',
            'trigger_annotation_end': round(trigger['end'], 3) if trigger else 'N/A',
            'num_annotations': len(annotation_details),
            'selection_numbers': selection_list_str,
            # str() guards against a non-string Type value, which join() rejects
            'annotation_types': '; '.join([str(a['type']) for a in annotation_details]) if annotation_details else 'None',
            'annotation_times': '; '.join([f"{a['begin']:.3f}-{a['end']:.3f}" for a in annotation_details]) if annotation_details else 'None',
            'individual_call_details': individual_times_str
        }

        file_segments.append(segment_info)
        all_segments.append(segment_info)

        seg_idx += 1

        trigger_info = f"triggered by {trigger['start']:.2f}s" if trigger else "random"
        print(f"‚úÖ Saved: {label_text}/{filename} ({trigger_info}, Selections: {selection_list_str})")

    # Print summary
    print(f"\nüìä Summary for {audio_file}:")
    for label, count in stats.items():
        print(f"   ‚Ä¢ {label}: {count} segments")

    # Save per-file manifest
    if file_segments:
        file_manifest_df = pd.DataFrame(file_segments)
        manifest_filename = f"{base_name}_manifest.csv"
        manifest_path = os.path.join(OUTPUT_FOLDER, manifest_filename)
        file_manifest_df.to_csv(manifest_path, index=False)
        print(f"üìã Manifest saved: {manifest_filename}")

# Write the combined manifest for every recording and report overall counts.
if all_segments:
    master_manifest_df = pd.DataFrame(all_segments)
    master_manifest_path = os.path.join(OUTPUT_FOLDER, "master_manifest.csv")
    master_manifest_df.to_csv(master_manifest_path, index=False)
    print("\nüìã Master manifest saved: master_manifest.csv")

    print("\nüìä Overall Statistics:")
    print(f"   ‚Ä¢ Total segments created: {len(all_segments)}")

    # Per-quality counts; levels with no segments are not listed.
    print("\n   Eagle segments by quality:")
    quality_column = master_manifest_df['quality']
    for level in ("High", "Medium", "Low"):
        level_count = int((quality_column == level).sum())
        if level_count > 0:
            print(f"   ‚Ä¢ {level}: {level_count} segments")

    # Per-label counts, in value_counts() order.
    print("\n   By label:")
    label_counts = master_manifest_df['label'].value_counts()
    for label_name, n_segments in label_counts.items():
        print(f"   ‚Ä¢ {label_name}: {n_segments} segments")

    # Everything whose category is not 'NoEagleSound' counts as an eagle segment.
    is_no_eagle = master_manifest_df['label_category'] == 'NoEagleSound'
    no_eagle_total = int(is_no_eagle.sum())
    eagle_total = int((~is_no_eagle).sum())
    print(f"\n   ‚Ä¢ Total Eagle: {eagle_total} segments")
    print(f"   ‚Ä¢ Total No Eagle: {no_eagle_total} segments")

    combined_total = eagle_total + no_eagle_total
    if combined_total > 0:
        eagle_pct = (eagle_total / combined_total) * 100
        print(f"   ‚Ä¢ Balance: {eagle_pct:.1f}% Eagle / {100-eagle_pct:.1f}% No Eagle")

# Printed unconditionally, whether or not any segment was produced.
print("\nüéâ All audio files segmented successfully!")


üéß Processing: converted_18537.wav
   ‚Ü≥ Using annotations from: converted_18537.txt
   üìã Annotation types found: ['Unecessary' 'PhilEagle_long' 'PhilEagle_short']
   üö´ Unnecessary regions: 2
   ‚úÖ Valid eagle annotations: 30
   üìä Found 16 eagle segments
   üìä Found 76 potential no-eagle segments
   ‚öñÔ∏è  Selecting 16 no-eagle segments for balance
‚úÖ Saved: EagleMixed/converted_18537_seg0000.wav
‚úÖ Saved: Eagle_short/converted_18537_seg0001.wav
‚úÖ Saved: Eagle_long/converted_18537_seg0002.wav
‚úÖ Saved: EagleMixed/converted_18537_seg0003.wav
‚úÖ Saved: EagleMixed/converted_18537_seg0004.wav
‚úÖ Saved: EagleMixed/converted_18537_seg0005.wav
‚úÖ Saved: EagleMixed/converted_18537_seg0006.wav
‚úÖ Saved: Eagle_long/converted_18537_seg0007.wav
‚úÖ Saved: Eagle_long/converted_18537_seg0008.wav
‚úÖ Saved: Eagle_long/converted_18537_seg0009.wav
‚úÖ Saved: Eagle_long/converted_18537_seg0010.wav
‚úÖ Saved: Eagle_long/converted_18537_seg0011.wav
‚úÖ Saved: Eagle_short/converte