### **Imports**

In [None]:
%pip install pandas librosa soundfile numpy

In [1]:
import os
import pandas as pd
import librosa
import soundfile as sf
import numpy as np

### **Folder Paths**

In [2]:
# All three working folders live side by side under one dataset directory:
# the input recordings, their annotation tables, and the generated clips.
_DATASET_ROOT = "D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/For DataSet"

AUDIO_FOLDER = _DATASET_ROOT + "/TestInput"
ANNOTATION_FOLDER = _DATASET_ROOT + "/TestTable"
OUTPUT_FOLDER = _DATASET_ROOT + "/TestOutput"

### **Segmentation Settings**

In [5]:
# Spreadsheet that maps each 'ML Catalog Number' to the 'State' it was recorded in.
METADATA_PATH = "D:/_3rd Year Class/1st Sem/Machine Learning/_ForLE/Spreadsheets/ML__2025-10-12T15-06_grpeag1_audio.csv"

# Column names expected in every annotation table.
COL_TYPE = "Type"
COL_QUALITY = "Quality"

# Length of every exported clip, in seconds.
SEGMENT_LENGTH = 5

# Hybrid windowing settings.
# NOTE(review): WINDOW_POSITION is not read by the code visible in this
# notebook section; the window is always centred on the annotation.
WINDOW_POSITION = "center"
# An annotation that starts less than this many seconds after the previous
# annotation ends is skipped.
MIN_ANNOTATION_GAP = 1.0
# A candidate window overlapping an existing one by more than this ratio is dropped.
OVERLAP_THRESHOLD = 0.8

### **Setup MainLoop**

In [None]:
# Make sure the output root exists before anything is written to it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Build `id_to_location`: catalog number (text) -> state name.
# Loading is best-effort: on any failure every file falls back to
# 'UnknownLocation' instead of aborting the run.
print("Loading Metadata...")
try:
    # Read the catalog number as text. If pandas infers a numeric dtype and the
    # column has blanks, it becomes float and the IDs turn into "123456.0",
    # which can never be found inside a filename.
    meta_df = pd.read_csv(METADATA_PATH, dtype={'ML Catalog Number': str})

    # Only rows that have both an ID and a state can contribute a mapping;
    # a missing value would otherwise become a "nan" key or a NaN location.
    located_rows = meta_df.dropna(subset=['ML Catalog Number', 'State'])
    id_to_location = {}
    for raw_id, raw_state in zip(located_rows['ML Catalog Number'], located_rows['State']):
        catalog_id = str(raw_id).strip()
        state_name = str(raw_state).strip()
        if catalog_id and state_name:
            id_to_location[catalog_id] = state_name

    # Keep the column as strings on the frame itself, as before.
    meta_df['ML Catalog Number'] = meta_df['ML Catalog Number'].astype(str)
    print(f"‚úÖ Loaded locations for {len(id_to_location)} files.")
except Exception as e:
    print(f"‚ùå Error loading metadata CSV: {e}")
    print("   Will default to 'UnknownLocation' for all files.")
    id_to_location = {}

# Collect the candidate inputs by extension (case-sensitive match on the suffix).
audio_files = list(filter(lambda entry: entry.endswith(".wav"), os.listdir(AUDIO_FOLDER)))
annotation_files = list(filter(lambda entry: entry.endswith(".txt"), os.listdir(ANNOTATION_FOLDER)))

def get_base(filename):
    """Return the part of `filename` before the first '.Table', '.txt' or '.wav'.

    The three markers are cut away in that order, so an annotation table and
    its recording reduce to the same key when they share a leading name.
    """
    stem = filename
    for marker in (".Table", ".txt", ".wav"):
        # partition()[0] is everything before the first occurrence of the
        # marker, or the whole string when the marker is absent.
        stem = stem.partition(marker)[0]
    return stem

# Index both listings by base name so a recording can be paired with its table.
audio_bases = dict(zip(map(get_base, audio_files), audio_files))
annot_bases = dict(zip(map(get_base, annotation_files), annotation_files))

# Run-wide accumulators shared by every file processed in the main loop.
all_segments = []        # every segment that was actually written to disk
segment_group_id = 0     # next unused overlap-group id
no_eagle_deficit = 0     # background clips still owed from earlier files

# ===============================
# HELPER FUNCTIONS
# ===============================
def calculate_overlap(start1, end1, start2, end2):
    """Return how much two intervals overlap, relative to the shorter one.

    The shared duration is divided by each interval's own length and the
    larger of the two ratios is returned. If either interval has zero
    length the result is 0.
    """
    len_first = end1 - start1
    len_second = end2 - start2
    if len_first == 0 or len_second == 0:
        return 0

    # Disjoint intervals give a negative difference, which is clamped to 0.
    shared = max(0, min(end1, end2) - max(start1, start2))
    ratio_first = shared / len_first
    ratio_second = shared / len_second
    return max(ratio_first, ratio_second)

def find_location_for_file(filename, lookup_dict):
    """Return the location whose catalog ID appears in `filename`.

    Args:
        filename: Audio file name; a '.wav' suffix is ignored for matching.
        lookup_dict: Mapping of catalog ID -> location name.

    Returns:
        The location of the longest catalog ID found inside the file name,
        or "UnknownLocation" when nothing usable matches.

    Unusable entries are skipped: empty IDs (an empty string is a substring
    of every name), the literal "nan" left behind by stringified missing
    values, and locations that are not non-empty strings. Preferring the
    longest ID stops a short ID such as "123" from shadowing "12345" just
    because it comes first in the mapping.
    """
    filename_clean = filename.replace('.wav', '')
    best_id = ""
    best_location = "UnknownLocation"
    for catalog_id, location in lookup_dict.items():
        key = str(catalog_id).strip()
        if not key or key.lower() == "nan":
            continue
        if not isinstance(location, str) or not location.strip():
            continue
        # Strict '>' keeps the first entry when two matching IDs tie in length.
        if key in filename_clean and len(key) > len(best_id):
            best_id = key
            best_location = location
    return best_location

# ===============================
# MAIN LOOP
# ===============================
# For every recording that has an annotation table:
#   1. cut one SEGMENT_LENGTH-second window around each usable annotation,
#   2. sample the same number of random background windows (plus any deficit
#      carried over from earlier files),
#   3. write the clips into OUTPUT_FOLDER/<location>/<label>/ and a per-file manifest,
#   4. pause so the user can review before the next file.
for base_name, audio_file in audio_bases.items():
    if base_name not in annot_bases:
        print(f"‚ö†Ô∏è No annotation file for {audio_file}, skipping...")
        continue

    annotation_file = annot_bases[base_name]
    audio_path = os.path.join(AUDIO_FOLDER, audio_file)
    annot_path = os.path.join(ANNOTATION_FOLDER, annotation_file)

    # LOCATION LOOKUP
    location_name = find_location_for_file(audio_file, id_to_location)
    # A missing 'State' in the metadata can surface here as a non-string (NaN);
    # treat it as unknown instead of crashing while iterating over it.
    if not isinstance(location_name, str):
        location_name = "UnknownLocation"
    # Keep only characters that are safe in a folder name.
    location_clean = "".join([c for c in location_name if c.isalnum() or c in (' ', '_', '-')]).strip()
    if not location_clean:
        # Nothing survived sanitising; avoid an empty path component.
        location_clean = "UnknownLocation"

    print(f"\n" + "="*60)
    print(f"üéß Processing: {audio_file}")
    print(f"üìç Location identified: {location_clean}")
    print(f"="*60)

    # Load audio at its native sample rate (sr=None disables resampling).
    y, sr = librosa.load(audio_path, sr=None)
    total_duration = librosa.get_duration(y=y, sr=sr)

    # Load annotation table (tab-separated; '#' lines and blank lines ignored).
    annotations = pd.read_csv(annot_path, sep="\t", engine="python", comment="#", skip_blank_lines=True)
    annotations.columns = [c.strip() for c in annotations.columns]

    if COL_TYPE not in annotations.columns:
        print(f"‚ö†Ô∏è No '{COL_TYPE}' column found, skipping...")
        continue

    has_quality = COL_QUALITY in annotations.columns

    # Clean annotations: drop rows whose type contains '?' or 'ambiguous',
    # then split off rows whose type contains "necessar" as regions to avoid.
    annotations = annotations[~annotations[COL_TYPE].astype(str).str.contains(r'\?|ambiguous', case=False, na=False)].copy()
    unnecessary = annotations[annotations[COL_TYPE].astype(str).str.lower().str.contains("necessar", na=False)].copy()
    valid_annots = annotations[~annotations[COL_TYPE].astype(str).str.lower().str.contains("necessar", na=False)].copy()

    for col in ["Begin Time (s)", "End Time (s)"]:
        if not valid_annots.empty: valid_annots[col] = valid_annots[col].astype(float)
        if not unnecessary.empty: unnecessary[col] = unnecessary[col].astype(float)

    if 'Selection' in valid_annots.columns:
        valid_annots['Selection'] = valid_annots['Selection'].astype(int)

    seg_idx = 0            # running index used in output file names
    stats = {}             # label -> number of clips written for this file
    file_segments = []     # segments written for this file (per-file manifest)
    eagle_segments = []
    created_segments = []  # windows accepted so far, for the overlap check

    # --- PROCESS EAGLE SEGMENTS ---
    if not valid_annots.empty:
        valid_annots_sorted = valid_annots.sort_values('Begin Time (s)')

        for idx, annot in valid_annots_sorted.iterrows():
            annot_start = annot["Begin Time (s)"]
            annot_end = annot["End Time (s)"]
            annot_type = str(annot[COL_TYPE])
            annot_selection = int(annot['Selection']) if 'Selection' in annot else idx

            # Gap check: skip an annotation that starts less than
            # MIN_ANNOTATION_GAP after the preceding annotation ends.
            # Annotations that overlap the preceding one (gap <= 0) are kept.
            if len(valid_annots_sorted) > 1:
                prev_annots = valid_annots_sorted[valid_annots_sorted['Begin Time (s)'] < annot_start]
                if not prev_annots.empty:
                    gap = annot_start - prev_annots.iloc[-1]['End Time (s)']
                    if gap < MIN_ANNOTATION_GAP and gap > 0:
                        continue

            # Window calculation (Center): centre the window on the annotation,
            # then shift it back inside the recording if it runs past either end.
            annot_mid = (annot_start + annot_end) / 2
            seg_start = max(0, annot_mid - SEGMENT_LENGTH / 2)
            seg_end = min(seg_start + SEGMENT_LENGTH, total_duration)
            if seg_end >= total_duration:
                seg_end = total_duration
                seg_start = max(0, seg_end - SEGMENT_LENGTH)

            # Check overlap with unnecessary regions
            skip_unnecessary = False
            if not unnecessary.empty:
                for _, unn_row in unnecessary.iterrows():
                    if seg_start < unn_row["End Time (s)"] and seg_end > unn_row["Begin Time (s)"]:
                        skip_unnecessary = True
                        break
            if skip_unnecessary:
                print(f"   ‚è≠Ô∏è  Skipping annotation at {annot_start:.2f}s (overlaps unnecessary region)")
                continue

            # Smart Overlap Check: drop near-duplicates of an accepted window;
            # moderately overlapping windows (> 0.3) share a group id instead.
            skip_overlap = False
            overlapping_group = None
            for existing_seg in created_segments:
                overlap_ratio = calculate_overlap(seg_start, seg_end, existing_seg['start'], existing_seg['end'])
                if overlap_ratio > OVERLAP_THRESHOLD:
                    skip_overlap = True
                    print(f"   ‚è≠Ô∏è  Skipping Selection {annot_selection} ({overlap_ratio*100:.0f}% overlap)")
                    break
                elif overlap_ratio > 0.3:
                    overlapping_group = existing_seg.get('group_id')

            if skip_overlap: continue

            if overlapping_group is None:
                current_group_id = segment_group_id
                segment_group_id += 1
            else:
                current_group_id = overlapping_group

            # Collect details of every valid annotation that falls in the window.
            overlapped = valid_annots[(valid_annots["Begin Time (s)"] < seg_end) & (valid_annots["End Time (s)"] > seg_start)]
            annotation_details = []
            types = []
            selection_numbers = []

            for _, row_ov in overlapped.iterrows():
                ov_start = row_ov["Begin Time (s)"]
                ov_end = row_ov["End Time (s)"]
                ov_type = str(row_ov[COL_TYPE]).lower()

                if "long" in ov_type: ctype = "long"
                elif "short" in ov_type: ctype = "short"
                else: ctype = "unknown"
                types.append(ctype)

                if 'Selection' in row_ov: selection_numbers.append(int(row_ov['Selection']))
                annotation_details.append({'type': row_ov[COL_TYPE], 'begin': ov_start, 'end': ov_end})

            # SIMPLIFIED LABELING: label from the triggering annotation, unless
            # the window contains more than one call type.
            annot_type_lower = annot_type.lower()
            if "long" in annot_type_lower: label_base = "Eagle_long"
            elif "short" in annot_type_lower: label_base = "Eagle_short"
            else: label_base = "EagleSound"

            if len(set(types)) > 1: label_base = "EagleMixed"

            # Quality defaults to "Medium" unless the table gives a recognised value.
            segment_quality = "Medium"
            if has_quality:
                q = str(annot[COL_QUALITY]).strip()
                if q.lower() in ["high", "medium", "low"]: segment_quality = q

            label_text = label_base

            segment_info = {
                'start': seg_start,
                'end': seg_end,
                'label_base': label_base,
                'label_full': label_text,
                # Same key the no-eagle segments use, so the manifests have one
                # label column that is populated for every row.
                'label': label_text,
                'quality': segment_quality,
                'group_id': current_group_id,
                'selection_numbers': selection_numbers,
                'location_id': location_clean
            }

            eagle_segments.append(segment_info)
            created_segments.append(segment_info)

            trigger_info = f"Sel{annot_selection}"
            group_msg = f" [Group {current_group_id}]"
            print(f"   ‚úÖ Created: {location_clean}/{label_text} ({trigger_info}){group_msg}")

    print(f"   üìä Eagle segments created: {len(eagle_segments)}")

    # --- DEFICIT LOGGING ---
    # Aim for one background clip per eagle clip, plus whatever earlier files
    # could not supply.
    num_no_eagle_needed = len(eagle_segments) + no_eagle_deficit
    print(f"   üìä Deficit from previous files: {no_eagle_deficit}")
    print(f"   ‚öñÔ∏è  Total no-eagle segments needed: {num_no_eagle_needed}")

    # --- PROCESS NO-EAGLE SEGMENTS ---
    # Rejection sampling: draw random windows and keep those that touch no
    # annotation, no unnecessary region and no already-chosen background window.
    # NOTE: the draw is unseeded, so the chosen windows differ between runs.
    no_eagle_segments = []
    if num_no_eagle_needed > 0:
        attempts = 0
        max_attempts = num_no_eagle_needed * 10

        while len(no_eagle_segments) < num_no_eagle_needed and attempts < max_attempts:
            attempts += 1
            r_start = np.random.uniform(0, max(0, total_duration - SEGMENT_LENGTH))
            r_end = min(r_start + SEGMENT_LENGTH, total_duration)

            overlaps_eagle = False
            if not valid_annots.empty:
                for _, annot in valid_annots.iterrows():
                    if r_start < annot["End Time (s)"] and r_end > annot["Begin Time (s)"]:
                        overlaps_eagle = True
                        break
            if overlaps_eagle: continue

            overlaps_unn = False
            if not unnecessary.empty:
                for _, u in unnecessary.iterrows():
                    if r_start < u["End Time (s)"] and r_end > u["Begin Time (s)"]:
                        overlaps_unn = True
                        break
            if overlaps_unn: continue

            overlaps_existing = False
            for ex in no_eagle_segments:
                if r_start < ex['end'] and r_end > ex['start']:
                    overlaps_existing = True
                    break
            if overlaps_existing: continue

            # Each background window gets its own group id.
            no_eagle_segments.append({
                'start': r_start, 'end': r_end,
                'label': "NoEagleSound",
                'group_id': segment_group_id,
                'location_id': location_clean
            })
            segment_group_id += 1

        print(f"   üìä No-eagle segments created: {len(no_eagle_segments)}")

    # Update Deficit: whatever could not be sampled here is owed by the next file.
    if len(no_eagle_segments) >= num_no_eagle_needed:
        no_eagle_deficit = 0
        print(f"   ‚úÖ Deficit cleared! All {num_no_eagle_needed} no-eagle segments created.")
    else:
        no_eagle_deficit = num_no_eagle_needed - len(no_eagle_segments)
        print(f"   ‚ö†Ô∏è  Could only create {len(no_eagle_segments)}/{num_no_eagle_needed} segments.")
        print(f"   ‚ö†Ô∏è  Carrying over deficit of {no_eagle_deficit} to next file.")

    # SAVE FILES
    all_selected_segments = eagle_segments + no_eagle_segments
    for seg in all_selected_segments:
        lbl = seg.get('label_full', seg.get('label'))
        loc = seg['location_id']

        save_folder = os.path.join(OUTPUT_FOLDER, loc, lbl)
        os.makedirs(save_folder, exist_ok=True)

        s_samp = int(seg['start'] * sr)
        e_samp = int(seg['end'] * sr)
        audio_seg = y[s_samp:e_samp]

        # Clips shorter than half a second are not written and do not appear
        # in any manifest.
        if len(audio_seg) >= 0.5 * sr:
            fname = f"{base_name}_seg{seg_idx:04d}.wav"
            sf.write(os.path.join(save_folder, fname), audio_seg, sr)

            stats[lbl] = stats.get(lbl, 0) + 1

            seg['segment_filename'] = fname
            seg['output_folder'] = os.path.join(loc, lbl)
            seg['source_audio'] = audio_file

            all_segments.append(seg)
            file_segments.append(seg)
            seg_idx += 1

    # --- PER FILE SUMMARY ---
    print(f"\nüìä Summary for {audio_file}:")
    for label, count in stats.items():
        if count > 0:
            print(f"   ‚Ä¢ {label}: {count} segments")

    eagle_count = len(eagle_segments)
    no_eagle_count = len(no_eagle_segments)
    print(f"   ‚Ä¢ Total Eagle: {eagle_count} segments")
    print(f"   ‚Ä¢ Total No Eagle: {no_eagle_count} segments")
    print(f"   ‚Ä¢ Balance for this file: {eagle_count}/{no_eagle_count} (Eagle/NoEagle)")

    # Save per-file manifest
    if file_segments:
        df = pd.DataFrame(file_segments)
        manifest_filename = f"{base_name}_manifest.csv"
        df.to_csv(os.path.join(OUTPUT_FOLDER, manifest_filename), index=False)
        print(f"üìã Manifest saved: {manifest_filename}")

    # ==========================================
    # PAUSE FOR REVIEW
    # Blocks on keyboard input after every file; 'q' ends the run early.
    # ==========================================
    print(f"\n" + "-"*40)
    print(f"FINISHED PROCESSING: {audio_file}")
    user_input = input(f"Press ENTER to process the next file (or type 'q' to quit): ")
    if user_input.lower() == 'q':
        print("\nüõë Stopping segmentation early per user request.")
        break
    print(f"-"*40 + "\n")


# --- MASTER SUMMARY WITH SEPARATE MANIFESTS ---
# Writes the combined manifest, the eagle-only and no-eagle-only manifests,
# and prints dataset statistics. Runs only if at least one clip was written.
if all_segments:
    # Create master DataFrame
    master_df = pd.DataFrame(all_segments)

    # Segment records may carry their label under 'label', 'label_full', or
    # both. Merge them into one fully populated 'label' column; otherwise the
    # eagle/no-eagle split below either raises KeyError (no 'label' column at
    # all) or sees NaN labels for every eagle row.
    label_sources = [master_df[col] for col in ('label', 'label_full') if col in master_df.columns]
    unified_label = label_sources[0]
    for fallback in label_sources[1:]:
        unified_label = unified_label.fillna(fallback)
    master_df['label'] = unified_label

    # Save complete master manifest
    master_path = os.path.join(OUTPUT_FOLDER, "master_manifest.csv")
    master_df.to_csv(master_path, index=False)
    print("\n" + "="*60)
    print("üìã MASTER MANIFEST SAVED: master_manifest.csv")
    print("="*60)

    # ==========================================
    # SEPARATE EAGLE AND NO-EAGLE MANIFESTS
    # ==========================================

    # Separate Eagle segments (all types)
    eagle_df = master_df[master_df['label'] != 'NoEagleSound'].copy()
    if not eagle_df.empty:
        eagle_manifest_path = os.path.join(OUTPUT_FOLDER, "master_manifest_PhilEagle.csv")
        eagle_df.to_csv(eagle_manifest_path, index=False)
        print(f"ü¶Ö EAGLE MANIFEST SAVED: master_manifest_PhilEagle.csv")
        print(f"   ‚Ä¢ Contains {len(eagle_df)} eagle segments")

    # Separate NoEagle segments
    no_eagle_df = master_df[master_df['label'] == 'NoEagleSound'].copy()
    if not no_eagle_df.empty:
        no_eagle_manifest_path = os.path.join(OUTPUT_FOLDER, "master_manifest_NoEagle.csv")
        no_eagle_df.to_csv(no_eagle_manifest_path, index=False)
        print(f"üîá NO-EAGLE MANIFEST SAVED: master_manifest_NoEagle.csv")
        print(f"   ‚Ä¢ Contains {len(no_eagle_df)} no-eagle segments")

    print("="*60)

    # ==========================================
    # DETAILED STATISTICS
    # ==========================================

    print(f"\nüìä Overall Statistics:")
    print(f"   ‚Ä¢ Total segments: {len(all_segments)}")
    print(f"   ‚Ä¢ Unique locations: {master_df['location_id'].nunique()}")
    print(f"   ‚Ä¢ Location names: {', '.join(master_df['location_id'].unique())}")

    print(f"\n   üìç Segments by Location:")
    location_counts = master_df['location_id'].value_counts()
    for location, count in location_counts.items():
        print(f"      ‚Ä¢ {location}: {count} segments")

    print(f"\n   ü¶Ö Eagle Segments by Type:")
    if not eagle_df.empty:
        eagle_type_counts = eagle_df['label'].value_counts()
        for label, count in eagle_type_counts.items():
            percentage = (count / len(eagle_df)) * 100
            print(f"      ‚Ä¢ {label}: {count} ({percentage:.1f}%)")

    print(f"\n   üîá No-Eagle Segments:")
    no_eagle_total = len(no_eagle_df)
    print(f"      ‚Ä¢ NoEagleSound: {no_eagle_total}")

    # ==========================================
    # BALANCE ANALYSIS
    # ==========================================

    eagle_total = len(eagle_df)

    print(f"\n   ‚öñÔ∏è  Dataset Balance:")
    print(f"      ‚Ä¢ Total Eagle: {eagle_total} segments")
    print(f"      ‚Ä¢ Total No-Eagle: {no_eagle_total} segments")

    # A non-zero deficit means the last processed file could not supply all
    # the background clips that were requested.
    if no_eagle_deficit > 0:
        print(f"\n   ‚ö†Ô∏è  FINAL DEFICIT: {no_eagle_deficit} no-eagle segments still needed!")
        print(f"   ‚ö†Ô∏è  Dataset is imbalanced by {no_eagle_deficit} segments.")
        actual_balance = (eagle_total / (eagle_total + no_eagle_total)) * 100 if (eagle_total + no_eagle_total) > 0 else 0
        print(f"      ‚Ä¢ Actual Balance: {actual_balance:.1f}% Eagle / {100-actual_balance:.1f}% No Eagle")
    else:
        if eagle_total + no_eagle_total > 0:
            eagle_pct = (eagle_total / (eagle_total + no_eagle_total)) * 100
            print(f"      ‚úÖ Perfect Balance: {eagle_pct:.1f}% Eagle / {100-eagle_pct:.1f}% No Eagle")

    # ==========================================
    # QUALITY DISTRIBUTION (per-segment 'quality' field)
    # ==========================================

    if 'quality' in eagle_df.columns:
        print(f"\n   üé® Eagle Segments by Quality:")
        quality_counts = eagle_df['quality'].value_counts()
        for quality, count in quality_counts.items():
            percentage = (count / len(eagle_df)) * 100
            print(f"      ‚Ä¢ {quality}: {count} ({percentage:.1f}%)")

    # ==========================================
    # OVERLAP GROUP ANALYSIS
    # ==========================================

    if 'group_id' in master_df.columns:
        total_groups = master_df['group_id'].nunique()
        segments_in_groups = master_df.groupby('group_id').size()
        # Groups with more than one member hold windows that overlap in time.
        overlapping_groups = segments_in_groups[segments_in_groups > 1]

        print(f"\n   üîó Overlap Group Analysis:")
        print(f"      ‚Ä¢ Total unique groups: {total_groups}")
        print(f"      ‚Ä¢ Groups with overlapping segments: {len(overlapping_groups)}")
        if len(overlapping_groups) > 0:
            max_overlap = overlapping_groups.max()
            print(f"      ‚Ä¢ Maximum segments in one group: {max_overlap}")
            print(f"      ‚ö†Ô∏è  Remember to use group_id for train/test splitting!")

    # ==========================================
    # FILE SUMMARY
    # ==========================================

    print(f"\nüìÅ Generated Files:")
    print(f"   ‚Ä¢ master_manifest.csv - Complete dataset ({len(master_df)} rows)")
    if not eagle_df.empty:
        print(f"   ‚Ä¢ master_manifest_PhilEagle.csv - Eagle only ({len(eagle_df)} rows)")
    if not no_eagle_df.empty:
        print(f"   ‚Ä¢ master_manifest_NoEagle.csv - NoEagle only ({len(no_eagle_df)} rows)")
    print(f"   ‚Ä¢ Individual file manifests - One per processed audio file")

    print("\nüí° USAGE TIPS:")
    print("   ‚Ä¢ Use master_manifest.csv for complete overview")
    print("   ‚Ä¢ Use master_manifest_PhilEagle.csv to analyze eagle call patterns")
    print("   ‚Ä¢ Use master_manifest_NoEagle.csv to verify background sound diversity")
    print("   ‚Ä¢ Always use 'group_id' for stratified train/test splitting!")

print("\nüéâ All audio files segmented successfully!")
print("="*60)