In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found,
# so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install pinned audio/ML dependencies.
# %pip (instead of !pip) guarantees the packages land in the environment of the running kernel.
%pip install librosa==0.10.1
%pip install pydub==0.25.1
%pip install soundfile==0.12.1
%pip install scikit-learn==1.3.0
%pip install tqdm

Collecting librosa==0.10.1
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Downloading librosa-0.10.1-py3-none-any.whl (253 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.7/253.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: librosa
  Attempting uninstall: librosa
    Found existing installation: librosa 0.11.0
    Uninstalling librosa-0.11.0:
      Successfully uninstalled librosa-0.11.0
Successfully installed librosa-0.10.1
Collecting soundfile==0.12.1
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl.metadata (14 kB)
Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: soundfile
  Attempting uninstall: soundfile
    Found existing installation: soundfile 0.13.1
    Uninstalling 

In [2]:
# Enhanced Audio Analysis with Advanced ML and Emotion Detection
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import find_peaks, savgol_filter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import umap
from tqdm import tqdm
import warnings
import gc
from collections import Counter
import joblib

# Audio processing libraries
from pydub import AudioSegment
import soundfile as sf

# Suppress every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Global plotting defaults: matplotlib's 'seaborn-v0_8' style sheet and seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

class AdvancedEmotionClassifier:
    """
    Rule-based emotion classifier.

    Each of eight emotion categories collects points from threshold rules over
    acoustic features; the category with the most points wins and a confidence
    value describes how clearly it beat the runner-up.
    """

    def __init__(self):
        """Define the emotion vocabulary and category weights, then print a status line."""
        self.emotion_labels = ['angry', 'happy', 'sad', 'confused', 'frustrated', 'calm', 'excited', 'neutral']
        # NOTE: stored for reference only; no method of this class reads these weights.
        self.feature_weights = dict(
            pitch=0.25,
            energy=0.20,
            rhythm=0.15,
            spectral=0.15,
            silence=0.15,
            voice_quality=0.10,
        )
        print(f"Advanced Emotion Classifier initialized with {len(self.emotion_labels)} emotion categories")

    def _calculate_confidence(self, scores):
        """Return the winner's relative lead over the runner-up, capped at 1.0.

        Args:
            scores: Mapping of emotion name to accumulated score (must be non-empty).

        Returns:
            ``(best - second_best) / best`` when the best score is positive,
            otherwise ``0``. A single-entry mapping uses 0 as the runner-up.
        """
        top = max(scores.values())
        ordered = sorted(scores.values())
        runner_up = ordered[-2] if len(ordered) > 1 else 0
        if not top > 0:
            return 0
        margin = (top - runner_up) / top
        return min(margin, 1.0)

    def _fuzzy_membership(self, value, low, medium, high):
        """Bucket ``value`` into a named band relative to three ascending thresholds.

        Returns one of 'low', 'medium_low', 'medium', 'medium_high', 'high'
        or 'very_high'. The two middle ranges are each split at their midpoint.
        """
        if value <= low:
            return 'low'
        if value <= medium:
            lower_mid = (low + medium) / 2
            if value < lower_mid:
                return 'medium_low'
            return 'medium'
        if value <= high:
            upper_mid = (medium + high) / 2
            if value < upper_mid:
                return 'medium_high'
            return 'high'
        return 'very_high'

    def classify_emotion(self, features):
        """
        Score every emotion with hand-written threshold rules and pick the best.

        Args:
            features: Nested dict ``{category: {metric: value}}``. Missing
                categories or metrics fall back to built-in defaults.

        Returns:
            Dict with 'emotion' (winning label, or 'neutral' when no rule
            fired), 'confidence' (see ``_calculate_confidence``) and 'scores'
            (the full per-emotion tally).
        """
        tally = dict.fromkeys(self.emotion_labels, 0.0)

        # Pull each feature group once, then read individual metrics with defaults.
        pitch = features.get('pitch', {})
        energy = features.get('energy', {})
        rhythm = features.get('rhythm', {})
        silence = features.get('silence', {})
        spectral = features.get('spectral', {})
        voice = features.get('voice_quality', {})

        f0_avg = pitch.get('mean', 150)
        f0_spread = pitch.get('std', 30)
        f0_span = pitch.get('range', 50)
        loudness = energy.get('rms', 0.03)
        loudness_var = energy.get('variance', 0.001)
        rate = rhythm.get('speaking_rate', 1.5)
        regularity = rhythm.get('regularity', 0.5)
        quiet_share = silence.get('ratio', 0.3)
        long_pauses = silence.get('long_pause_count', 2)
        brightness = spectral.get('centroid', 1500)
        jitter = voice.get('jitter', 0.01)
        shimmer = voice.get('shimmer', 0.05)

        # --- angry: loud, strongly varying pitch, fast, harsh/bright voice ---
        if loudness > 0.07 and f0_spread > 60:
            tally['angry'] += 3.0
        if rate > 2.2 and brightness > 1700:
            tally['angry'] += 2.0
        if jitter > 0.015 or shimmer > 0.08:
            tally['angry'] += 1.5
        if f0_span > 100:
            tally['angry'] += 1.0

        # --- frustrated: moderate energy, irregular rhythm, medium-high pitch ---
        if loudness > 0.04 and loudness < 0.07 and regularity < 0.4:
            tally['frustrated'] += 2.5
        if f0_avg > 160 and f0_avg < 200 and long_pauses > 5:
            tally['frustrated'] += 2.0
        if quiet_share > 0.3 and quiet_share < 0.5:
            tally['frustrated'] += 1.0

        # --- happy / excited: loud and high pitched; fast + regular tips it to excited ---
        if loudness > 0.06 and f0_avg > 170:
            lively = rate > 2.0 and regularity > 0.6
            if lively:
                tally['excited'] += 3.0
            else:
                tally['happy'] += 2.5
        if f0_spread > 40 and loudness_var > 0.003:
            tally['happy'] += 1.5

        # --- sad: quiet, low pitched, slow, dark spectrum ---
        if loudness < 0.03 and f0_avg < 140:
            tally['sad'] += 3.0
        if rate < 1.2 and quiet_share > 0.4:
            tally['sad'] += 2.0
        if brightness < 1300:
            tally['sad'] += 1.0

        # --- confused: many pauses, irregular rhythm, fluctuating energy ---
        if long_pauses > 8 and quiet_share > 0.45:
            tally['confused'] += 3.0
        if regularity < 0.3 and loudness_var > 0.004:
            tally['confused'] += 2.0
        if loudness > 0.02 and loudness < 0.05:
            tally['confused'] += 1.0

        # --- calm: moderate and stable ---
        if loudness > 0.02 and loudness < 0.05 and f0_spread < 40:
            tally['calm'] += 2.5
        if rate > 1.2 and rate < 1.8 and regularity > 0.5:
            tally['calm'] += 2.0
        if quiet_share > 0.2 and quiet_share < 0.35:
            tally['calm'] += 1.0

        # --- neutral: one point per metric that sits in its mid range (integer count) ---
        tally['neutral'] = sum((
            130 < f0_avg < 180,
            0.03 < loudness < 0.06,
            1.5 < rate < 2.0,
        ))

        certainty = self._calculate_confidence(tally)
        winner = max(tally, key=tally.get)
        if not tally[winner] > 0:
            # No rule produced a positive score.
            winner = 'neutral'

        return {
            'emotion': winner,
            'confidence': certainty,
            'scores': tally
        }

class MultiLevelAnomalyDetector:
    """
    Ensemble anomaly detector combining an Isolation Forest, a z-score test and
    a percentile-band test; a sample is flagged when at least two of them agree.
    """

    def __init__(self):
        """Create the unfitted Isolation Forest and feature scaler."""
        self.detectors = dict(
            isolation_forest=IsolationForest(contamination=0.15, random_state=42),
            statistical=None,    # placeholder: the z-score test is implemented inline
            pattern_based=None,  # placeholder: the percentile test is implemented inline
        )
        self.scaler = StandardScaler()
        self.is_fitted = False

    def fit(self, X, feature_names):
        """Fit the scaler and Isolation Forest and record reference statistics.

        Args:
            X: 2-D array of samples x features.
            feature_names: Names of the columns of ``X``; stored on the instance.
        """
        scaled = self.scaler.fit_transform(X)
        self.detectors['isolation_forest'].fit(scaled)

        # Reference statistics of the *scaled* training data, used by the
        # z-score and percentile tests in detect_anomalies().
        band = {
            '5': np.percentile(scaled, 5, axis=0),
            '95': np.percentile(scaled, 95, axis=0)
        }
        self.stats = {
            'mean': np.mean(scaled, axis=0),
            'std': np.std(scaled, axis=0),
            'percentiles': band
        }

        self.feature_names = feature_names
        self.is_fitted = True

    def detect_anomalies(self, X):
        """Flag anomalous rows of ``X`` with each detector and by majority vote.

        Args:
            X: 2-D array of samples x features, same columns as used in ``fit``.

        Returns:
            Dict of integer 0/1 arrays keyed 'ensemble', 'isolation_forest',
            'statistical' and 'percentile'.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Detectors must be fitted before detecting anomalies")

        scaled = self.scaler.transform(X)

        # Vote 1: Isolation Forest labels outliers with -1.
        forest_votes = (self.detectors['isolation_forest'].predict(scaled) == -1).astype(int)

        # Vote 2: any single feature more than 3 standard deviations from the mean.
        deviation = np.abs((scaled - self.stats['mean']) / (self.stats['std'] + 1e-8))
        zscore_votes = (np.max(deviation, axis=1) > 3).astype(int)

        # Vote 3: more than 30% of a row's features fall outside the 5th-95th percentile band.
        lower = self.stats['percentiles']['5']
        upper = self.stats['percentiles']['95']
        n_outside = ((scaled < lower) | (scaled > upper)).sum(axis=1)
        band_votes = np.where(n_outside > scaled.shape[1] * 0.3, 1.0, 0.0)

        # Majority vote across the three detectors.
        total_votes = forest_votes + zscore_votes + band_votes
        majority = (total_votes >= 2).astype(int)

        return {
            'ensemble': majority,
            'isolation_forest': forest_votes,
            'statistical': zscore_votes,
            'percentile': band_votes.astype(int)
        }

class CallPrioritizationML:
    """
    Call prioritisation models trained on rule-derived labels.

    Three scikit-learn estimators learn to reproduce a heuristic priority
    score: a three-level priority class, a three-level urgency class and a
    binary risk flag.
    """

    def __init__(self):
        """Create the unfitted estimators and one scaler per estimator."""
        self.models = {
            'priority_classifier': RandomForestClassifier(n_estimators=100, random_state=42),
            # Despite its name this is a classifier over the urgency levels 0/1/2.
            'urgency_regressor': RandomForestClassifier(n_estimators=100, random_state=42),
            'risk_scorer': LogisticRegression(random_state=42)
        }
        self.scalers = {model_name: StandardScaler() for model_name in self.models}
        self.is_fitted = False

    def _create_priority_labels(self, df):
        """Derive heuristic priority labels and scores from the feature table.

        Points are awarded for the detected emotion, the anomaly flag, unusually
        short or long duration, very high energy and many long pauses.

        Args:
            df: DataFrame with columns 'emotion_label', 'is_anomaly',
                'general_duration', 'energy_rms' and 'silence_long_pause_count'.

        Returns:
            ``(priority_labels, priority_scores)``: an array of
            'HIGH'/'MEDIUM'/'LOW' strings and the numeric scores behind them.
        """
        emotion_points = {
            'angry': 5, 'frustrated': 4, 'sad': 3, 'confused': 3,
            'excited': 2, 'happy': 1, 'calm': 0, 'neutral': 0
        }

        total = np.zeros(len(df))

        # Emotion contribution
        for label, points in emotion_points.items():
            total += (df['emotion_label'] == label) * points

        # Anomaly contribution
        total += df['is_anomaly'] * 3

        # Duration contribution: far below the lower quartile or far above the upper one
        q25, q75 = df['general_duration'].quantile([0.25, 0.75])
        too_short = df['general_duration'] < q25 * 0.5
        too_long = df['general_duration'] > q75 * 1.5
        total += (too_short | too_long) * 2

        # Energy and pause contributions (relative to this dataset's own quantiles)
        energy_cutoff = df['energy_rms'].quantile(0.9)
        total += (df['energy_rms'] > energy_cutoff) * 2
        pause_cutoff = df['silence_long_pause_count'].quantile(0.8)
        total += (df['silence_long_pause_count'] > pause_cutoff) * 1

        # Map scores to categories: >= 7 HIGH, >= 4 MEDIUM, otherwise LOW
        medium_or_low = np.where(total >= 4, 'MEDIUM', 'LOW')
        labels = np.where(total >= 7, 'HIGH', medium_or_low)

        return labels, total

    def fit(self, df, feature_columns):
        """Train all models on labels derived from ``df`` and print CV scores.

        Args:
            df: Feature table (see ``_create_priority_labels`` for required columns).
            feature_columns: Names of the numeric columns used as model inputs;
                missing values are replaced by 0.
        """
        X = df[feature_columns].fillna(0)

        priority_labels, priority_scores = self._create_priority_labels(df)
        # Urgency levels: 2 for score >= 6, 1 for score >= 3, otherwise 0
        urgency_labels = np.where(priority_scores >= 6, 2,
                                  np.where(priority_scores >= 3, 1, 0))
        # Binary risk flag: score >= 5
        risk_labels = (priority_scores >= 5).astype(int)

        targets = {
            'priority_classifier': priority_labels,
            'urgency_regressor': urgency_labels,
            'risk_scorer': risk_labels,
        }
        for name, estimator in self.models.items():
            X_scaled = self.scalers[name].fit_transform(X)
            if name in targets:
                estimator.fit(X_scaled, targets[name])

        self.feature_columns = feature_columns
        self.is_fitted = True

        self._evaluate_models(X, priority_labels, urgency_labels, risk_labels)

    def _evaluate_models(self, X, priority_labels, urgency_labels, risk_labels):
        """Print 3-fold cross-validation scores for each model.

        Uses the already fitted scalers to transform ``X``; accuracy is reported
        for the two forest classifiers and ROC AUC for the risk scorer.
        """
        print("\nModel Performance Evaluation:")
        print("-" * 40)

        # model name -> (target labels, scoring metric, report heading)
        plan = {
            'priority_classifier': (priority_labels, 'accuracy', "Priority Classifier Accuracy"),
            'urgency_regressor': (urgency_labels, 'accuracy', "Urgency Regressor Accuracy"),
            'risk_scorer': (risk_labels, 'roc_auc', "Risk Scorer AUC"),
        }
        for name, estimator in self.models.items():
            X_scaled = self.scalers[name].transform(X)
            if name not in plan:
                continue
            labels, metric, heading = plan[name]
            cv_scores = cross_val_score(estimator, X_scaled, labels, cv=3, scoring=metric)
            print(f"{heading}: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    def predict(self, df, feature_columns):
        """Predict with every model.

        Args:
            df: Feature table.
            feature_columns: Input columns; missing values are replaced by 0.

        Returns:
            Dict mapping each model name to its predictions, plus
            '<model name>_proba' for models exposing ``predict_proba``.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before prediction")

        X = df[feature_columns].fillna(0)

        outputs = {}
        for name, estimator in self.models.items():
            X_scaled = self.scalers[name].transform(X)
            outputs[name] = estimator.predict(X_scaled)
            if hasattr(estimator, 'predict_proba'):
                outputs[f"{name}_proba"] = estimator.predict_proba(X_scaled)

        return outputs

class EnhancedAudioAnalyzer:
    def __init__(self, audio_folder_path):
        """Set up paths, result containers and the three analysis components.

        Args:
            audio_folder_path: Directory that ``process_all_files`` scans for audio.

        Side effects:
            Creates the fixed output folder under /kaggle/working if it is missing.
        """
        # Input comes from the caller; results always go to a fixed Kaggle working folder.
        self.audio_folder = audio_folder_path
        self.output_folder = "/kaggle/working/enhanced_audio_analysis"
        os.makedirs(self.output_folder, exist_ok=True)

        # Containers filled in by the later pipeline steps.
        self.processed_data = list()
        self.features_df = None

        # Analysis components.
        self.emotion_classifier = AdvancedEmotionClassifier()
        self.anomaly_detector = MultiLevelAnomalyDetector()
        self.ml_prioritizer = CallPrioritizationML()

        # First successfully processed file, kept for timeline visualisation.
        self.sample_for_timeline = None
        
    def _create_short_filename(self, filename):
        """Create short filename for labeling"""
        base_name = os.path.splitext(filename)[0]
        if len(base_name) > 10:
            parts = base_name.split('_')
            if len(parts) > 1:
                return f"{parts[0][:5]}_{parts[1][:5]}"
            return base_name[:10]
        return base_name
    
    def convert_mp3_to_wav(self, mp3_path, wav_filename, target_sr=22050):
        """Decode an MP3 and write it as a mono WAV file in /kaggle/working/.

        Args:
            mp3_path: Path of the source MP3 file.
            wav_filename: File name (not path) of the WAV file to create.
            target_sr: Frame rate the audio is set to before export.

        Returns:
            Path of the written WAV file, or ``None`` if anything failed
            (the error is printed, not raised).
        """
        try:
            destination = os.path.join("/kaggle/working/", wav_filename)
            mono = AudioSegment.from_mp3(mp3_path).set_channels(1)
            resampled = mono.set_frame_rate(target_sr)
            resampled.export(destination, format="wav")
            return destination
        except Exception as e:
            print(f"Error converting {mp3_path}: {str(e)}")
            return None
    
    def extract_comprehensive_features(self, audio_path, sr=22050):
        """Extract comprehensive audio features including voice quality metrics.

        Args:
            audio_path: Path of the audio file handed to ``librosa.load``.
            sr: Sample rate passed to ``librosa.load``; the rate it returns is
                the one used by every later computation.

        Returns:
            ``(features, y, sr, rms)`` where ``features`` is a dict of
            per-category dicts (general, energy, spectral, pitch, rhythm,
            silence, voice_quality, temporal), ``y`` is the loaded waveform,
            ``sr`` the sample rate and ``rms`` the frame-wise RMS energy.
            If anything raises, the error is printed and
            ``(None, None, None, None)`` is returned instead.
        """
        try:
            y, sr = librosa.load(audio_path, sr=sr, res_type='kaiser_fast', dtype=np.float32)
            duration = len(y) / sr
            
            # Basic features
            # The empty placeholders fix the order of the categories in the dict
            # (and therefore the column order once the features are flattened).
            features = {
                'general': {'duration': duration, 'sample_rate': sr},
                'energy': {},
                'spectral': {},
                'pitch': {},
                'rhythm': {},
                'silence': {},
                'voice_quality': {},
                'temporal': {}
            }
            
            # Energy features (enhanced)
            # NOTE: these RMS frames (hop_length=512) are reused below for the
            # silence/rhythm and temporal analyses and are returned to the caller.
            rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
            features['energy'] = {
                'rms': np.mean(rms),
                'rms_std': np.std(rms),
                'variance': np.var(rms),
                'energy_entropy': self._calculate_entropy(rms),
                'dynamic_range': np.max(rms) - np.min(rms)
            }
            
            # Spectral features (enhanced)
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
            zcr = librosa.feature.zero_crossing_rate(y)[0]
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            
            # 'mfcc_mean' is an array (mean over axis 1), not a scalar —
            # presumably one value per MFCC coefficient; confirm against librosa's output shape.
            features['spectral'] = {
                'centroid': np.mean(spectral_centroids),
                'centroid_std': np.std(spectral_centroids),
                'bandwidth': np.mean(spectral_bandwidth),
                'rolloff': np.mean(spectral_rolloff),
                'zcr': np.mean(zcr),
                'mfcc_mean': np.mean(mfcc, axis=1),
                'spectral_flatness': np.mean(librosa.feature.spectral_flatness(y=y)[0])
            }
            
            # Pitch features (enhanced)
            # pyin searches for a fundamental frequency between 80 and 400 Hz;
            # NaN entries of f0 (presumably the unvoiced frames) are dropped.
            # voiced_probs is not used afterwards.
            f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=80, fmax=400, sr=sr)
            f0_clean = f0[~np.isnan(f0)]
            
            if len(f0_clean) > 0:
                features['pitch'] = {
                    'mean': np.mean(f0_clean),
                    'std': np.std(f0_clean),
                    'range': np.max(f0_clean) - np.min(f0_clean),
                    'voiced_ratio': np.mean(voiced_flag),
                    'pitch_slope': self._calculate_pitch_slope(f0_clean)
                }
            else:
                # No usable pitch estimate: fall back to fixed placeholder values.
                features['pitch'] = {'mean': 150, 'std': 30, 'range': 50, 'voiced_ratio': 0.5, 'pitch_slope': 0}
            
            # Voice quality features
            features['voice_quality'] = self._extract_voice_quality(y, sr, f0_clean)
            
            # Enhanced rhythm and silence analysis
            silence_features, rhythm_features = self._analyze_rhythm_and_silence(y, sr, rms)
            features['silence'].update(silence_features)
            features['rhythm'].update(rhythm_features)
            
            # Temporal features
            features['temporal'] = self._extract_temporal_features(rms, sr)
            
            return features, y, sr, rms
            
        except Exception as e:
            # Best effort: report the failure and let the caller skip this file.
            print(f"Error extracting features from {audio_path}: {str(e)}")
            return None, None, None, None
    
    def _calculate_entropy(self, signal):
        """Calculate entropy of signal"""
        hist, _ = np.histogram(signal, bins=50, density=True)
        hist = hist[hist > 0]  # Remove zero probabilities
        return -np.sum(hist * np.log2(hist))
    
    def _calculate_pitch_slope(self, f0):
        """Calculate overall pitch trend"""
        if len(f0) < 2:
            return 0
        x = np.arange(len(f0))
        slope, _, _, _, _ = stats.linregress(x, f0)
        return slope
    
    def _extract_voice_quality(self, y, sr, f0_clean):
        """Extract voice quality metrics like jitter and shimmer"""
        try:
            # Simplified jitter calculation (pitch period variability)
            if len(f0_clean) > 2:
                periods = 1 / (f0_clean + 1e-8)
                jitter = np.std(np.diff(periods)) / np.mean(periods) if np.mean(periods) > 0 else 0
            else:
                jitter = 0.01
            
            # Simplified shimmer calculation (amplitude variability)
            rms_frames = librosa.feature.rms(y=y, frame_length=1024, hop_length=256)[0]
            if len(rms_frames) > 2:
                shimmer = np.std(np.diff(rms_frames)) / np.mean(rms_frames) if np.mean(rms_frames) > 0 else 0
            else:
                shimmer = 0.05
            
            # Harmonic-to-noise ratio approximation
            harmonic, percussive = librosa.effects.hpss(y)
            hnr = np.mean(harmonic**2) / (np.mean(percussive**2) + 1e-8)
            
            return {
                'jitter': min(jitter, 0.1),  # Cap extreme values
                'shimmer': min(shimmer, 0.2),
                'hnr': min(hnr, 100)  # Cap HNR
            }
        except:
            return {'jitter': 0.01, 'shimmer': 0.05, 'hnr': 10}
    
    def _analyze_rhythm_and_silence(self, y, sr, rms):
        """Enhanced rhythm and silence analysis"""
        # Frame-based analysis
        frame_length = int(0.025 * sr)
        hop_length = int(0.01 * sr)
        
        # Voice activity detection
        silence_threshold = np.percentile(rms, 20)
        voice_frames = rms > silence_threshold
        
        # Silence features
        silence_ratio = 1 - np.mean(voice_frames)
        silence_segments = self._get_segments(~voice_frames)
        silence_durations = [(seg[1] - seg[0]) * hop_length / sr for seg in silence_segments]
        
        silence_features = {
            'ratio': silence_ratio,
            'count': len(silence_segments),
            'avg_duration': np.mean(silence_durations) if silence_durations else 0,
            'max_duration': np.max(silence_durations) if silence_durations else 0,
            'long_pause_count': sum(1 for d in silence_durations if d > 2.0),
            'total_long_pause_time': sum(d for d in silence_durations if d > 2.0)
        }
        
        # Rhythm features
        voice_segments = self._get_segments(voice_frames)
        voice_durations = [(seg[1] - seg[0]) * hop_length / sr for seg in voice_segments]
        
        rhythm_features = {
            'speaking_rate': len(voice_segments) / (len(y) / sr) if len(y) > 0 else 0,
            'avg_utterance_length': np.mean(voice_durations) if voice_durations else 0,
            'utterance_variability': np.std(voice_durations) if len(voice_durations) > 1 else 0,
            'regularity': 1 / (np.std(voice_durations) + 1e-8) if len(voice_durations) > 1 else 0.5
        }
        
        return silence_features, rhythm_features
    
    def _extract_temporal_features(self, rms, sr):
        """Extract temporal dynamics features"""
        # Energy contour analysis
        energy_smooth = savgol_filter(rms, window_length=min(21, len(rms)//2*2+1), polyorder=2)
        
        # Find peaks and valleys
        peaks, _ = find_peaks(energy_smooth, height=np.percentile(energy_smooth, 60))
        valleys, _ = find_peaks(-energy_smooth, height=-np.percentile(energy_smooth, 40))
        
        return {
            'num_peaks': len(peaks),
            'num_valleys': len(valleys),
            'peak_prominence': np.mean(energy_smooth[peaks]) if len(peaks) > 0 else 0,
            'energy_range': np.max(rms) - np.min(rms),
            'energy_trend': np.polyfit(range(len(rms)), rms, 1)[0] if len(rms) > 1 else 0
        }
    
    def _get_segments(self, binary_array):
        """Get continuous segments where binary_array is True"""
        if len(binary_array) == 0:
            return []
        
        segments = []
        start = None
        
        for i, val in enumerate(binary_array):
            if val and start is None:
                start = i
            elif not val and start is not None:
                segments.append((start, i))
                start = None
        
        if start is not None:
            segments.append((start, len(binary_array)))
        
        return segments
    
    def process_all_files(self):
        """Process all MP3 files with enhanced feature extraction"""
        mp3_files = [f for f in os.listdir(self.audio_folder) if f.endswith('.mp3')]
        if not mp3_files:
            print("No MP3 files found!"); return
            
        print(f"Processing {len(mp3_files)} MP3 files with enhanced analysis...")
        
        for mp3_file in tqdm(mp3_files, desc="Processing files"):
            mp3_path = os.path.join(self.audio_folder, mp3_file)
            wav_filename = mp3_file.replace('.mp3', '_temp.wav')
            wav_path = self.convert_mp3_to_wav(mp3_path, wav_filename)
            
            if wav_path is None:
                continue
                
            features, y, sr, rms = self.extract_comprehensive_features(wav_path)
            if features is None:
                if os.path.exists(wav_path): os.remove(wav_path)
                continue
            
            # Advanced emotion classification
            emotion_result = self.emotion_classifier.classify_emotion(features)
            
            # Flatten features for DataFrame
            flat_features = self._flatten_features(features)
            
            # Combine all data
            file_data = {
                'filename': mp3_file,
                'short_filename': self._create_short_filename(mp3_file),
                'emotion_label': emotion_result['emotion'],
                'emotion_confidence': emotion_result['confidence'],
                **flat_features
            }
            
            # Store sample for visualization
            if self.sample_for_timeline is None:
                self.sample_for_timeline = {
                    'short_filename': file_data['short_filename'],
                    'rms_frames': rms,
                    'emotion': emotion_result['emotion']
                }
            
            self.processed_data.append(file_data)
            
            # Cleanup
            if os.path.exists(wav_path): os.remove(wav_path)
            gc.collect()
        
        print(f"Successfully processed {len(self.processed_data)} files")
    
    def _flatten_features(self, features):
        """Flatten nested feature dictionary"""
        flat_features = {}
        for category, values in features.items():
            if isinstance(values, dict):
                for feature, value in values.items():
                    if isinstance(value, np.ndarray):
                        if len(value.shape) == 1:
                            for i, v in enumerate(value):
                                flat_features[f"{category}_{feature}_{i}"] = v
                        else:
                            flat_features[f"{category}_{feature}"] = np.mean(value)
                    else:
                        flat_features[f"{category}_{feature}"] = value
            else:
                flat_features[category] = values
        return flat_features
    
    def create_features_dataframe(self):
        """Create enhanced features DataFrame"""
        if not self.processed_data:
            print("No processed data available!"); return
            
        self.features_df = pd.DataFrame(self.processed_data)
        
        # Save features
        features_path = os.path.join(self.output_folder, 'enhanced_audio_features.csv')
        self.features_df.to_csv(features_path, index=False)
        print(f"Enhanced features DataFrame created with shape: {self.features_df.shape}")
        print(f"Features saved to: {features_path}")
    
    def perform_advanced_anomaly_detection(self):
        """Perform multi-level anomaly detection"""
        if self.features_df is None:
            print("Features DataFrame not created!"); return
            
        # Select numerical features for anomaly detection
        feature_columns = [col for col in self.features_df.columns 
                          if col not in ['filename', 'short_filename', 'emotion_label', 'emotion_confidence']
                          and self.features_df[col].dtype in ['float64', 'int64']]
        
        X = self.features_df[feature_columns].fillna(0).values
        
        # Fit and detect anomalies
        self.anomaly_detector.fit(X, feature_columns)
        anomaly_results = self.anomaly_detector.detect_anomalies(X)
        
        # Add anomaly results to DataFrame
        for anomaly_type, results in anomaly_results.items():
            self.features_df[f'anomaly_{anomaly_type}'] = results
        
        # Main anomaly indicator
        self.features_df['is_anomaly'] = anomaly_results['ensemble']
        
        print(f"Anomaly detection completed:")
        for anomaly_type, results in anomaly_results.items():
            count = np.sum(results)
            print(f"  - {anomaly_type}: {count} anomalies ({count/len(results)*100:.1f}%)")
    
    def train_ml_prioritization(self):
        """Train the ML prioritization models and attach their predictions.

        Fits ``self.ml_prioritizer`` on the numeric feature columns of
        ``self.features_df`` and writes these columns back to the frame:
        ``priority_level``, ``urgency_score``, ``risk_score``,
        ``priority_numeric`` and ``composite_priority``.

        The method is safe to re-run: columns produced by an earlier run of this
        method or of the clustering step are never used as model inputs.

        Prints a message and returns None when no features DataFrame exists.
        """
        if self.features_df is None:
            print("Features DataFrame not created!"); return

        # Columns that must never be model inputs: identifiers/labels, this
        # method's own outputs, and the clustering/projection outputs. Without
        # the last two groups a second call would train on the previous
        # predictions (target leakage) and produce a different feature set.
        excluded_columns = {
            'filename', 'short_filename', 'emotion_label', 'emotion_confidence',
            'priority_level', 'urgency_score', 'risk_score',
            'priority_numeric', 'composite_priority',
            'kmeans_cluster', 'dbscan_cluster',
            'pca_x', 'pca_y', 'umap_x', 'umap_y', 'tsne_x', 'tsne_y',
        }
        feature_columns = [col for col in self.features_df.columns
                           if col not in excluded_columns
                           and not col.startswith('anomaly_')
                           and self.features_df[col].dtype in ['float64', 'int64']]

        # Train the ML prioritization system, then score the same rows
        self.ml_prioritizer.fit(self.features_df, feature_columns)
        predictions = self.ml_prioritizer.predict(self.features_df, feature_columns)

        # Add predictions to DataFrame
        self.features_df['priority_level'] = predictions['priority_classifier']
        self.features_df['urgency_score'] = predictions['urgency_regressor']
        self.features_df['risk_score'] = predictions['risk_scorer']

        # Labels outside HIGH/MEDIUM/LOW map to NaN and propagate into the
        # composite score.
        priority_weights = {'HIGH': 3, 'MEDIUM': 2, 'LOW': 1}
        self.features_df['priority_numeric'] = self.features_df['priority_level'].map(priority_weights)

        # The anomaly flag is written by the anomaly-detection step; fall back
        # to a zero contribution instead of a KeyError when that step was skipped.
        if 'is_anomaly' in self.features_df.columns:
            anomaly_term = self.features_df['is_anomaly']
        else:
            print("Warning: 'is_anomaly' column not found; anomaly term set to 0 in composite priority")
            anomaly_term = 0

        # Calculate composite priority score (weights sum to 1.0)
        self.features_df['composite_priority'] = (
            self.features_df['priority_numeric'] * 0.4 +
            self.features_df['urgency_score'] * 0.3 +
            self.features_df['risk_score'] * 0.2 +
            anomaly_term * 0.1
        )

        print("ML prioritization training completed")
        print(f"Priority distribution: {self.features_df['priority_level'].value_counts().to_dict()}")
    
    def perform_clustering_analysis(self, n_clusters=4):
        """Cluster the calls and compute 2-D projections for plotting.

        Standardizes the numeric feature columns of ``self.features_df`` (NaN
        filled with 0) and adds these columns to the frame:
        ``kmeans_cluster``, ``dbscan_cluster``, ``pca_x``/``pca_y``,
        ``umap_x``/``umap_y`` and ``tsne_x``/``tsne_y``.

        Args:
            n_clusters: Requested number of K-means clusters. It is reduced to
                the number of rows when fewer rows than clusters are available,
                because KMeans cannot fit more clusters than samples.

        Prints a message and returns None when no features DataFrame exists.
        """
        if self.features_df is None:
            print("Features DataFrame not created!"); return

        # Labels/identifiers plus this method's own outputs. Excluding the
        # outputs keeps a re-run from clustering on the previous run's cluster
        # ids and projection coordinates.
        excluded_columns = {
            'filename', 'short_filename', 'emotion_label', 'emotion_confidence', 'priority_level',
            'kmeans_cluster', 'dbscan_cluster',
            'pca_x', 'pca_y', 'umap_x', 'umap_y', 'tsne_x', 'tsne_y',
        }
        feature_columns = [col for col in self.features_df.columns
                           if col not in excluded_columns
                           and not col.startswith('anomaly_')
                           and self.features_df[col].dtype in ['float64', 'int64']]

        X = self.features_df[feature_columns].fillna(0).values
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # K-Means clustering; never ask for more clusters than there are samples
        sample_count = len(X_scaled)
        effective_clusters = min(n_clusters, sample_count)
        if effective_clusters < n_clusters:
            print(f"Only {sample_count} samples available; reducing K-means clusters from {n_clusters} to {effective_clusters}")
        kmeans = KMeans(n_clusters=effective_clusters, random_state=42, n_init=10)
        self.features_df['kmeans_cluster'] = kmeans.fit_predict(X_scaled)

        # DBSCAN clustering for density-based grouping (label -1 marks noise)
        dbscan = DBSCAN(eps=0.5, min_samples=5)
        self.features_df['dbscan_cluster'] = dbscan.fit_predict(X_scaled)

        # Dimensionality reduction for visualization
        # PCA
        pca = PCA(n_components=2, random_state=42)
        pca_result = pca.fit_transform(X_scaled)
        self.features_df['pca_x'] = pca_result[:, 0]
        self.features_df['pca_y'] = pca_result[:, 1]

        # UMAP
        umap_reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
        umap_result = umap_reducer.fit_transform(X_scaled)
        self.features_df['umap_x'] = umap_result[:, 0]
        self.features_df['umap_y'] = umap_result[:, 1]

        # t-SNE; perplexity is capped just below the sample count
        tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, sample_count-1))
        tsne_result = tsne.fit_transform(X_scaled)
        self.features_df['tsne_x'] = tsne_result[:, 0]
        self.features_df['tsne_y'] = tsne_result[:, 1]

        # Count DBSCAN clusters without the noise label
        dbscan_labels = set(self.features_df['dbscan_cluster'])
        dbscan_cluster_count = len(dbscan_labels) - (1 if -1 in dbscan_labels else 0)

        print(f"Clustering analysis completed with {effective_clusters} K-means clusters")
        print(f"DBSCAN found {dbscan_cluster_count} clusters")
    
    def create_comprehensive_visualizations(self):
        """Generate every analysis figure by calling the ``_plot_*`` helpers in order.

        Applies the 'seaborn-v0_8' matplotlib style, then runs the emotion,
        priority, anomaly, clustering, feature and timeline plotters.

        Prints a message and returns None when there is no data to plot.
        """
        if self.features_df is None or self.features_df.empty:
            print("No data to visualize"); return

        # Set up the plotting style (this changes matplotlib's global style)
        plt.style.use('seaborn-v0_8')

        # 1. Enhanced Emotion Analysis
        self._plot_emotion_analysis()

        # 2. Priority and Risk Analysis
        self._plot_priority_analysis()

        # 3. Anomaly Analysis
        self._plot_anomaly_analysis()

        # 4. Clustering Visualizations
        self._plot_clustering_analysis()

        # 5. Feature Importance and Correlations
        self._plot_feature_analysis()

        # 6. Advanced Timeline Analysis
        self._plot_timeline_analysis()

        print("All visualizations created and saved")
    
    def _plot_emotion_analysis(self):
        """Draw the 2x2 emotion overview figure and save it as 'emotion_analysis.png'.

        Panels: share of each emotion label (pie), emotion confidence per label
        (box plot), duration vs. RMS energy coloured by emotion (scatter), and
        the per-emotion mean of four audio features (heatmap).
        """
        df = self.features_df
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        pie_ax, confidence_ax = axes[0,0], axes[0,1]
        scatter_ax, heatmap_ax = axes[1,0], axes[1,1]

        # Top-left: how the calls split across emotion labels
        label_counts = df['emotion_label'].value_counts()
        slice_colors = sns.color_palette("husl", len(label_counts))
        pie_ax.pie(label_counts.values, labels=label_counts.index, autopct='%1.1f%%',
                   colors=slice_colors)
        pie_ax.set_title('Emotion Distribution')

        # Top-right: spread of classifier confidence for each label
        sns.boxplot(data=df, x='emotion_label', y='emotion_confidence', ax=confidence_ax)
        confidence_ax.set_title('Emotion Confidence by Type')
        confidence_ax.tick_params(axis='x', rotation=45)

        # Bottom-left: call duration against RMS energy, coloured by emotion
        sns.scatterplot(data=df, x='general_duration', y='energy_rms',
                        hue='emotion_label', alpha=0.7, ax=scatter_ax)
        scatter_ax.set_title('Emotion Distribution: Duration vs Energy')

        # Bottom-right: mean audio features per emotion (features on the rows)
        summary_features = ['energy_rms', 'pitch_mean', 'silence_ratio', 'rhythm_speaking_rate']
        per_emotion_means = df.groupby('emotion_label')[summary_features].mean()
        sns.heatmap(per_emotion_means.T, annot=True, cmap='RdYlBu_r', ax=heatmap_ax)
        heatmap_ax.set_title('Average Features by Emotion')

        plt.tight_layout()
        figure_path = os.path.join(self.output_folder, 'emotion_analysis.png')
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.close()
    
    def _plot_priority_analysis(self):
        """Draw the 2x2 priority/risk figure and save it as 'priority_analysis.png'.

        Panels: number of calls per priority level (bar), composite priority
        per emotion (box plot), risk score histogram, and a priority-level vs.
        anomaly-flag contingency table (heatmap).
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # Priority level distribution.
        # value_counts() orders levels by frequency, so colours are looked up
        # by label; assigning them by position would paint whichever level is
        # most frequent red, regardless of its actual priority.
        priority_counts = self.features_df['priority_level'].value_counts()
        level_colors = {'HIGH': 'red', 'MEDIUM': 'orange', 'LOW': 'green'}
        bar_colors = [level_colors.get(level, 'gray') for level in priority_counts.index]
        axes[0,0].bar(priority_counts.index, priority_counts.values, color=bar_colors)
        axes[0,0].set_title('Call Priority Distribution')
        axes[0,0].set_ylabel('Number of Calls')

        # Composite priority vs emotion
        sns.boxplot(data=self.features_df, x='emotion_label', y='composite_priority', ax=axes[0,1])
        axes[0,1].set_title('Priority Score by Emotion')
        axes[0,1].tick_params(axis='x', rotation=45)

        # Risk score distribution (two bins, matching the 0/1 axis label below)
        axes[1,0].hist(self.features_df['risk_score'], bins=2, alpha=0.7, color='red', edgecolor='black')
        axes[1,0].set_title('Risk Score Distribution')
        axes[1,0].set_xlabel('Risk Score (0=Low, 1=High)')
        axes[1,0].set_ylabel('Number of Calls')

        # Priority vs Anomaly
        priority_anomaly = pd.crosstab(self.features_df['priority_level'], self.features_df['is_anomaly'])
        sns.heatmap(priority_anomaly, annot=True, fmt='d', cmap='Reds', ax=axes[1,1])
        axes[1,1].set_title('Priority Level vs Anomaly Detection')

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_folder, 'priority_analysis.png'), dpi=300, bbox_inches='tight')
        plt.close()
    
    def _plot_anomaly_analysis(self):
        """Draw the 2x2 anomaly figure and save it as 'anomaly_analysis.png'.

        Panels: anomaly count per detection method (bar), anomalies in the
        energy/pitch plane (scatter), emotion vs. anomaly contingency table
        (heatmap), and anomalies in the duration/silence plane (scatter).
        """
        df = self.features_df
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        methods_ax, energy_pitch_ax = axes[0,0], axes[0,1]
        emotion_ax, duration_silence_ax = axes[1,0], axes[1,1]

        # Top-left: how many rows each detector flagged
        detector_names = ['isolation_forest', 'statistical', 'percentile', 'ensemble']
        flagged_totals = []
        for detector in detector_names:
            flagged_totals.append(df[f'anomaly_{detector}'].sum())

        methods_ax.bar(detector_names, flagged_totals, color=['blue', 'green', 'orange', 'red'])
        methods_ax.set_title('Anomaly Detection Method Comparison')
        methods_ax.set_ylabel('Number of Anomalies')
        methods_ax.tick_params(axis='x', rotation=45)

        # Top-right: flagged calls highlighted in energy/pitch space
        sns.scatterplot(data=df, x='energy_rms', y='pitch_mean',
                        hue='is_anomaly', palette={0: 'blue', 1: 'red'}, alpha=0.7,
                        ax=energy_pitch_ax)
        energy_pitch_ax.set_title('Anomalies in Energy-Pitch Space')

        # Bottom-left: anomaly flag counts broken down by emotion label
        emotion_by_flag = pd.crosstab(df['emotion_label'], df['is_anomaly'])
        sns.heatmap(emotion_by_flag, annot=True, fmt='d', cmap='Blues', ax=emotion_ax)
        emotion_ax.set_title('Anomalies by Emotion Type')

        # Bottom-right: flagged calls highlighted in duration/silence space
        sns.scatterplot(data=df, x='general_duration', y='silence_ratio',
                        hue='is_anomaly', palette={0: 'lightblue', 1: 'red'}, alpha=0.7,
                        ax=duration_silence_ax)
        duration_silence_ax.set_title('Anomalies: Duration vs Silence Ratio')

        plt.tight_layout()
        figure_path = os.path.join(self.output_folder, 'anomaly_analysis.png')
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.close()
    
    def _plot_clustering_analysis(self):
        """Draw the 2x2 clustering figure and save it as 'clustering_analysis.png'.

        Three panels show the UMAP, PCA and t-SNE projections coloured by
        K-means cluster; the fourth is a heatmap of mean audio features per
        cluster.
        """
        df = self.features_df
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # One scatter per 2-D projection, all coloured by the K-means label
        projection_panels = [
            (axes[0,0], 'umap_x', 'umap_y', 'UMAP Projection with K-Means Clusters'),
            (axes[0,1], 'pca_x', 'pca_y', 'PCA Projection with K-Means Clusters'),
            (axes[1,0], 'tsne_x', 'tsne_y', 't-SNE Projection with K-Means Clusters'),
        ]
        for panel, x_column, y_column, panel_title in projection_panels:
            sns.scatterplot(data=df, x=x_column, y=y_column,
                            hue='kmeans_cluster', palette='viridis', alpha=0.7, ax=panel)
            panel.set_title(panel_title)

        # Mean audio features per cluster (features on the rows)
        profile_features = ['energy_rms', 'pitch_mean', 'silence_ratio', 'rhythm_speaking_rate']
        per_cluster_means = df.groupby('kmeans_cluster')[profile_features].mean()
        sns.heatmap(per_cluster_means.T, annot=True, cmap='RdYlBu_r', ax=axes[1,1])
        axes[1,1].set_title('Average Features by Cluster')

        plt.tight_layout()
        figure_path = os.path.join(self.output_folder, 'clustering_analysis.png')
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.close()
    
    def _plot_feature_analysis(self):
        """Draw the 2x2 feature figure and save it as 'feature_analysis.png'.

        The top-left panel holds a correlation heatmap of seven audio features
        and is left blank when any of those columns is missing. The other three
        panels are box plots of energy, pitch and silence ratio per emotion.
        """
        df = self.features_df
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # Correlation heatmap, drawn only when every expected column exists
        correlation_columns = ['energy_rms', 'pitch_mean', 'pitch_std', 'silence_ratio',
                               'rhythm_speaking_rate', 'spectral_centroid', 'voice_quality_jitter']
        missing_columns = [name for name in correlation_columns if name not in df.columns]
        if not missing_columns:
            correlations = df[correlation_columns].corr()
            sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0, ax=axes[0,0])
            axes[0,0].set_title('Feature Correlation Matrix')

        # Per-emotion box plots fill the three remaining panels in reading order
        boxplot_panels = (
            (axes[0, 1], 'energy_rms'),
            (axes[1, 0], 'pitch_mean'),
            (axes[1, 1], 'silence_ratio'),
        )
        for panel, feature_name in boxplot_panels:
            sns.boxplot(data=df, x='emotion_label', y=feature_name, ax=panel)
            panel.set_title(f'{feature_name} by Emotion')
            panel.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        figure_path = os.path.join(self.output_folder, 'feature_analysis.png')
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.close()
    
    def _plot_timeline_analysis(self):
        """Plot sample timeline analysis"""
        if self.sample_for_timeline:
            fig, axes = plt.subplots(2, 1, figsize=(16, 10))
            
            sample = self.sample_for_timeline
            time_frames = np.arange(len(sample['rms_frames'])) * 0.01
            
            # Energy timeline
            axes[0].plot(time_frames, sample['rms_frames'], alpha=0.8, linewidth=1.5, color='blue')
            axes[0].set_title(f"Sample Audio Timeline: {sample['short_filename']} (Emotion: {sample['emotion']})")
            axes[0].set_ylabel('Energy (RMS)')
            axes[0].grid(True, alpha=0.3)
            
            # Smoothed energy with peaks
            smooth_rms = savgol_filter(sample['rms_frames'], 
                                     window_length=min(21, len(sample['rms_frames'])//2*2+1), 
                                     polyorder=2)
            peaks, _ = find_peaks(smooth_rms, height=np.percentile(smooth_rms, 70))
            
            axes[1].plot(time_frames, smooth_rms, color='green', linewidth=2, label='Smoothed Energy')
            axes[1].scatter(time_frames[peaks], smooth_rms[peaks], color='red', s=50, 
                          label=f'Peaks ({len(peaks)})', zorder=5)
            axes[1].set_xlabel('Time (seconds)')
            axes[1].set_ylabel('Smoothed Energy')
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)
            
            plt.tight_layout()
            plt.savefig(os.path.join(self.output_folder, 'timeline_analysis.png'), dpi=300, bbox_inches='tight')
            plt.close()
    
    def generate_comprehensive_report(self):
        """Write a plain-text analysis report to the output folder.

        The report ('comprehensive_analysis_report.txt') contains summary
        statistics, emotion and priority breakdowns, the anomaly count, the ten
        calls with the highest composite priority, and rule-based
        recommendations.

        Prints a message and returns None when there is no data.
        """
        if self.features_df is None or self.features_df.empty:
            print("No data available for report generation")
            return

        report_path = os.path.join(self.output_folder, 'comprehensive_analysis_report.txt')
        total_calls = len(self.features_df)

        # The report contains emoji, so the encoding is fixed to UTF-8; the
        # platform default (e.g. cp1252) cannot encode them and would raise
        # UnicodeEncodeError part-way through writing.
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write("COMPREHENSIVE CALL CENTER AUDIO ANALYSIS REPORT\n")
            f.write("="*80 + "\n\n")

            # Summary statistics
            f.write("SUMMARY STATISTICS\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total calls analyzed: {total_calls}\n")
            f.write(f"Average call duration: {self.features_df['general_duration'].mean():.2f} seconds\n")
            f.write(f"Duration range: {self.features_df['general_duration'].min():.1f} - {self.features_df['general_duration'].max():.1f} seconds\n\n")

            # Emotion analysis
            f.write("EMOTION ANALYSIS\n")
            f.write("-" * 40 + "\n")
            emotion_stats = self.features_df['emotion_label'].value_counts()
            for emotion, count in emotion_stats.items():
                percentage = (count / total_calls) * 100
                f.write(f"{emotion.capitalize()}: {count} calls ({percentage:.1f}%)\n")

            avg_confidence = self.features_df['emotion_confidence'].mean()
            f.write(f"\nAverage emotion confidence: {avg_confidence:.3f}\n\n")

            # Priority analysis
            f.write("PRIORITY ANALYSIS\n")
            f.write("-" * 40 + "\n")
            priority_stats = self.features_df['priority_level'].value_counts()
            for priority, count in priority_stats.items():
                percentage = (count / total_calls) * 100
                f.write(f"{priority} priority: {count} calls ({percentage:.1f}%)\n")

            high_priority_calls = self.features_df[self.features_df['priority_level'] == 'HIGH']
            f.write(f"\nHigh priority calls breakdown:\n")
            if not high_priority_calls.empty:
                high_priority_emotions = high_priority_calls['emotion_label'].value_counts()
                for emotion, count in high_priority_emotions.items():
                    f.write(f"  - {emotion}: {count} calls\n")

            # Anomaly analysis
            f.write(f"\nANOMALY ANALYSIS\n")
            f.write("-" * 40 + "\n")
            total_anomalies = self.features_df['is_anomaly'].sum()
            anomaly_percentage = (total_anomalies / total_calls) * 100
            f.write(f"Total anomalies detected: {total_anomalies} ({anomaly_percentage:.1f}%)\n")

            # Top priority calls for review (at most ten, highest score first)
            f.write(f"\nTOP PRIORITY CALLS FOR MANUAL REVIEW\n")
            f.write("-" * 40 + "\n")
            top_priority = self.features_df.nlargest(10, 'composite_priority')

            for idx, (_, call) in enumerate(top_priority.iterrows(), 1):
                f.write(f"{idx}. {call['filename']}\n")
                f.write(f"   Priority: {call['priority_level']}, Emotion: {call['emotion_label']}\n")
                f.write(f"   Duration: {call['general_duration']:.1f}s, Composite Score: {call['composite_priority']:.2f}\n")
                f.write(f"   Anomaly: {'Yes' if call['is_anomaly'] else 'No'}\n\n")

            # Recommendations
            f.write("RECOMMENDATIONS\n")
            f.write("-" * 40 + "\n")

            # Identify patterns: each warning fires when an emotion's share of
            # all calls exceeds its threshold (15% / 20% / 25%).
            angry_calls = len(self.features_df[self.features_df['emotion_label'] == 'angry'])
            frustrated_calls = len(self.features_df[self.features_df['emotion_label'] == 'frustrated'])
            confused_calls = len(self.features_df[self.features_df['emotion_label'] == 'confused'])

            if angry_calls > total_calls * 0.15:
                f.write("⚠️  High percentage of angry calls detected - consider agent training on de-escalation\n")

            if frustrated_calls > total_calls * 0.2:
                f.write("⚠️  High frustration levels - review common issues and improve FAQ/knowledge base\n")

            if confused_calls > total_calls * 0.25:
                f.write("⚠️  Many confused customers - consider improving call routing and agent preparation\n")

            # "Exceptionally long" means strictly above the 90th percentile of duration
            long_calls = len(self.features_df[self.features_df['general_duration'] >
                                           self.features_df['general_duration'].quantile(0.9)])
            if long_calls > 0:
                f.write(f"📈 {long_calls} exceptionally long calls detected - investigate for efficiency improvements\n")

            f.write(f"\n✅ Prioritize manual review of {len(top_priority)} highest-scoring calls\n")
            f.write(f"✅ Focus quality assurance on {total_anomalies} anomalous calls\n")
            f.write(f"✅ Consider additional agent training based on emotion distribution patterns\n")

        print(f"Comprehensive report generated: {report_path}")
    
    def save_results(self):
        """Persist the analysis outputs to ``self.output_folder``.

        Writes three files: the full features table as CSV, a CSV of the calls
        that are HIGH priority or flagged as anomalies (highest composite
        priority first), and the fitted prioritizer serialized with joblib.

        Prints a message and returns None when there are no results.
        """
        df = self.features_df
        if df is None:
            print("No results to save")
            return

        out_dir = self.output_folder
        results_path = os.path.join(out_dir, 'final_analysis_results.csv')
        priority_path = os.path.join(out_dir, 'priority_calls_for_review.csv')
        model_path = os.path.join(out_dir, 'ml_prioritizer.pkl')

        # Full table with every computed column
        df.to_csv(results_path, index=False)

        # Calls needing immediate attention: HIGH priority or anomalous
        is_high = df['priority_level'] == 'HIGH'
        is_flagged = df['is_anomaly'] == 1
        review_queue = df[is_high | is_flagged].sort_values('composite_priority', ascending=False)

        review_columns = ['filename', 'emotion_label', 'priority_level', 'composite_priority',
                          'is_anomaly', 'general_duration']
        review_queue[review_columns].to_csv(priority_path, index=False)

        # Fitted prioritizer, kept for later reuse
        joblib.dump(self.ml_prioritizer, model_path)

        print(f"Results saved:")
        for label, path in (("Full analysis", results_path),
                            ("Priority calls", priority_path),
                            ("ML model", model_path)):
            print(f"  - {label}: {path}")

# Main execution function
def main():
    """Run the whole audio analysis pipeline from loading to saved outputs.

    Steps, in order: process the audio files, build the features DataFrame,
    detect anomalies, train the prioritization models, cluster, plot, write
    the text report and save the results.

    Returns:
        The analyzer instance, or None when no file was processed successfully.
    """
    AUDIO_FOLDER = "/kaggle/input/asdfggh/Audio file"
    N_CLUSTERS = 4
    banner = "="*60

    print("Starting Enhanced Audio Analysis Pipeline...")
    print(banner)

    analyzer = EnhancedAudioAnalyzer(AUDIO_FOLDER)

    # Stage 1: feature extraction; stop when nothing could be processed
    analyzer.process_all_files()
    if not analyzer.processed_data:
        print("No files were processed successfully!")
        return None

    # Stage 2: tabular features, anomaly flags and priority models
    analyzer.create_features_dataframe()
    analyzer.perform_advanced_anomaly_detection()
    analyzer.train_ml_prioritization()

    # Stage 3: clustering and 2-D projections
    analyzer.perform_clustering_analysis(n_clusters=N_CLUSTERS)

    # Stage 4: figures, text report and persisted artifacts
    analyzer.create_comprehensive_visualizations()
    analyzer.generate_comprehensive_report()
    analyzer.save_results()

    print("\n" + banner)
    print("ENHANCED AUDIO ANALYSIS COMPLETE!")
    print(f"All outputs saved to: {analyzer.output_folder}")
    print(banner)

    return analyzer

# Run the enhanced analysis
if __name__ == "__main__":
    # Entry point: run the pipeline and keep the analyzer in the module
    # namespace so later notebook cells can inspect it.
    try:
        analyzer = main()
        # main() returns None when no audio file could be processed
        if analyzer:
            print("\nAnalysis pipeline completed successfully!")
            print(f"Processed {len(analyzer.processed_data)} audio files")
            print("Check the output folder for detailed results and visualizations")
    except Exception as e:
        # Report the failure with a full traceback instead of letting the
        # exception propagate out of the cell.
        print(f"\nAn error occurred during execution: {e}")
        import traceback
        traceback.print_exc()

2025-06-20 09:22:58.892832: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750411379.062884      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750411379.114008      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Starting Enhanced Audio Analysis Pipeline...
Advanced Emotion Classifier initialized with 8 emotion categories
Processing 10 MP3 files with enhanced analysis...


Processing files: 100%|██████████| 10/10 [07:34<00:00, 45.48s/it]


Successfully processed 10 files
Enhanced features DataFrame created with shape: (10, 53)
Features saved to: /kaggle/working/enhanced_audio_analysis/enhanced_audio_features.csv
Anomaly detection completed:
  - ensemble: 1 anomalies (10.0%)
  - isolation_forest: 2 anomalies (20.0%)
  - statistical: 0 anomalies (0.0%)
  - percentile: 2 anomalies (20.0%)

Model Performance Evaluation:
----------------------------------------
Priority Classifier Accuracy: 0.694 (+/- 0.079)
Urgency Regressor Accuracy: 0.694 (+/- 0.550)
Risk Scorer AUC: nan (+/- nan)
ML prioritization training completed
Priority distribution: {'MEDIUM': 6, 'LOW': 2, 'HIGH': 2}
Clustering analysis completed with 4 K-means clusters
DBSCAN found 0 clusters
All visualizations created and saved
Comprehensive report generated: /kaggle/working/enhanced_audio_analysis/comprehensive_analysis_report.txt
Results saved:
  - Full analysis: /kaggle/working/enhanced_audio_analysis/final_analysis_results.csv
  - Priority calls: /kaggle/worki