# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import chi2_contingency, shapiro, kstest
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import f_classif, f_regression, chi2, mutual_info_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import warnings
import logging
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Union
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Configure settings
# Suppress every warning category for the whole process so that library
# warnings do not clutter the analysis output.
warnings.filterwarnings('ignore')
# Apply the matplotlib 'seaborn-v0_8' style sheet and seaborn's "husl"
# colour palette to all plots created after this point.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Setup logging
# Configure the root logger to report INFO and above, formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

class AdvancedEDA:
    """
    Professional-grade Exploratory Data Analysis Framework

    Features:
    - Comprehensive data profiling and quality assessment
    - Advanced statistical analysis and hypothesis testing
    - Multiple outlier detection methods
    - Multicollinearity analysis with VIF
    - Feature importance and selection
    - Interactive visualizations
    - Automated insights and recommendations
    """

    def __init__(self, alpha: float = 0.05, figsize: Tuple[int, int] = (12, 8)):
        """
        Initialize the EDA framework

        Args:
            alpha: Statistical significance level
            figsize: Default figure size for plots
        """
        self.alpha = alpha
        self.figsize = figsize
        self.data = None
        self.target_column = None
        self.numerical_cols = []
        self.categorical_cols = []
        self.results = {}

        # Create output directory
        self.output_dir = Path("eda_output")
        self.output_dir.mkdir(exist_ok=True)
        (self.output_dir / "plots").mkdir(exist_ok=True)

        logger.info("Advanced EDA Framework initialized successfully")

    def load_data(self, data_source: Union[str, pd.DataFrame], target_column: str = None) -> pd.DataFrame:
        """
        Load and validate dataset

        Args:
            data_source: File path or DataFrame
            target_column: Name of target variable

        Returns:
            Loaded and validated DataFrame
        """
        try:
            if isinstance(data_source, str):
                if data_source.endswith('.csv'):
                    self.data = pd.read_csv(data_source)
                elif data_source.endswith(('.xlsx', '.xls')):
                    self.data = pd.read_excel(data_source)
                else:
                    raise ValueError("Unsupported file format. Use CSV or Excel files.")
            elif isinstance(data_source, pd.DataFrame):
                self.data = data_source.copy()
            else:
                raise ValueError("Data source must be file path or pandas DataFrame")

            self.target_column = target_column
            self._identify_column_types()

            logger.info(f"Dataset loaded: {self.data.shape} | Target: {target_column}")
            return self.data

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def _identify_column_types(self):
        """Automatically identify numerical and categorical columns"""
        if self.data is None:
            raise ValueError("No data loaded")

        # Identify column types
        self.numerical_cols = self.data.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = self.data.select_dtypes(include=['object', 'category']).columns.tolist()

        # Remove target from feature lists
        if self.target_column:
            if self.target_column in self.numerical_cols:
                self.numerical_cols.remove(self.target_column)
            elif self.target_column in self.categorical_cols:
                self.categorical_cols.remove(self.target_column)

        logger.info(f"Columns identified - Numerical: {len(self.numerical_cols)}, Categorical: {len(self.categorical_cols)}")

    def data_overview(self) -> Dict:
        """
        Generate comprehensive data overview

        Returns:
            Dictionary with data profile information
        """
        if self.data is None:
            raise ValueError("No data loaded")

        # Basic information
        overview = {
            'shape': self.data.shape,
            'memory_usage_mb': self.data.memory_usage(deep=True).sum() / (1024**2),
            'missing_values': self.data.isnull().sum().to_dict(),
            'missing_percentage': (self.data.isnull().sum() / len(self.data) * 100).round(2).to_dict(),
            'duplicates': self.data.duplicated().sum(),
            'duplicate_percentage': (self.data.duplicated().sum() / len(self.data) * 100).round(2)
        }

        # Data types summary
        overview['column_types'] = {
            'numerical': self.numerical_cols,
            'categorical': self.categorical_cols,
            'target': self.target_column
        }

        # Statistical summary for numerical columns
        if self.numerical_cols:
            overview['numerical_summary'] = self.data[self.numerical_cols].describe().round(3)

        # Categorical summary
        if self.categorical_cols:
            cat_summary = {}
            for col in self.categorical_cols:
                cat_summary[col] = {
                    'unique_count': self.data[col].nunique(),
                    'top_categories': self.data[col].value_counts().head().to_dict()
                }
            overview['categorical_summary'] = cat_summary

        self.results['data_overview'] = overview

        # Print summary
        print("=" * 60)
        print("📊 DATA OVERVIEW SUMMARY")
        print("=" * 60)
        print(f"Dataset Shape: {overview['shape']}")
        print(f"Memory Usage: {overview['memory_usage_mb']:.2f} MB")
        print(f"Missing Values: {sum(overview['missing_values'].values())} cells")
        print(f"Duplicate Rows: {overview['duplicates']} ({overview['duplicate_percentage']:.1f}%)")
        print(f"Numerical Columns: {len(self.numerical_cols)}")
        print(f"Categorical Columns: {len(self.categorical_cols)}")

        if sum(overview['missing_values'].values()) > 0:
            print("\n📋 Missing Data by Column:")
            for col, missing in overview['missing_values'].items():
                if missing > 0:
                    pct = overview['missing_percentage'][col]
                    print(f"  {col}: {missing} ({pct:.1f}%)")

        return overview

    def analyze_distributions(self, save_plots: bool = True) -> Dict:
        """
        Comprehensive distribution analysis with statistical tests

        Args:
            save_plots: Whether to save plots

        Returns:
            Dictionary with distribution analysis results
        """
        distribution_results = {}

        # Numerical distributions
        if self.numerical_cols:
            print("\n📈 ANALYZING NUMERICAL DISTRIBUTIONS...")

            # Create subplots
            n_cols = min(3, len(self.numerical_cols))
            n_rows = (len(self.numerical_cols) + n_cols - 1) // n_cols

            fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4))
            if n_rows == 1 and n_cols == 1:
                axes = [axes]
            elif n_rows == 1:
                axes = axes
            else:
                axes = axes.flatten()

            for i, col in enumerate(self.numerical_cols):
                # Distribution plot
                sns.histplot(data=self.data, x=col, kde=True, ax=axes[i])
                axes[i].set_title(f'Distribution of {col}')

                # Add statistics
                mean_val = self.data[col].mean()
                median_val = self.data[col].median()
                axes[i].axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.2f}')
                axes[i].axvline(median_val, color='green', linestyle='--', alpha=0.7, label=f'Median: {median_val:.2f}')
                axes[i].legend()

                # Statistical analysis
                col_data = self.data[col].dropna()
                skewness = stats.skew(col_data)
                kurtosis = stats.kurtosis(col_data)

                # Normality test
                if len(col_data) > 3:
                    if len(col_data) <= 5000:
                        stat, p_val = shapiro(col_data)
                        test_name = "Shapiro-Wilk"
                    else:
                        stat, p_val = kstest(col_data, 'norm')
                        test_name = "Kolmogorov-Smirnov"

                    is_normal = p_val > self.alpha
                else:
                    stat, p_val, is_normal, test_name = np.nan, np.nan, None, "Insufficient data"

                distribution_results[col] = {
                    'mean': mean_val,
                    'median': median_val,
                    'std': self.data[col].std(),
                    'skewness': skewness,
                    'kurtosis': kurtosis,
                    'normality_test': test_name,
                    'normality_p_value': p_val,
                    'is_normal': is_normal
                }

                print(f"  {col}: Skew={skewness:.3f}, Kurt={kurtosis:.3f}, Normal={is_normal} (p={p_val:.4f})")

            # Hide unused subplots
            for i in range(len(self.numerical_cols), len(axes)):
                axes[i].set_visible(False)

            plt.tight_layout()
            if save_plots:
                plt.savefig(self.output_dir / "plots" / "numerical_distributions.png", dpi=300, bbox_inches='tight')
            plt.show()

        # Categorical distributions
        if self.categorical_cols:
            print("\n📊 ANALYZING CATEGORICAL DISTRIBUTIONS...")

            for col in self.categorical_cols:
                plt.figure(figsize=self.figsize)

                # Get top categories (limit to 20 for readability)
                value_counts = self.data[col].value_counts()
                top_categories = value_counts.head(20)

                # Create horizontal bar plot
                sns.barplot(y=top_categories.index, x=top_categories.values, orient='h')
                plt.title(f'Distribution of {col}')
                plt.xlabel('Count')

                # Add percentage labels
                total = len(self.data)
                for i, (category, count) in enumerate(top_categories.items()):
                    try:
                        percentage = (count / total) * 100
                        plt.text(count + total*0.01, i, f'{percentage:.1f}%', va='center')
                    except (TypeError, ZeroDivisionError):
                        continue

                plt.tight_layout()
                if save_plots:
                    plt.savefig(self.output_dir / "plots" / f"categorical_{col}.png", dpi=300, bbox_inches='tight')
                plt.show()

                # Store results
                distribution_results[col] = {
                    'unique_values': self.data[col].nunique(),
                    'most_frequent': value_counts.index[0],
                    'frequency': value_counts.iloc[0],
                    'frequency_percentage': (value_counts.iloc[0] / total * 100).round(2)
                }

                print(f"  {col}: {self.data[col].nunique()} unique values, Top: {value_counts.index[0]} ({value_counts.iloc[0]/total*100:.1f}%)")

        self.results['distributions'] = distribution_results
        return distribution_results

    def correlation_analysis(self, method: str = 'pearson', plot_heatmap: bool = True) -> pd.DataFrame:
        """
        Advanced correlation analysis with statistical significance

        Args:
            method: Correlation method ('pearson', 'spearman', 'kendall')
            plot_heatmap: Whether to plot correlation heatmap

        Returns:
            Correlation matrix
        """
        if not self.numerical_cols:
            print("⚠️ No numerical columns for correlation analysis")
            return pd.DataFrame()

        print(f"\n🔗 CORRELATION ANALYSIS ({method.upper()})...")

        # Calculate correlation matrix
        corr_matrix = self.data[self.numerical_cols].corr(method=method)

        if plot_heatmap:
            plt.figure(figsize=(max(8, len(self.numerical_cols)), max(6, len(self.numerical_cols))))

            # Create mask for upper triangle
            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

            # Generate heatmap
            sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='RdBu_r', center=0,
                       square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, fmt='.2f')

            plt.title(f'{method.capitalize()} Correlation Matrix')
            plt.tight_layout()
            plt.savefig(self.output_dir / "plots" / f"correlation_{method}.png", dpi=300, bbox_inches='tight')
            plt.show()

        # Find high correlations
        high_corr_pairs = []
        threshold = 0.7

        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > threshold:
                    high_corr_pairs.append({
                        'var1': corr_matrix.columns[i],
                        'var2': corr_matrix.columns[j],
                        'correlation': corr_val
                    })

        if high_corr_pairs:
            print(f"🔍 High correlations found (|r| > {threshold}):")
            for pair in high_corr_pairs:
                print(f"  {pair['var1']} ↔ {pair['var2']}: {pair['correlation']:.3f}")
        else:
            print(f"✅ No high correlations detected (threshold: {threshold})")

        self.results['correlation_matrix'] = corr_matrix
        self.results['high_correlations'] = high_corr_pairs

        return corr_matrix

    def outlier_detection(self, methods: List[str] = ['iqr', 'zscore', 'isolation']) -> Dict:
        """
        Multi-method outlier detection

        Args:
            methods: List of detection methods

        Returns:
            Dictionary with outlier detection results
        """
        print(f"\n🎯 OUTLIER DETECTION using {', '.join(methods).upper()}...")

        outlier_results = {}

        for col in self.numerical_cols:
            outlier_results[col] = {}
            col_data = self.data[col].dropna()

            for method in methods:
                if method == 'iqr':
                    Q1 = col_data.quantile(0.25)
                    Q3 = col_data.quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - 1.5 * IQR
                    upper_bound = Q3 + 1.5 * IQR
                    outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]

                elif method == 'zscore':
                    z_scores = np.abs(stats.zscore(col_data))
                    outliers = col_data[z_scores > 3]

                elif method == 'isolation':
                    iso_forest = IsolationForest(contamination=0.1, random_state=42)
                    outlier_pred = iso_forest.fit_predict(col_data.values.reshape(-1, 1))
                    outliers = col_data[outlier_pred == -1]

                outlier_results[col][method] = {
                    'count': len(outliers),
                    'percentage': (len(outliers) / len(col_data)) * 100,
                    'indices': outliers.index.tolist()
                }

        # Visualization
        if self.numerical_cols:
            n_cols = min(3, len(self.numerical_cols))
            n_rows = (len(self.numerical_cols) + n_cols - 1) // n_cols

            fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4))
            if n_rows == 1 and n_cols == 1:
                axes = [axes]
            elif n_rows == 1:
                axes = axes
            else:
                axes = axes.flatten()

            for i, col in enumerate(self.numerical_cols):
                sns.boxplot(data=self.data, y=col, ax=axes[i])
                outlier_count = outlier_results[col]['iqr']['count']
                outlier_pct = outlier_results[col]['iqr']['percentage']
                axes[i].set_title(f'{col}\nOutliers: {outlier_count} ({outlier_pct:.1f}%)')

            # Hide unused subplots
            for i in range(len(self.numerical_cols), len(axes)):
                axes[i].set_visible(False)

            plt.tight_layout()
            plt.savefig(self.output_dir / "plots" / "outlier_detection.png", dpi=300, bbox_inches='tight')
            plt.show()

        # Summary
        print("📋 Outlier Detection Summary:")
        for col in self.numerical_cols:
            results = outlier_results[col]
            print(f"  {col}:")
            for method in methods:
                count = results[method]['count']
                pct = results[method]['percentage']
                print(f"    {method.upper()}: {count} outliers ({pct:.1f}%)")

        self.results['outliers'] = outlier_results
        return outlier_results

    def multicollinearity_check(self) -> pd.DataFrame:
        """
        Variance Inflation Factor (VIF) analysis for multicollinearity

        Returns:
            DataFrame with VIF values
        """
        if len(self.numerical_cols) < 2:
            print("⚠️ Need at least 2 numerical columns for VIF analysis")
            return pd.DataFrame()

        print("\n🔍 MULTICOLLINEARITY ANALYSIS (VIF)...")

        # Prepare data
        X = self.data[self.numerical_cols].dropna()
        X_with_const = add_constant(X)

        # Calculate VIF
        vif_data = pd.DataFrame()
        vif_data['Feature'] = X_with_const.columns
        vif_data['VIF'] = [variance_inflation_factor(X_with_const.values, i)
                          for i in range(X_with_const.shape[1])]

        vif_data = vif_data.sort_values('VIF', ascending=False)

        # Visualization
        plt.figure(figsize=self.figsize)
        features_to_plot = vif_data[vif_data['Feature'] != 'const']  # Exclude constant
        sns.barplot(data=features_to_plot, y='Feature', x='VIF')
        plt.axvline(x=5, color='red', linestyle='--', label='VIF = 5 (Threshold)')
        plt.axvline(x=10, color='orange', linestyle='--', label='VIF = 10 (High)')
        plt.title('Variance Inflation Factor (VIF) Analysis')
        plt.xlabel('VIF Score')
        plt.legend()
        plt.tight_layout()
        plt.savefig(self.output_dir / "plots" / "vif_analysis.png", dpi=300, bbox_inches='tight')
        plt.show()

        # Analysis
        high_vif = features_to_plot[features_to_plot['VIF'] > 5]
        if not high_vif.empty:
            print("⚠️ High multicollinearity detected (VIF > 5):")
            for _, row in high_vif.iterrows():
                print(f"  {row['Feature']}: VIF = {row['VIF']:.2f}")

            print("\n💡 Recommendations:")
            print("  • Remove highly correlated features")
            print("  • Apply Principal Component Analysis (PCA)")
            print("  • Use regularization techniques (Ridge/Lasso)")
        else:
            print("✅ No multicollinearity issues detected (all VIF < 5)")

        self.results['vif_analysis'] = vif_data
        return vif_data

    def bivariate_analysis(self) -> Dict:
        """
        Comprehensive bivariate analysis with target variable

        Returns:
            Dictionary with bivariate analysis results
        """
        if not self.target_column:
            print("⚠️ No target column specified for bivariate analysis")
            return {}

        print(f"\n🔬 BIVARIATE ANALYSIS with target: {self.target_column}")

        # Validate target column
        if self.target_column not in self.data.columns:
            print(f"❌ Target column '{self.target_column}' not found in data")
            return {}

        bivariate_results = {}
        target_data = self.data[self.target_column]

        # Ensure target column exists and has valid data
        if target_data.empty or target_data.isnull().all():
            print("❌ Target column is empty or all null values")
            return bivariate_results

        target_type = 'categorical' if self.target_column in self.categorical_cols else 'numerical'
        print(f"Target type detected: {target_type}")

        # Numerical features vs target
        for col in self.numerical_cols:
            try:
                # Ensure column data is numerical
                col_data = pd.to_numeric(self.data[col], errors='coerce')
                if col_data.isnull().all():
                    print(f"  Skipping {col}: Cannot convert to numerical data")
                    continue

                if target_type == 'categorical':
                    # Box plots and statistical tests
                    plt.figure(figsize=self.figsize)
                    sns.boxplot(data=self.data, x=self.target_column, y=col)
                    plt.title(f'{col} by {self.target_column}')
                    plt.xticks(rotation=45)
                    plt.tight_layout()
                    plt.savefig(self.output_dir / "plots" / f"bivariate_{col}_vs_{self.target_column}.png",
                               dpi=300, bbox_inches='tight')
                    plt.show()

                    # Statistical test
                    try:
                        groups = []
                        for name, group in self.data.groupby(self.target_column):
                            group_data = group[col].dropna()
                            if len(group_data) > 0:
                                groups.append(group_data)

                        if len(groups) == 2 and all(len(g) > 0 for g in groups):
                            stat, p_val = stats.ttest_ind(groups[0], groups[1], equal_var=False)
                            test_name = "Welch's t-test"
                        elif len(groups) > 2 and all(len(g) > 0 for g in groups):
                            stat, p_val = stats.f_oneway(*groups)
                            test_name = "ANOVA"
                        else:
                            stat, p_val, test_name = np.nan, np.nan, "Insufficient data"
                    except Exception as e:
                        stat, p_val, test_name = np.nan, np.nan, f"Test failed: {str(e)}"

                    bivariate_results[col] = {
                        'test': test_name,
                        'statistic': stat,
                        'p_value': p_val,
                        'significant': p_val < self.alpha if not np.isnan(p_val) else False
                    }

                    significance = "✅ Significant" if p_val < self.alpha else "❌ Not significant"
                    print(f"  {col} vs {self.target_column}: {test_name}, p={p_val:.4f} - {significance}")

                else:  # Numerical target
                    # Scatter plot with correlation
                    plt.figure(figsize=self.figsize)
                    sns.scatterplot(data=self.data, x=col, y=self.target_column, alpha=0.6)
                    sns.regplot(data=self.data, x=col, y=self.target_column, scatter=False, color='red')

                    # Calculate correlation
                    try:
                        corr_val = self.data[col].corr(target_data)
                        if pd.isna(corr_val):
                            corr_val = 0.0
                    except (TypeError, ValueError):
                        corr_val = 0.0

                    plt.title(f'{col} vs {self.target_column} (r = {corr_val:.3f})')
                    plt.tight_layout()
                    plt.savefig(self.output_dir / "plots" / f"scatter_{col}_vs_{self.target_column}.png",
                               dpi=300, bbox_inches='tight')
                    plt.show()

                    bivariate_results[col] = {
                        'correlation': corr_val,
                        'correlation_strength': self._interpret_correlation(abs(corr_val))
                    }

                    print(f"  {col} vs {self.target_column}: r = {corr_val:.3f} - {self._interpret_correlation(abs(corr_val))}")

            except Exception as e:
                print(f"  Error analyzing {col}: {str(e)}")
                continue

        # Categorical features vs target
        for col in self.categorical_cols:
            if col != self.target_column:
                try:
                    if target_type == 'categorical':
                        # Cross-tabulation and Chi-square test
                        ct = pd.crosstab(self.data[col], target_data)

                        plt.figure(figsize=self.figsize)
                        sns.heatmap(ct, annot=True, fmt='d', cmap='Blues')
                        plt.title(f'Cross-tabulation: {col} vs {self.target_column}')
                        plt.tight_layout()
                        plt.savefig(self.output_dir / "plots" / f"crosstab_{col}_vs_{self.target_column}.png",
                                   dpi=300, bbox_inches='tight')
                        plt.show()

                        # Chi-square test
                        try:
                            if ct.size > 0 and ct.sum().sum() > 0:
                                chi2, p_val, dof, expected = chi2_contingency(ct)
                                # Cramér's V
                                n = ct.sum().sum()
                                min_dim = min(ct.shape) - 1
                                if min_dim > 0:
                                    cramers_v = np.sqrt(chi2 / (n * min_dim))
                                else:
                                    cramers_v = 0.0

                                bivariate_results[col] = {
                                    'test': 'Chi-square',
                                    'statistic': chi2,
                                    'p_value': p_val,
                                    'cramers_v': cramers_v,
                                    'significant': p_val < self.alpha
                                }

                                significance = "✅ Significant" if p_val < self.alpha else "❌ Not significant"
                                print(f"  {col} vs {self.target_column}: Chi-square, p={p_val:.4f}, Cramér's V={cramers_v:.3f} - {significance}")
                            else:
                                print(f"  {col} vs {self.target_column}: Empty cross-tabulation")

                        except Exception as e:
                            print(f"  {col} vs {self.target_column}: Chi-square test failed - {str(e)}")

                    else:  # Numerical target
                        # Box plot by category
                        plt.figure(figsize=self.figsize)
                        sns.boxplot(data=self.data, x=col, y=self.target_column)
                        plt.title(f'{self.target_column} by {col}')
                        plt.xticks(rotation=45)
                        plt.tight_layout()
                        plt.savefig(self.output_dir / "plots" / f"boxplot_{self.target_column}_by_{col}.png",
                                   dpi=300, bbox_inches='tight')
                        plt.show()

                except Exception as e:
                    print(f"  Error analyzing {col}: {str(e)}")
                    continue

        self.results['bivariate_analysis'] = bivariate_results
        return bivariate_results

    def _interpret_correlation(self, corr_val: float) -> str:
        """Interpret correlation strength"""
        if corr_val >= 0.7:
            return "Strong"
        elif corr_val >= 0.5:
            return "Moderate"
        elif corr_val >= 0.3:
            return "Weak"
        else:
            return "Very weak"

    def feature_importance(self, method: str = 'auto') -> pd.DataFrame:
        """
        Calculate feature importance scores

        Args:
            method: Feature selection method ('auto', 'f_test', 'chi2', 'mutual_info')

        Returns:
            DataFrame with feature importance scores
        """
        if not self.target_column:
            print("⚠️ No target column specified for feature importance")
            return pd.DataFrame()

        print(f"\n🎯 FEATURE IMPORTANCE ANALYSIS...")

        # Prepare data
        X = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]

        # Encode categorical variables
        X_encoded = X.copy()
        y_encoded = y.copy()

        # Handle categorical features
        for col in X_encoded.select_dtypes(include=['object', 'category']).columns:
            try:
                le = LabelEncoder()
                X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
            except Exception as e:
                print(f"Warning: Could not encode column {col}: {str(e)}")
                X_encoded = X_encoded.drop(columns=[col])

        # Handle categorical target
        if y_encoded.dtype in ['object', 'category']:
            try:
                le_y = LabelEncoder()
                y_encoded = le_y.fit_transform(y_encoded.astype(str))
            except Exception as e:
                print(f"Warning: Could not encode target variable: {str(e)}")
                return pd.DataFrame()

        # Auto-select method
        if method == 'auto':
            if y.dtype in ['object', 'category'] or y.nunique() < 20:
                method = 'f_classif' if len(self.numerical_cols) > 0 else 'chi2'
            else:
                method = 'f_regression'

        # Calculate scores
        try:
            if method == 'f_classif':
                scores, p_values = f_classif(X_encoded, y_encoded)
            elif method == 'f_regression':
                scores, p_values = f_regression(X_encoded, y_encoded)
            elif method == 'chi2':
                # Ensure non-negative values
                X_encoded = X_encoded - X_encoded.min() + 1
                scores, p_values = chi2(X_encoded, y_encoded)
            elif method == 'mutual_info':
                scores = mutual_info_classif(X_encoded, y_encoded, random_state=42)
                p_values = np.full_like(scores, np.nan)

            # Create results DataFrame
            importance_df = pd.DataFrame({
                'feature': X.columns,
                'importance_score': scores,
                'p_value': p_values
            }).sort_values('importance_score', ascending=False)

            # Visualization
            plt.figure(figsize=self.figsize)
            top_features = importance_df.head(min(15, len(importance_df)))
            sns.barplot(data=top_features, y='feature', x='importance_score')
            plt.title(f'Feature Importance ({method})')
            plt.xlabel('Importance Score')
            plt.tight_layout()
            plt.savefig(self.output_dir / "plots" / "feature_importance.png", dpi=300, bbox_inches='tight')
            plt.show()

            # Summary
            print(f"📊 Top 10 Most Important Features ({method}):")
            for i, (_, row) in enumerate(importance_df.head(10).iterrows(), 1):
                p_str = f" (p={row['p_value']:.4f})" if not pd.isna(row['p_value']) else ""
                print(f"  {i:2d}. {row['feature']:<20} Score: {row['importance_score']:.3f}{p_str}")

            if not importance_df['p_value'].isna().all():
                significant_features = importance_df[importance_df['p_value'] < self.alpha]
                print(f"\n✅ {len(significant_features)} features are statistically significant (p < {self.alpha})")

            self.results['feature_importance'] = importance_df
            return importance_df

        except Exception as e:
            print(f"❌ Feature importance analysis failed: {str(e)}")
            return pd.DataFrame()

    def preprocess_data(self,
                       handle_missing: str = 'drop',
                       encode_categorical: str = 'onehot',
                       scale_numerical: bool = True) -> pd.DataFrame:
        """
        Advanced data preprocessing pipeline

        Args:
            handle_missing: Strategy for missing values ('drop', 'impute')
            encode_categorical: Categorical encoding ('onehot', 'label')
            scale_numerical: Whether to scale numerical features

        Returns:
            Preprocessed DataFrame
        """
        print(f"\n🔧 DATA PREPROCESSING...")
        print(f"  Missing value strategy: {handle_missing}")
        print(f"  Categorical encoding: {encode_categorical}")
        print(f"  Numerical scaling: {scale_numerical}")

        df_processed = self.data.copy()

        # Handle missing values
        if handle_missing == 'drop':
            # Drop columns with >50% missing values
            high_missing_cols = df_processed.columns[df_processed.isnull().mean() > 0.5]
            if len(high_missing_cols) > 0:
                df_processed = df_processed.drop(columns=high_missing_cols)
                print(f"  Dropped {len(high_missing_cols)} columns with >50% missing values")

            # Drop rows with any missing values
            initial_rows = len(df_processed)
            df_processed = df_processed.dropna()
            dropped_rows = initial_rows - len(df_processed)
            print(f"  Dropped {dropped_rows} rows with missing values")

        elif handle_missing == 'impute':
            # Impute numerical columns with median
            for col in self.numerical_cols:
                if col in df_processed.columns:
                    df_processed[col].fillna(df_processed[col].median(), inplace=True)

            # Impute categorical columns with mode
            for col in self.categorical_cols:
                if col in df_processed.columns and not df_processed[col].mode().empty:
                    df_processed[col].fillna(df_processed[col].mode().iloc[0], inplace=True)

            print(f"  Imputed missing values for all columns")

        # Encode categorical variables
        if encode_categorical == 'onehot':
            categorical_cols_to_encode = [col for col in self.categorical_cols if col in df_processed.columns]
            if categorical_cols_to_encode:
                df_processed = pd.get_dummies(df_processed, columns=categorical_cols_to_encode, drop_first=True)
                print(f"  One-hot encoded {len(categorical_cols_to_encode)} categorical columns")

        elif encode_categorical == 'label':
            for col in self.categorical_cols:
                if col in df_processed.columns:
                    le = LabelEncoder()
                    df_processed[col] = le.fit_transform(df_processed[col].astype(str))
            print(f"  Label encoded {len(self.categorical_cols)} categorical columns")

        # Scale numerical features
        if scale_numerical and self.numerical_cols:
            scaler = StandardScaler()
            numerical_cols_to_scale = [col for col in self.numerical_cols if col in df_processed.columns]
            if numerical_cols_to_scale:
                df_processed[numerical_cols_to_scale] = scaler.fit_transform(df_processed[numerical_cols_to_scale])
                print(f"  Standardized {len(numerical_cols_to_scale)} numerical columns")

        print(f"  Final shape: {df_processed.shape}")

        self.results['preprocessed_data'] = df_processed
        return df_processed

    def generate_insights(self) -> Dict:
        """
        Generate automated insights and recommendations

        Returns:
            Dictionary with insights and recommendations
        """
        print("\n" + "="*60)
        print("🤖 AUTOMATED INSIGHTS & RECOMMENDATIONS")
        print("="*60)

        insights = {
            'data_quality': [],
            'distributions': [],
            'correlations': [],
            'outliers': [],
            'feature_selection': [],
            'modeling_recommendations': []
        }

        # Data Quality Insights
        if 'data_overview' in self.results:
            overview = self.results['data_overview']

            missing_total = sum(overview['missing_values'].values())
            missing_pct = (missing_total / (self.data.shape[0] * self.data.shape[1])) * 100

            if missing_pct == 0:
                insights['data_quality'].append("✅ Excellent data quality - no missing values")
            elif missing_pct < 5:
                insights['data_quality'].append(f"✅ Good data quality - only {missing_pct:.1f}% missing values")
                insights['data_quality'].append("💡 Simple imputation strategies recommended")
            elif missing_pct < 20:
                insights['data_quality'].append(f"⚠️ Moderate missing data - {missing_pct:.1f}% missing values")
                insights['data_quality'].append("💡 Consider advanced imputation (KNN, iterative)")
            else:
                insights['data_quality'].append(f"❌ High missing data - {missing_pct:.1f}% missing values")
                insights['data_quality'].append("💡 Investigate data collection process")

            if overview['duplicate_percentage'] > 5:
                insights['data_quality'].append(f"⚠️ {overview['duplicate_percentage']:.1f}% duplicate rows detected")
                insights['data_quality'].append("💡 Remove duplicates before modeling")

        # Distribution Insights
        if 'distributions' in self.results:
            distributions = self.results['distributions']
            skewed_features = []
            non_normal_features = []

            for col, stats in distributions.items():
                if col in self.numerical_cols:
                    if abs(stats.get('skewness', 0)) > 1:
                        skewed_features.append(col)
                    if stats.get('is_normal') == False:
                        non_normal_features.append(col)

            if skewed_features:
                insights['distributions'].append(f"⚠️ Highly skewed features: {', '.join(skewed_features)}")
                insights['distributions'].append("💡 Consider log/Box-Cox transformations")

            if non_normal_features:
                insights['distributions'].append(f"📊 Non-normal distributions: {len(non_normal_features)} features")
                insights['distributions'].append("💡 Consider non-parametric methods or transformations")

        # Correlation Insights
        if 'high_correlations' in self.results:
            high_corr = self.results['high_correlations']
            if high_corr:
                insights['correlations'].append(f"⚠️ {len(high_corr)} high correlation pairs detected")
                insights['correlations'].append("💡 Consider feature selection or PCA")
                insights['correlations'].append("💡 Use regularized models (Ridge/Lasso)")
            else:
                insights['correlations'].append("✅ No problematic correlations detected")

        # Outlier Insights
        if 'outliers' in self.results:
            outliers = self.results['outliers']
            total_outliers = sum([result['iqr']['count'] for result in outliers.values()])
            outlier_rate = (total_outliers / (self.data.shape[0] * len(self.numerical_cols))) * 100

            if outlier_rate > 10:
                insights['outliers'].append(f"⚠️ High outlier rate: {outlier_rate:.1f}%")
                insights['outliers'].append("💡 Investigate outliers, use robust methods")
            elif outlier_rate > 5:
                insights['outliers'].append(f"⚠️ Moderate outlier rate: {outlier_rate:.1f}%")
                insights['outliers'].append("💡 Monitor model performance with/without outliers")
            else:
                insights['outliers'].append(f"✅ Low outlier rate: {outlier_rate:.1f}%")

        # Feature Selection Insights
        if 'feature_importance' in self.results:
            importance = self.results['feature_importance']
            if not importance.empty:
                top_features = len(importance[importance['importance_score'] > importance['importance_score'].median()])
                insights['feature_selection'].append(f"📊 {top_features} features above median importance")

                if not importance['p_value'].isna().all():
                    significant = len(importance[importance['p_value'] < self.alpha])
                    insights['feature_selection'].append(f"✅ {significant} statistically significant features")

        # Modeling Recommendations
        if len(self.categorical_cols) > len(self.numerical_cols):
            insights['modeling_recommendations'].append("📊 Primarily categorical data detected")
            insights['modeling_recommendations'].append("💡 Consider: Random Forest, XGBoost, CatBoost")
        else:
            insights['modeling_recommendations'].append("📊 Primarily numerical data detected")
            insights['modeling_recommendations'].append("💡 Consider: Linear models, SVM, Neural Networks")

        if self.data.shape[0] < 1000:
            insights['modeling_recommendations'].append("📏 Small dataset - use cross-validation")
            insights['modeling_recommendations'].append("💡 Avoid overly complex models")
        elif self.data.shape[0] > 100000:
            insights['modeling_recommendations'].append("📏 Large dataset - consider sampling for EDA")
            insights['modeling_recommendations'].append("💡 Use scalable algorithms")

        # Print insights
        for category, insight_list in insights.items():
            if insight_list:
                print(f"\n{category.replace('_', ' ').upper()}:")
                for insight in insight_list:
                    print(f"  {insight}")

        print("\n🎯 NEXT STEPS:")
        print("1. Address identified data quality issues")
        print("2. Apply recommended transformations")
        print("3. Select important features")
        print("4. Choose appropriate modeling approach")
        print("5. Implement proper validation strategy")

        self.results['insights'] = insights
        return insights

    def run_complete_analysis(self, save_plots: bool = True) -> Dict:
        """
        Run complete EDA analysis pipeline

        Args:
            save_plots: Whether to save plots

        Returns:
            Dictionary with all analysis results
        """
        if self.data is None:
            raise ValueError("No data loaded. Please load data first.")

        print("🚀 RUNNING COMPLETE EDA ANALYSIS...")
        print("="*60)

        try:
            # 1. Data Overview
            self.data_overview()

            # 2. Distribution Analysis
            self.analyze_distributions(save_plots=save_plots)

            # 3. Correlation Analysis
            self.correlation_analysis(plot_heatmap=save_plots)

            # 4. Outlier Detection
            self.outlier_detection()

            # 5. Multicollinearity Check
            self.multicollinearity_check()

            # 6. Bivariate Analysis
            if self.target_column:
                self.bivariate_analysis()

            # 7. Feature Importance
            if self.target_column:
                self.feature_importance()

            # 8. Generate Insights
            self.generate_insights()

            print("\n" + "="*60)
            print("✅ COMPLETE EDA ANALYSIS FINISHED")
            print(f"📁 Results saved to: {self.output_dir}")
            if save_plots:
                print(f"📊 Plots saved to: {self.output_dir}/plots/")
            print("="*60)

            return self.results

        except Exception as e:
            logger.error(f"Error in complete analysis: {str(e)}")
            raise

def main():
    """
    Example usage of the Advanced EDA Framework

    Builds a reproducible 1000-row synthetic dataset (fixed NumPy seed) with
    four numerical and three categorical columns, loads it with
    'performance' as the target column and runs the complete analysis.

    Any exception raised while building the data or running the analysis is
    caught and reported on stdout; nothing is returned.
    """
    # Initialize the framework
    eda = AdvancedEDA(alpha=0.05, figsize=(12, 8))

    # Example with sample data (replace with your data)
    try:
        # Option 1: Load from file
        # eda.load_data('your_dataset.csv', target_column='target')

        # Option 2: Create sample data for demonstration
        np.random.seed(42)
        n_samples = 1000

        sample_data = pd.DataFrame({
            'age': np.random.randint(18, 80, n_samples),
            'income': np.random.normal(50000, 15000, n_samples),
            'education_years': np.random.randint(8, 20, n_samples),
            'experience': np.random.randint(0, 40, n_samples),
            'department': np.random.choice(['Sales', 'Engineering', 'Marketing', 'HR'], n_samples),
            'gender': np.random.choice(['Male', 'Female'], n_samples),
            'performance': np.random.choice(['Low', 'Medium', 'High'], n_samples)
        })

        # Ensure no missing values in sample data.
        # Branch on "is numeric" rather than comparing the dtype against the
        # strings 'object'/'category': text columns stored with a dedicated
        # string dtype would otherwise reach the mean() branch and fail.
        for col in sample_data.columns:
            if pd.api.types.is_numeric_dtype(sample_data[col]):
                sample_data[col] = sample_data[col].fillna(sample_data[col].mean())
            else:
                sample_data[col] = sample_data[col].fillna('Unknown')

        # Load data
        eda.load_data(sample_data, target_column='performance')

        # Run complete analysis
        eda.run_complete_analysis(save_plots=True)

        print("\n🎉 EDA Framework demonstration completed successfully!")
        print("📈 Check the generated plots and analysis results above.")

    except Exception as e:
        # Top-level boundary of the demo: report the problem instead of crashing
        print(f"❌ Error in demonstration: {str(e)}")
        print("💡 Please ensure you have all required libraries installed.")

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()