In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import RobustScaler, StandardScaler

- Distribution visualization
- Missing value analysis
- Correlation analysis with automatic selection of correlation method
- Smart normalization based on outlier presence
- Outlier visualization
- Comprehensive summary report generation

In [None]:
class UniversalEDA:
    """
    Exploratory data analysis helper bound to a single DataFrame.

    Provides distribution plots, a missing-value summary, correlation
    analysis (Pearson or Spearman, chosen per column pair from a
    Shapiro-Wilk normality test), outlier-aware scaling, outlier box plots
    and a combined summary report.
    """

    # scipy.stats.shapiro needs at least three observations.
    _MIN_SHAPIRO_SAMPLES = 3

    def __init__(self, df):
        """
        Initialize the EDA class with a DataFrame

        Parameters:
        -----------
        df : pandas.DataFrame
            DataFrame to analyze
        """
        self.df = df
        self.numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

    def _iqr_outlier_mask(self, col):
        """
        Return a boolean Series marking values of `col` that lie more than
        1.5 * IQR below the first quartile or above the third quartile.
        """
        series = self.df[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        margin = 1.5 * (q3 - q1)
        return (series < (q1 - margin)) | (series > (q3 + margin))

    def _plot_correlation_heatmap(self, correlation_matrix):
        """
        Draw the correlation heatmap on a new figure and return that figure.

        Returns None (and draws nothing) when the matrix is empty.
        """
        if correlation_matrix.empty:
            return None
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
        ax.set_title('Correlation Matrix')
        return fig

    def plot_distributions(self, figsize=None):
        """
        Visualize the distribution of numerical variables
        - Histogram
        - Box plot
        - Q-Q plot for normality check

        Parameters:
        -----------
        figsize : tuple, optional
            Figure size. Defaults to (15, 5 * number of numeric columns).
            The default is resolved at call time because `self` is not
            available when the method signature is evaluated.

        Returns:
        --------
        fig : matplotlib.figure.Figure or None
            The created figure, or None when there are no numeric columns.
        """
        n_cols = len(self.numeric_columns)
        if n_cols == 0:
            print("No numeric columns to plot.")
            return None
        if figsize is None:
            figsize = (15, 5 * n_cols)

        # squeeze=False keeps `axes` two-dimensional even for a single column.
        fig, axes = plt.subplots(n_cols, 3, figsize=figsize, squeeze=False)

        for idx, col in enumerate(self.numeric_columns):
            hist_ax, box_ax, qq_ax = axes[idx]

            # Histogram
            sns.histplot(data=self.df, x=col, ax=hist_ax)
            hist_ax.set_title(f'{col} Distribution')

            # Box plot
            sns.boxplot(data=self.df, y=col, ax=box_ax)
            box_ax.set_title(f'{col} Boxplot')

            # Q-Q plot
            stats.probplot(self.df[col].dropna(), dist="norm", plot=qq_ax)
            qq_ax.set_title(f'{col} Q-Q Plot')

        plt.tight_layout()
        return fig

    def check_missing_values(self):
        """
        Analyze missing values
        - Print count and percentage of missing values for each variable
        - Return rows containing missing values

        Returns:
        --------
        pandas.DataFrame
            Rows of the DataFrame that contain at least one missing value
        """
        missing_count = self.df.isnull().sum()
        missing_percent = (missing_count / len(self.df)) * 100
        missing_summary = pd.DataFrame({
            'Missing Count': missing_count,
            'Missing Percent': missing_percent
        }).sort_values('Missing Count', ascending=False)

        print("\n=== Missing Value Summary ===")
        print(missing_summary[missing_summary['Missing Count'] > 0])

        print("\n=== Rows with Missing Values ===")
        return self.df[self.df.isnull().any(axis=1)]

    def correlation_analysis(self, plot=True):
        """
        Analyze correlations between variables
        - Use Pearson when both columns pass the Shapiro-Wilk normality
          test (p > 0.05), Spearman otherwise
        - Missing values are dropped pairwise so both samples stay aligned

        Parameters:
        -----------
        plot : bool
            When True (default) draw and show the correlation heatmap

        Returns:
        --------
        correlation_matrix : pandas.DataFrame
            Float matrix of correlation coefficients (NaN when fewer than
            two complete pairs exist)
        method_matrix : pandas.DataFrame
            'Pearson' or 'Spearman' for every column pair
        """
        columns = self.numeric_columns
        correlation_matrix = pd.DataFrame(index=columns, columns=columns, dtype=float)
        method_matrix = pd.DataFrame(index=columns, columns=columns, dtype=object)

        # Normality depends on a single column, so test each column once
        # instead of once per pair.
        is_normal = {}
        for col in columns:
            values = self.df[col].dropna()
            if len(values) < self._MIN_SHAPIRO_SAMPLES:
                # Too few points to test; fall back to the rank-based method.
                is_normal[col] = False
            else:
                _, p_value = stats.shapiro(values)
                is_normal[col] = p_value > 0.05

        for col1 in columns:
            for col2 in columns:
                # Keep only rows where both values are present so the two
                # samples have equal length and matching observations.
                both_present = self.df[col1].notna() & self.df[col2].notna()
                x = self.df.loc[both_present, col1]
                y = self.df.loc[both_present, col2]

                if is_normal[col1] and is_normal[col2]:
                    method, corr_func = 'Pearson', stats.pearsonr
                else:
                    method, corr_func = 'Spearman', stats.spearmanr

                if len(x) >= 2:
                    corr, _ = corr_func(x, y)
                else:
                    corr = np.nan

                correlation_matrix.loc[col1, col2] = corr
                method_matrix.loc[col1, col2] = method

        if plot and self._plot_correlation_heatmap(correlation_matrix) is not None:
            plt.show()

        print("\n=== Correlation Method Used ===")
        print(method_matrix)

        return correlation_matrix, method_matrix

    def normalize_data(self, columns=None):
        """
        Normalize data using appropriate scaling method
        - RobustScaler when at least 1% of rows are IQR outliers
        - StandardScaler otherwise

        Parameters:
        -----------
        columns : list
            List of columns to normalize (None for all numeric variables)

        Returns:
        --------
        normalized_df : pandas.DataFrame
            Normalized copy of the DataFrame
        scalers : dict
            Dictionary of scaler objects used for each column
        """
        if columns is None:
            columns = self.numeric_columns

        normalized_df = self.df.copy()
        scalers = {}
        n_rows = len(self.df)

        for col in columns:
            outliers = self._iqr_outlier_mask(col).sum()
            outlier_ratio = outliers / n_rows if n_rows else 0.0

            # Use RobustScaler if outliers are at least 1% of data
            if outlier_ratio >= 0.01:
                scaler = RobustScaler()
                print(f"{col}: Using RobustScaler (Found {outliers} outliers)")
            else:
                scaler = StandardScaler()
                print(f"{col}: Using StandardScaler")

            # fit_transform returns an (n, 1) array; flatten it for assignment.
            normalized_df[col] = scaler.fit_transform(self.df[[col]]).ravel()
            scalers[col] = scaler

        return normalized_df, scalers

    def plot_outliers(self, columns=None):
        """
        Visualize outliers using box plots

        Parameters:
        -----------
        columns : list
            List of columns to visualize (None for all numeric variables)

        Returns:
        --------
        fig : matplotlib.figure.Figure or None
            The created figure, or None when there is nothing to plot
        """
        if columns is None:
            columns = self.numeric_columns

        n_cols = len(columns)
        if n_cols == 0:
            print("No columns to plot.")
            return None

        # squeeze=False gives a consistent 2-D axes array for any n_cols.
        fig, axes = plt.subplots(n_cols, 1, figsize=(10, 5*n_cols), squeeze=False)

        for idx, col in enumerate(columns):
            ax = axes[idx, 0]
            sns.boxplot(data=self.df, y=col, ax=ax)

            # Report the number of IQR outliers in the title
            outliers = self._iqr_outlier_mask(col).sum()
            ax.set_title(f'{col} Outliers: {outliers} points')

        plt.tight_layout()
        return fig

    def generate_summary_report(self):
        """
        Generate comprehensive data summary report
        - Basic information
        - Numeric summary
        - Missing values analysis
        - Distribution plots
        - Correlation analysis
        - Outlier visualization

        Each plot is saved through its own Figure object before anything is
        shown, so the PNG files never depend on which figure happens to be
        "current".
        """
        print("=== Data Summary Report ===")
        print("\nBasic Information:")
        print(f"- Total Rows: {len(self.df)}")
        print(f"- Total Columns: {len(self.df.columns)}")
        print(f"- Numeric Columns: {len(self.numeric_columns)}")
        print(f"- Non-numeric Columns: {len(self.df.columns) - len(self.numeric_columns)}")

        if len(self.numeric_columns) > 0:
            print("\nNumeric Columns Summary:")
            print(self.df[self.numeric_columns].describe())

        print("\nMissing Values Summary:")
        self.check_missing_values()

        # Generate and save all plots
        saved = []

        dist_fig = self.plot_distributions()
        if dist_fig is not None:
            dist_fig.savefig('distributions.png')
            saved.append('distributions.png')

        correlation_matrix, _ = self.correlation_analysis(plot=False)
        corr_fig = self._plot_correlation_heatmap(correlation_matrix)
        if corr_fig is not None:
            corr_fig.savefig('correlation.png')
            saved.append('correlation.png')

        outlier_fig = self.plot_outliers()
        if outlier_fig is not None:
            outlier_fig.savefig('outliers.png')
            saved.append('outliers.png')

        if saved:
            print("\nPlots have been saved as:")
            for filename in saved:
                print(f"- {filename}")
            # Show only after saving; some backends discard figures on show().
            plt.show()
        else:
            print("\nNo plots were generated.")

In [None]:
# Usage example for UniversalEDA.
# The triple-quoted string below is a bare expression statement, so nothing
# inside it is executed; copy the lines you need into a cell where a
# DataFrame named `df` is already defined.
"""
# Initialize the class with a DataFrame
eda = UniversalEDA(df)

# Generate comprehensive summary report
eda.generate_summary_report()

# Or perform specific analyses
eda.plot_distributions()  # Check distributions
eda.check_missing_values()  # Analyze missing values
eda.correlation_analysis()  # Analyze correlations
eda.plot_outliers()  # Visualize outliers

# Normalize data
normalized_df, scalers = eda.normalize_data()
"""

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, spearmanr, pearsonr
from sklearn.preprocessing import StandardScaler, RobustScaler

# 1. Plot distribution of numerical features
def plot_distribution(df):
    """
    Show one histogram (with KDE overlay, 30 bins) per int64/float64 column.

    Each column is drawn on its own 8x4 figure and displayed immediately.
    Nothing is returned.
    """
    numeric_names = df.select_dtypes(include=['int64', 'float64']).columns
    for name in numeric_names:
        plt.figure(figsize=(8, 4))
        # histplot draws on (and returns) the current axes of the new figure.
        ax = sns.histplot(df[name], kde=True, bins=30, color='blue')
        ax.set_title(f'Distribution of {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')
        plt.show()

# 2. Missing Value Analysis
def missing_value_analysis(df):
    """
    Print the number of missing values per column and the affected rows.

    Only columns with at least one missing value are listed. When any
    value is missing, the rows containing missing values are rendered with
    IPython's rich `display` if IPython is installed, and with plain
    `print` otherwise, so the function also works outside a notebook.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame to inspect

    Returns:
    --------
    None
    """
    missing_counts = df.isnull().sum()
    missing_counts = missing_counts[missing_counts > 0]
    print("Missing Values Per Column:")
    print(missing_counts)

    # Every remaining count is > 0, so "empty" means nothing is missing.
    if missing_counts.empty:
        return

    # `display` is only injected as a builtin inside IPython/Jupyter; import
    # it explicitly and fall back to print when IPython is unavailable.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    print("\nRows with Missing Values:")
    show(df[df.isnull().any(axis=1)])

# 3. Compute Correlation (Pearson if normal, Spearman otherwise)
def calculate_correlation(df, plot=True):
    """
    Compute a correlation matrix over the int64/float64 columns.

    Pearson is used for a pair when both columns pass the Shapiro-Wilk
    normality test (p > 0.05); Spearman is used otherwise. Applying the
    same rule to both columns keeps the matrix symmetric. Missing values
    are dropped pairwise so the two samples always have equal length and
    refer to the same rows. The diagonal is computed like any other pair.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame to analyze
    plot : bool
        When True (default) draw and show an annotated heatmap

    Returns:
    --------
    correlation_matrix : pandas.DataFrame
        Float matrix of coefficients; NaN where fewer than two complete
        pairs are available
    """
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    correlation_matrix = pd.DataFrame(index=num_cols, columns=num_cols, dtype=float)

    # Normality is a per-column property: test each column once. Shapiro-Wilk
    # needs at least three observations; with fewer, use the rank-based method.
    is_normal = {}
    for col in num_cols:
        values = df[col].dropna()
        if len(values) < 3:
            is_normal[col] = False
        else:
            _, p = shapiro(values)
            is_normal[col] = p > 0.05

    for col1 in num_cols:
        for col2 in num_cols:
            both_present = df[col1].notna() & df[col2].notna()
            x = df.loc[both_present, col1]
            y = df.loc[both_present, col2]
            if len(x) < 2:
                continue  # leave NaN: not enough data for a coefficient
            if is_normal[col1] and is_normal[col2]:
                corr, _ = pearsonr(x, y)
            else:
                corr, _ = spearmanr(x, y)
            correlation_matrix.loc[col1, col2] = corr

    # An empty matrix cannot be drawn as a heatmap.
    if plot and not correlation_matrix.empty:
        plt.figure(figsize=(10, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
        plt.title("Correlation Matrix")
        plt.show()

    return correlation_matrix

# 4. Detect Outliers
def detect_outliers(df, method="zscore", threshold=3, iqr_multiplier=1.5):
    """
    Find outlier rows for every int64/float64 column.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame to inspect
    method : str
        "zscore" flags rows whose absolute z-score exceeds `threshold`;
        "iqr" flags rows outside
        [Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]
    threshold : float
        Z-score cut-off (only used when method == "zscore")
    iqr_multiplier : float
        Width of the IQR fence (only used when method == "iqr")

    Returns:
    --------
    outlier_dict : dict
        Maps each column that has outliers to the DataFrame rows flagged
        for that column. Columns without outliers are omitted.

    Raises:
    -------
    ValueError
        If `method` is neither "zscore" nor "iqr"
    """
    # Validate up front; otherwise an unknown method would only surface as
    # an unbound local variable inside the loop.
    if method not in ("zscore", "iqr"):
        raise ValueError(
            f"Unknown method {method!r}; expected 'zscore' or 'iqr'"
        )

    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    outlier_dict = {}

    for col in num_cols:
        if method == "zscore":
            mean, std = df[col].mean(), df[col].std()
            # A zero std yields NaN/inf z-scores; NaN compares False, so a
            # constant column reports no outliers.
            z_scores = (df[col] - mean) / std
            outliers = df[np.abs(z_scores) > threshold]
        else:  # method == "iqr"
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            fence = iqr_multiplier * (Q3 - Q1)
            outliers = df[(df[col] < (Q1 - fence)) | (df[col] > (Q3 + fence))]

        if not outliers.empty:
            outlier_dict[col] = outliers

    return outlier_dict

# 5. Normalize Data
def normalize_data(df):
    """
    Scale every int64/float64 column of `df`, choosing the scaler per column.

    A column is scaled with RobustScaler when `detect_outliers` (z-score
    method, threshold 3) reports outliers for it, and with StandardScaler
    otherwise. The choice is printed for each column.

    Returns a tuple of (scaled copy of `df`, dict mapping column name to
    the fitted scaler).
    """
    scaled = df.copy()
    fitted_scalers = {}

    for name in df.select_dtypes(include=['int64', 'float64']).columns:
        single_column = df[[name]]
        # detect_outliers returns a dict that is non-empty only when the
        # column has at least one flagged row.
        found = detect_outliers(single_column, method="zscore", threshold=3)

        if found:
            scaler = RobustScaler()
            print(f"Using RobustScaler for {name} (outliers detected)")
        else:
            scaler = StandardScaler()
            print(f"Using StandardScaler for {name} (no outliers detected)")

        scaled[name] = scaler.fit_transform(single_column)
        fitted_scalers[name] = scaler

    return scaled, fitted_scalers
