# In [None]:
import pandas as pd
import numpy as np

# Read the CSV to analyse into a DataFrame.
# NOTE: '.csv' is a placeholder -- replace it with the real path before running.
file_path = '.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

def generate_dataset_report(dataframe, target_column=None):
    """Build a dictionary summarising the structure and quality of a DataFrame.

    Args:
        dataframe: The pandas DataFrame to analyse.
        target_column: Optional label of a column whose value distribution
            should be reported. Ignored when ``None`` or when the label is
            not present in ``dataframe.columns``.

    Returns:
        A dict with the following keys:

        * ``'shape'``: ``dataframe.shape``.
        * ``'dtypes'``: mapping of dtype -> number of columns with that dtype.
        * ``'missing_values'``: total missing cells, per-column counts for
          columns that have any, and the overall percentage of missing cells.
        * ``'description'``: ``describe()`` output for numeric and for
          object columns (an empty dict when no such columns exist).
        * ``'duplicates'``: number and percentage of duplicated rows.
        * ``'unique_values'``: mean number of unique values per column,
          columns with more than 100 unique values, and columns with
          exactly one unique value.
        * ``'target_distribution'``: only present when ``target_column`` is
          found; normalised value counts plus a flag that is true when the
          most frequent value covers more than 70% of the rows.
        * ``'correlation'``: only present when numeric columns exist; the
          ten largest absolute pairwise correlations keyed by
          ``(column_a, column_b)``, each unordered pair reported once and
          self-correlations excluded.
    """
    report = {}
    row_count = len(dataframe)

    # Basic information
    report['shape'] = dataframe.shape
    report['dtypes'] = dataframe.dtypes.value_counts().to_dict()

    # Missing values analysis
    missing_values = dataframe.isnull().sum()
    total_missing = int(missing_values.sum())
    # DataFrame.size is rows * columns; it replaces np.product, which was
    # removed in NumPy 2.0.
    total_cells = dataframe.size
    report['missing_values'] = {
        'total_missing': total_missing,
        'columns_with_missing': missing_values[missing_values > 0].to_dict(),
        # Guard against 0/0 for frames without any cells.
        'missing_percentage': (total_missing / total_cells) * 100 if total_cells else 0.0,
    }

    # Descriptive statistics
    # describe(include=...) raises ValueError when no column matches the
    # requested dtypes, so check for matching columns first.
    numeric_frame = dataframe.select_dtypes(include=[np.number])
    has_numeric = numeric_frame.shape[1] > 0
    has_object = dataframe.select_dtypes(include=['object']).shape[1] > 0
    report['description'] = {
        'numeric': dataframe.describe(include=[np.number]).to_dict() if has_numeric else {},
        'categorical': dataframe.describe(include=['object']).to_dict() if has_object else {},
    }

    # Duplicates analysis
    duplicate_count = int(dataframe.duplicated().sum())
    report['duplicates'] = {
        'total_duplicates': duplicate_count,
        'percentage': (duplicate_count / row_count) * 100 if row_count else 0.0,
    }

    # Unique values analysis
    unique_values = dataframe.nunique()
    report['unique_values'] = {
        'average_unique': unique_values.mean(),
        'high_cardinality_features': unique_values[unique_values > 100].to_dict(),
        'constant_features': unique_values[unique_values == 1].to_dict(),
    }

    # Target analysis (if specified)
    # Compare against None so falsy-but-valid labels (e.g. column 0) work.
    if target_column is not None and target_column in dataframe.columns:
        class_shares = dataframe[target_column].value_counts(normalize=True)
        report['target_distribution'] = {
            'class_distribution': class_shares.to_dict(),
            'class_imbalance': bool(class_shares.max() > 0.7),
        }

    # Correlation analysis (numeric features only)
    if has_numeric:
        # Correlate only the numeric columns: DataFrame.corr() on a frame
        # with string columns raises in pandas >= 2.0.
        corr_matrix = numeric_frame.corr().abs()
        # Keep the strictly-lower triangle so each unordered pair appears
        # exactly once and the trivial self-correlations (always 1.0) are
        # left out. unstack() then yields (earlier_column, later_column) keys.
        below_diagonal = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
        pair_correlations = corr_matrix.where(below_diagonal).unstack().dropna()
        report['correlation'] = {
            'top_correlations': pair_correlations.sort_values(
                ascending=False, kind='stable'
            ).head(10).to_dict()
        }

    return report

# Generate the report
# NOTE: 'your_target_column' is a placeholder; when the name is not a column
# of df, the target analysis is simply left out of the report.
target_col = 'your_target_column'  # Set to None if not applicable
dataset_report = generate_dataset_report(df, target_column=target_col)

# Print the report
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(dataset_report)

# Optional: Save to file
# Write with an explicit UTF-8 encoding: the report contains column names and
# values from the dataset, and the platform default encoding may be unable to
# represent non-ASCII text.
with open('dataset_report.txt', 'w', encoding='utf-8') as f:
    pprint.pprint(dataset_report, stream=f, indent=4)