In [38]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, Any, Union

def LoadDataset(file_path: Union[str, Path]) -> pd.DataFrame:
    """
    Load a tabular data file into a pandas DataFrame.

    The reader is selected from the file extension (case-insensitive):

    - ``.csv``             -> ``pd.read_csv``
    - ``.tsv``             -> ``pd.read_csv`` with a tab separator
    - ``.xlsx`` / ``.xls`` -> ``pd.read_excel``
    - ``.json``            -> ``pd.read_json``
    - ``.parquet``         -> ``pd.read_parquet``

    Every reader is called with its default options. Excel and Parquet
    support rely on pandas' optional engines being installed.

    Args:
        file_path: Location of the data file, as a string or ``Path``.

    Returns:
        The file contents as a DataFrame.

    Raises:
        ValueError: If the file extension is not one of the formats above.
    """
    file_path = Path(file_path)
    # Normalise once so '.CSV' and '.csv' are treated the same.
    suffix = file_path.suffix.lower()

    # Dispatch table: adding a format is a one-line change here.
    readers = {
        '.csv': pd.read_csv,
        '.tsv': lambda path: pd.read_csv(path, sep='\t'),
        '.xlsx': pd.read_excel,
        '.xls': pd.read_excel,
        '.json': pd.read_json,
        '.parquet': pd.read_parquet,
    }

    reader = readers.get(suffix)
    if reader is None:
        raise ValueError(f"Unsupported file format: {file_path.suffix}")
    return reader(file_path)

def GetDatasetProfile(df: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
    """
    Build a per-column profile of a DataFrame.

    Args:
        df: The DataFrame to profile.

    Returns:
        A dict keyed by column name. Every column entry contains:

        - ``dtype``: the column dtype as a string
        - ``total_count``: number of rows
        - ``null_count``: number of values pandas treats as missing
        - ``null_percentage``: ``null_count`` as a percentage of rows
          (``0.0`` when the column has no rows)
        - ``unique_count``: number of distinct non-missing values

        Numeric columns additionally get ``mean``, ``median``, ``std``,
        ``min`` and ``max`` (each ``None`` when the column has no rows).

        Text columns (object or pandas string dtype) additionally get
        ``alternative_null_count``: the number of non-missing values whose
        text, ignoring case and surrounding whitespace, is ``na``, ``null``
        or ``nan``.
    """
    # Textual placeholders for "missing", compared after strip + lower so
    # that spellings such as 'NaN', 'N A'-free variants like ' null ' or
    # 'NA' are all recognised. Defined once, outside the column loop.
    null_tokens = ['na', 'null', 'nan']

    profile = {}

    for col in df.columns:
        series = df[col]
        total_count = len(series)
        # Computed once and reused; cast to a plain int so the profile is
        # made of built-in types where possible.
        null_count = int(series.isna().sum())

        col_stats = {
            'dtype': str(series.dtype),
            'total_count': total_count,
            'null_count': null_count,
            # Guard against a zero-row frame, where the ratio is undefined.
            'null_percentage': (null_count / total_count) * 100 if total_count else 0.0,
            'unique_count': int(series.nunique()),
        }

        # Add numerical statistics if applicable
        if pd.api.types.is_numeric_dtype(series):
            has_rows = not series.empty
            col_stats.update({
                'mean': series.mean() if has_rows else None,
                'median': series.median() if has_rows else None,
                'std': series.std() if has_rows else None,
                'min': series.min() if has_rows else None,
                'max': series.max() if has_rows else None,
            })

        # Count string placeholders that stand in for missing values.
        # Covers both plain object columns and pandas' string dtype.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or isinstance(series.dtype, pd.StringDtype)
        )
        if is_text:
            # Real nulls are already counted above; drop them so they are
            # not stringified to 'nan' and counted a second time.
            normalized = series.dropna().astype(str).str.strip().str.lower()
            col_stats['alternative_null_count'] = int(normalized.isin(null_tokens).sum())

        profile[col] = col_stats

    return profile

In [41]:
# Load the credit-card transactions CSV (the path is relative to the current working directory).
df = LoadDataset('../sheets/credit_card_transactions.csv')
# Profile every column; as the cell's final expression, the returned dict is what gets displayed below.
GetDatasetProfile(df)

{'Unnamed: 0': {'dtype': 'int64',
  'total_count': 1296675,
  'null_count': 0,
  'null_percentage': 0.0,
  'unique_count': 1296675,
  'mean': 648337.0,
  'median': 648337.0,
  'std': 374317.9744882685,
  'min': 0,
  'max': 1296674},
 'trans_date_trans_time': {'dtype': 'object',
  'total_count': 1296675,
  'null_count': 0,
  'null_percentage': 0.0,
  'unique_count': 1274791,
  'alternative_null_count': 0},
 'cc_num': {'dtype': 'int64',
  'total_count': 1296675,
  'null_count': 0,
  'null_percentage': 0.0,
  'unique_count': 983,
  'mean': 4.1719204207972666e+17,
  'median': 3521417320836166.0,
  'std': 1.3088064470002404e+18,
  'min': 60416207185,
  'max': 4992346398065154184},
 'merchant': {'dtype': 'object',
  'total_count': 1296675,
  'null_count': 0,
  'null_percentage': 0.0,
  'unique_count': 693,
  'alternative_null_count': 0},
 'category': {'dtype': 'object',
  'total_count': 1296675,
  'null_count': 0,
  'null_percentage': 0.0,
  'unique_count': 14,
  'alternative_null_count': 0}