In [None]:
NAME
COUPA_INVOICE_HEADER_SCD2_ID
CUSTOM_FIELDS
CANCELED
CASH_ACCOUNTING_SCHEME_REFERENCE
CHANNEL
CLEARANCE_DOCUMENT
COMMENTS
COMPLIANT
CONFIRMATION
COUPA_ACCELERATE_STATUS
CREDIT_REASON
CUSTOMS_DECLARATION_NUMBER
CUSTOMS_OFFICE
DELIVERY_NUMBER
DISPUTE_METHOD
DOCUMENT_TYPE
EARLY_PAYMENT_PROVISIONS
EXPORTED
FOLIO_NUMBER
FORM_OF_PAYMENT
IMAGE_SCAN
IMAGE_SCAN_URL
INBOX_NAME
INTERNAL_NOTE
INVOICE_NUMBER
ISSUANCE_PLACE
LAST_EXPORTED_AT
LATE_PAYMENT_PENALTIES
LINE_LEVEL_TAXATION
MARGIN_SCHEME
NET_DUE_DATE
ORIGINAL_INVOICE_NUMBER
PAID
PAYMENT_CHANNEL
PAYMENT_METHOD
PAYMENT_NOTES
PAYMENT_ORDER_REFERENCE
REVERSE_CHARGE_REFERENCE
SELF_BILLING_REFERENCE
SENDER_EMAIL
SERIES
SHOW_TAX_INFORMATION
STATUS
SUPPLIER_CREATED
SUPPLIER_NOTE
TAX_AMOUNT_ENGINE
TAX_RATE
TOLERANCE_FAILURES
TYPE_OF_RECEIPT
TYPE_OF_RELATIONSHIP
USE_OF_INVOICE
ZDP_META_SOURCE_ACCOUNT
DBT_SCD_ID

In [None]:
-- Pull the descriptive attributes of Coupa invoice lines that this notebook profiles.
-- Session context: database CLEANSED, schema COUPA.
-- The select list below is repeated verbatim in `columns_to_analyze` in a later Python cell;
-- keep the two lists in sync when adding or removing a column.
use database CLEANSED;
USE SCHEMA COUPA;
SELECT 
    COUPA_INVOICE_LINE_SCD2_ID,
    CUSTOM_FIELDS,
    BILLING_NOTE,
    CATEGORY,
    COMPANY_UOM,
    CUSTOMS_DECLARATION_NUMBER,
    DEDUCTIBILITY,
    DELIVERY_NOTE_NUMBER,
    DESCRIPTION,
    HSN_SAC_CODE,
    LINE_TYPE,
    MATCH_REFERENCE,
    ORDER_HEADER_NUM,
    ORDER_LINE_NUM,
    ORDER_LINE_SOURCE_PART_NUM,
    ORIGINAL_DATE_OF_SUPPLY,
    PO_NUMBER,
    PROPERTY_TAX_ACCOUNT,
    SOURCE_PART_NUM,
    STATUS,
    SUBCATEGORY,
    TAX_AMOUNT_ENGINE,
    TAX_DESCRIPTION,
    TAX_LOCATION,
    TAX_RATE,
    TYPE,
    UNSPSC,
    ZDP_META_SOURCE_ACCOUNT,
    DBT_SCD_ID
FROM COUPA_INVOICE_LINE_BCV
-- Keep only rows whose CREATED_AT calendar date is strictly after 2025-11-30.
-- NOTE(review): the cutoff date is hard-coded, and wrapping CREATED_AT in DATE() may limit
-- partition pruning on large tables - confirm whether a plain range predicate is acceptable.
WHERE DATE(CREATED_AT) > '2025-11-30'

In [None]:
import pandas as pd
# Materialize the object referenced as `cell1` into a pandas DataFrame for profiling.
# NOTE(review): `cell1` is not defined by any Python cell in this notebook - presumably it is
# the name of the SQL cell above (notebook cell reference); confirm the cell name matches.
df = cell1.to_pandas()

In [None]:
def generate_data_snapshot(df):
    """
    Generates a 360° data snapshot for a given DataFrame.

    For every column the snapshot records the dtype, the row / non-null /
    null / distinct counts and a type-specific ``DETAILS`` string:

    * numeric columns  -> min, median, mean, max, P90 and P95
    * datetime columns -> min and max
    * everything else (including boolean columns) -> the top 3 values with
      their share of the non-null rows

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe to profile

    Returns:
    --------
    pandas.DataFrame
        One row per column of ``df`` with the columns ATTRIBUTE, DTYPE,
        ROW_COUNT, NON_NULL_CT, NULL_CT, DISTINCT_CT and DETAILS. The columns
        are present even when ``df`` has no columns.
    """
    snapshot_columns = [
        'ATTRIBUTE', 'DTYPE', 'ROW_COUNT', 'NON_NULL_CT',
        'NULL_CT', 'DISTINCT_CT', 'DETAILS',
    ]
    summary_data = []
    row_count = len(df)

    for col in df.columns:
        col_data = df[col]
        non_null_ct = col_data.count()

        # Base stats shared by every column type
        stats = {
            'ATTRIBUTE': col,
            'DTYPE': str(col_data.dtype),
            'ROW_COUNT': row_count,
            'NON_NULL_CT': non_null_ct,
            'NULL_CT': row_count - non_null_ct,
            'DISTINCT_CT': col_data.nunique(),
            'DETAILS': ''  # Filled in below with type-specific stats
        }

        # pandas reports boolean dtypes as numeric, but describe() summarises
        # them like categoricals (count/unique/top/freq) so there is no
        # 'min'/'mean' to format. Route them to the top-values branch instead.
        is_bool = pd.api.types.is_bool_dtype(col_data)

        if pd.api.types.is_numeric_dtype(col_data) and not is_bool:
            # Numeric: min/median/mean/max, P90/P95
            desc = col_data.describe(percentiles=[.5, .9, .95])
            stats['DETAILS'] = (
                f"Min: {desc['min']:.2f}, Med: {desc['50%']:.2f}, Mean: {desc['mean']:.2f}, "
                f"Max: {desc['max']:.2f}, P90: {desc['90%']:.2f}, P95: {desc['95%']:.2f}"
            )
        elif pd.api.types.is_datetime64_any_dtype(col_data):
            # Date: min/max
            stats['DETAILS'] = f"Min: {col_data.min()}, Max: {col_data.max()}"
        else:
            # Categorical / boolean: top 3 values with shares of non-null rows
            top_counts = col_data.value_counts(normalize=True).head(3)
            top_strs = [f"{val} ({pct:.1%})" for val, pct in top_counts.items()]
            stats['DETAILS'] = "Top 3: " + ", ".join(top_strs)

        summary_data.append(stats)

    # Pin the column set so an empty input still yields a well-formed frame.
    summary_df = pd.DataFrame(summary_data, columns=snapshot_columns)

    return summary_df
# Example Usage (Uncomment to run with your dataframe)
# df = pd.read_csv('your_data.csv')
# generate_data_snapshot(df)

In [None]:
# Define the function to get unique values for specified columns
def unique_values(df, list_of_columns):
    """
    Get unique values for each column in the list.
    Excludes columns where all values are NULL.

    Columns named in ``list_of_columns`` that are not in ``df`` are skipped
    with a printed warning; all-NULL columns are skipped with a printed notice
    and reported in the second return value.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe to analyze
    list_of_columns : list
        List of column names to analyze

    Returns:
    --------
    tuple: (results_df, excluded_columns)
        - results_df: DataFrame with columns: column_name, unique_count, unique_values (as string).
          The columns are present even when no column produced a row.
        - excluded_columns: List of columns that were all NULL
    """
    result_columns = ['column_name', 'unique_count', 'unique_values']
    results = []
    excluded_columns = []

    for col in list_of_columns:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in dataframe")
            continue

        series = df[col]

        # Check if all values are NULL/NaN
        if series.isna().all():
            excluded_columns.append(col)
            print(f"Excluding '{col}' - all values are NULL")
            continue

        # Get unique values, excluding None/NaN
        unique_vals = series.dropna().unique()

        # Natural ordering when the values are mutually comparable; object
        # columns holding mixed types (e.g. int and str) cannot be compared
        # with '<', so fall back to ordering by their string form.
        try:
            ordered_vals = sorted(unique_vals)
        except TypeError:
            ordered_vals = sorted(unique_vals, key=str)

        results.append({
            'column_name': col,
            'unique_count': len(unique_vals),
            # Convert to string and join with ' | ' separator
            'unique_values': ' | '.join(str(val) for val in ordered_vals)
        })

    # Pin the column set so an empty result still exposes the expected schema.
    return pd.DataFrame(results, columns=result_columns), excluded_columns

#print("Function 'unique_values' defined successfully!")

In [None]:
# Define function to get top N most frequent values for each column
def top_n_values(df, list_of_columns, top_n=5):
    """
    Get the top N most frequent values for each column.
    Returns one row per value per column (exploded format).
    Excludes columns where all values are NULL.

    Columns named in ``list_of_columns`` that are not in ``df`` are skipped
    with a printed warning; all-NULL columns are skipped with a printed notice
    and reported in the second return value.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe to analyze
    list_of_columns : list
        List of column names to analyze
    top_n : int
        Number of top values to return per column (default: 5)

    Returns:
    --------
    tuple: (results_df, excluded_columns)
        - results_df: DataFrame with columns: column_name, rank, value, count, percentage.
          ``percentage`` is the share of the column's non-null rows, rounded to 2 decimals.
          The columns are present even when no column produced a row.
        - excluded_columns: List of columns that were all NULL
    """
    result_columns = ['column_name', 'rank', 'value', 'count', 'percentage']
    results = []
    excluded_columns = []

    for col in list_of_columns:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in dataframe")
            continue

        series = df[col]

        # Check if all values are NULL/NaN
        if series.isna().all():
            excluded_columns.append(col)
            print(f"Excluding '{col}' - all values are NULL")
            continue

        # Get value counts for the column (value_counts drops NULLs by default)
        value_counts = series.value_counts().head(top_n)
        # At least one non-null value exists here (all-NULL columns were
        # skipped above), so this denominator is never zero.
        total_non_null = series.notna().sum()

        # Create one row per top value
        for rank, (val, count) in enumerate(value_counts.items(), start=1):
            results.append({
                'column_name': col,
                'rank': rank,
                'value': str(val),
                'count': count,
                'percentage': round(count / total_non_null * 100, 2)
            })

    # Pin the column set so callers can index 'column_name' even when every
    # requested column was excluded or missing.
    return pd.DataFrame(results, columns=result_columns), excluded_columns

print("Function 'top_n_values' defined successfully!")

In [None]:
# Define the list of columns to analyze
# (same columns, in the same order, as the select list of the SQL cell that feeds `df`)
columns_to_analyze = [
    'COUPA_INVOICE_LINE_SCD2_ID',
    'CUSTOM_FIELDS',
    'BILLING_NOTE',
    'CATEGORY',
    'COMPANY_UOM',
    'CUSTOMS_DECLARATION_NUMBER',
    'DEDUCTIBILITY',
    'DELIVERY_NOTE_NUMBER',
    'DESCRIPTION',
    'HSN_SAC_CODE',
    'LINE_TYPE',
    'MATCH_REFERENCE',
    'ORDER_HEADER_NUM',
    'ORDER_LINE_NUM',
    'ORDER_LINE_SOURCE_PART_NUM',
    'ORIGINAL_DATE_OF_SUPPLY',
    'PO_NUMBER',
    'PROPERTY_TAX_ACCOUNT',
    'SOURCE_PART_NUM',
    'STATUS',
    'SUBCATEGORY',
    'TAX_AMOUNT_ENGINE',
    'TAX_DESCRIPTION',
    'TAX_LOCATION',
    'TAX_RATE',
    'TYPE',
    'UNSPSC',
    'ZDP_META_SOURCE_ACCOUNT',
    'DBT_SCD_ID'
]

# Run the analysis
top_results_df, top_excluded_columns = top_n_values(df, columns_to_analyze, top_n=5)

# The result frame is built from a list of row dicts; when every requested column is
# all-NULL or missing that list is empty and the frame may carry no 'column_name'
# column at all. Guard the lookups below so the cell reports zero instead of raising KeyError.
has_results = 'column_name' in top_results_df.columns and not top_results_df.empty
analyzed_column_count = top_results_df['column_name'].nunique() if has_results else 0

# Display results
print(f"\nTop Values Analysis complete!")
print(f"Total rows (top values across all columns): {len(top_results_df)}")
print(f"Columns analyzed: {analyzed_column_count}")
print(f"Columns excluded (all NULL): {len(top_excluded_columns)}")
if top_excluded_columns:
    print(f"\nExcluded columns: {', '.join(top_excluded_columns)}")

# Show summary by column (skipped when there is nothing to summarise)
if has_results:
    print("\nTop values count by column:")
    print(top_results_df.groupby('column_name').size().to_frame('top_values_returned'))

# Return the exploded dataframe to render it
top_results_df

In [None]:
# Render the per-column profile of `df` (dtype, null/distinct counts and type-specific
# details) as this cell's output.
generate_data_snapshot(df)