In [15]:
from src.core.loader import load_dataset, preview_dataset , DataLoader, DatasetInfo
import polars as pl
csv_data = "data/examples/customers-2000000.csv"
json_data = "data/examples/flights-1m.json"
df = load_dataset(json_data)

In [16]:
df.head()

FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
str,i64,i64,i64,i64,f64,f64
"""2006-01-01""",5,19,350,2475,9.083333,12.483334
"""2006-01-02""",167,216,343,2475,11.783334,15.766666
"""2006-01-03""",-7,-2,344,2475,8.883333,12.133333
"""2006-01-04""",-5,-13,331,2475,8.916667,11.95
"""2006-01-05""",-3,-17,321,2475,8.95,11.883333


In [17]:
preview = preview_dataset(csv_data, n_rows=5)
print(f"Columns: {preview['columns']}")
print(preview['preview'])

Columns: 12
shape: (5, 12)
┌───────┬────────────┬────────────┬───────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ Index ┆ Customer   ┆ First Name ┆ Last Name ┆ … ┆ Phone 2    ┆ Email     ┆ Subscript ┆ Website   │
│ ---   ┆ Id         ┆ ---        ┆ ---       ┆   ┆ ---        ┆ ---       ┆ ion Date  ┆ ---       │
│ i64   ┆ ---        ┆ str        ┆ str       ┆   ┆ str        ┆ str       ┆ ---       ┆ str       │
│       ┆ str        ┆            ┆           ┆   ┆            ┆           ┆ str       ┆           │
╞═══════╪════════════╪════════════╪═══════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 1     ┆ 4962fdbE6B ┆ Pam        ┆ Sparks    ┆ … ┆ 480-078-05 ┆ nicolas00 ┆ 2020-11-2 ┆ https://n │
│       ┆ fee6D      ┆            ┆           ┆   ┆ 35x889     ┆ @faulkner ┆ 9         ┆ elson.com │
│       ┆            ┆            ┆           ┆   ┆            ┆ -kramer.c ┆           ┆ /         │
│       ┆            ┆            ┆           ┆   ┆            ┆

In [18]:
loader = DataLoader(lazy=True)
lazy_df = loader.load(json_data)
print(lazy_df.head())

shape: (5, 7)
┌────────────┬───────────┬───────────┬──────────┬──────────┬───────────┬───────────┐
│ FL_DATE    ┆ DEP_DELAY ┆ ARR_DELAY ┆ AIR_TIME ┆ DISTANCE ┆ DEP_TIME  ┆ ARR_TIME  │
│ ---        ┆ ---       ┆ ---       ┆ ---      ┆ ---      ┆ ---       ┆ ---       │
│ str        ┆ i64       ┆ i64       ┆ i64      ┆ i64      ┆ f64       ┆ f64       │
╞════════════╪═══════════╪═══════════╪══════════╪══════════╪═══════════╪═══════════╡
│ 2006-01-01 ┆ 5         ┆ 19        ┆ 350      ┆ 2475     ┆ 9.083333  ┆ 12.483334 │
│ 2006-01-02 ┆ 167       ┆ 216       ┆ 343      ┆ 2475     ┆ 11.783334 ┆ 15.766666 │
│ 2006-01-03 ┆ -7        ┆ -2        ┆ 344      ┆ 2475     ┆ 8.883333  ┆ 12.133333 │
│ 2006-01-04 ┆ -5        ┆ -13       ┆ 331      ┆ 2475     ┆ 8.916667  ┆ 11.95     │
│ 2006-01-05 ┆ -3        ┆ -17       ┆ 321      ┆ 2475     ┆ 8.95      ┆ 11.883333 │
└────────────┴───────────┴───────────┴──────────┴──────────┴───────────┴───────────┘


In [19]:
loader = DataLoader(lazy=False)
lazy_df = loader.load(csv_data)
info = loader.get_info()
print(f"Loaded {info.rows:,} rows in {info.load_time_seconds:.2f}s")
print(f"Encoding: {info.encoding}")
if info.warnings:
    print("Warnings:", info.warnings)

Loaded 2,000,000 rows in 1.41s
Encoding: utf-8


In [20]:
from src.core.schema import detect_schema, get_schema_summary


df = load_dataset(csv_data)

schema = detect_schema(df)
for col_name, col_info in schema.items():
    print(f"{col_name}: {col_info.data_type.value} ({col_info.semantic_type.value})")

Index: numeric (none)
Customer Id: text (none)
First Name: categorical (none)
Last Name: categorical (none)
Company: text (none)
City: text (none)
Country: categorical (none)
Phone 1: text (phone)
Phone 2: text (phone)
Email: text (email)
Subscription Date: datetime (none)
Website: text (url)


In [21]:
summary = get_schema_summary(df)
print(f"Total columns: {summary['total_columns']}")
print(f"Memory usage: {summary['total_memory_mb']:.2f} MB")
print(f"Type distribution: {summary['type_distribution']}")

Total columns: 12
Memory usage: 298.69 MB
Type distribution: {'numeric': 1, 'text': 7, 'categorical': 3, 'datetime': 1}


In [25]:
from src.core.statistics import (
    analyze_statistics, 
    get_statistics_summary,
    NumericStats,
    CategoricalStats,
    DatetimeStats,
    TextStats
)

stats = analyze_statistics(df)

In [26]:
if isinstance(stats['Index'], NumericStats):
    print(f"Mean: {stats['Index'].mean}")
    print(f"Std: {stats['Index'].std}")
    print(f"Skewness: {stats['Index'].skewness}")

Mean: 1000000.5
Std: 577350.4135271751
Skewness: 1.8427188811611887e-16


In [27]:
if isinstance(stats['Country'], CategoricalStats):
    print(f"Mode: {stats['Country'].mode}")
    print(f"Unique: {stats['Country'].unique_count}")
    print("Top values:", stats['Country'].top_values[:3])

Mode: Korea
Unique: 243
Top values: [('Korea', 16240, 0.812), ('Congo', 16208, 0.8104), ('Jordan', 8428, 0.4214)]


In [28]:
summary = get_statistics_summary(df)
print(f"Numeric columns: {summary['numeric_columns']}")
print(f"Categorical columns: {summary['categorical_columns']}")

Numeric columns: ['Index']
Categorical columns: ['First Name', 'Last Name', 'Country']


In [29]:
from src.core.missing import (
    analyze_missing,
    get_missing_summary,
    get_missing_heatmap_data,
    detect_missing_type
)

In [41]:
# Full analysis
missing_analysis = analyze_missing(df)
print(f"Total missing: {missing_analysis['total_missing_values']}")
print(f"Overall missing %: {missing_analysis['overall_missing_percentage']:.2f}%")
print(f"Complete rows: {missing_analysis['complete_rows']} ({missing_analysis['complete_rows_percentage']:.2f}%)")

# Per column
for col, info in missing_analysis['columns'].items():
    if info.missing_count > 0:
        print(f"{col}: {info.missing_count} missing ({info.missing_percentage:.2f}%)")

# Patterns
print("\nTop missing patterns:")
for pattern in missing_analysis['patterns'][:3]:
    print(f"  {pattern.columns}: {pattern.count} rows ({pattern.percentage:.2f}%)")

# Summary
summary = get_missing_summary(df)
print(f"\nHigh missing columns: {summary['high_missing_columns']}")
print(f"Medium missing columns: {summary['medium_missing_columns']}")

# Heatmap data for visualization
heatmap = get_missing_heatmap_data(df, sample_size=100)
print(f"\nHeatmap matrix shape: {len(heatmap['matrix'])} x {len(heatmap['columns'])}")

# Detect missing type
missing_type = detect_missing_type(df, 'Email')
print(f"Email missing type: {missing_type}")

Total missing: 0
Overall missing %: 0.00%
Complete rows: 1000000 (100.00%)

Top missing patterns:

High missing columns: []
Medium missing columns: []

Heatmap matrix shape: 100 x 7
Email missing type: unknown


In [34]:
from src.core.outliers import (
    detect_outliers,
    detect_multivariate_outliers,
    get_outlier_summary,
    get_outliers_for_column
)

df = load_dataset(json_data)

In [38]:
outliers = detect_outliers(df)
for col, info in outliers.items():
    if info.iqr_outlier_count > 0:
        print(f"{col}:")
        print(f"  IQR outliers: {info.iqr_outlier_count} ({info.outlier_percentage:.2f}%)")
        print(f"  Bounds: [{info.iqr_lower_bound:.2f}, {info.iqr_upper_bound:.2f}]")

DEP_DELAY:
  IQR outliers: 120804 (12.38%)
  Bounds: [-22.00, 26.00]
ARR_DELAY:
  IQR outliers: 84250 (8.43%)
  Bounds: [-41.50, 42.50]
AIR_TIME:
  IQR outliers: 45857 (4.59%)
  Bounds: [-66.50, 257.50]
DISTANCE:
  IQR outliers: 49707 (4.97%)
  Bounds: [-663.50, 1972.50]


In [37]:
summary = get_outlier_summary(df)
print(f"\nColumns with outliers: {summary['columns_with_outliers']}")
print(f"High outlier columns: {summary['high_outlier_columns']}")

# Specific column (use actual column from your data)
col_outliers = get_outliers_for_column(df, 'DEP_DELAY')
print(f"\nDEP_DELAY outliers (IQR): {col_outliers['iqr']['count']}")
print(f"DEP_DELAY outliers (Z-score): {col_outliers['zscore']['count']}")
print(f"DEP_DELAY outliers (MAD): {col_outliers['mad']['count']}")

# Multivariate outliers (multiple columns together)
multi_outliers = detect_multivariate_outliers(
    df, 
    columns=['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME'], 
    contamination=0.05
)
print(f"\nMultivariate outliers: {multi_outliers.outlier_count} ({multi_outliers.outlier_percentage:.2f}%)")
print(f"First 10 outlier rows: {multi_outliers.outlier_indices[:10]}")


Columns with outliers: ['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'ARR_TIME']
High outlier columns: [('DEP_DELAY', 12.380099999999999), ('ARR_DELAY', 8.425)]

DEP_DELAY outliers (IQR): 120804
DEP_DELAY outliers (Z-score): 20987
DEP_DELAY outliers (MAD): 123700

Multivariate outliers: 50000 (5.00%)
First 10 outlier rows: [0, 1, 5, 12, 22, 23, 24, 28, 35, 39]


In [39]:
from src.core.correlations import (
    analyze_correlations,
    get_correlation_matrix,
    get_top_correlations,
    get_correlation_for_columns
)

In [42]:
correlations = analyze_correlations(df)
print(f"Numeric columns: {correlations['numeric_columns']}")
print(f"Categorical columns: {correlations['categorical_columns']}")

Numeric columns: ['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'DEP_TIME', 'ARR_TIME']
Categorical columns: ['FL_DATE']


In [43]:
# Top correlations
top_corr = get_top_correlations(df, n=10, min_correlation=0.5)
print("\nTop 10 correlations:")
for pair in top_corr:
    print(f"  {pair.column1} ↔ {pair.column2}: {pair.correlation:.3f} ({pair.method})")

# Correlation matrix (for heatmap)
matrix = get_correlation_matrix(df, method='pearson')
print(f"\nCorrelation matrix shape: {len(matrix['columns'])} x {len(matrix['columns'])}")

# Specific pair
pair_corr = get_correlation_for_columns(df, 'DEP_DELAY', 'ARR_DELAY')
print(f"\nDEP_DELAY vs ARR_DELAY:")
print(f"  Pearson: {pair_corr['pearson']['correlation']:.3f}")
print(f"  Spearman: {pair_corr['spearman']['correlation']:.3f}")


Top 10 correlations:
  AIR_TIME ↔ DISTANCE: 0.978 (spearman)
  AIR_TIME ↔ DISTANCE: 0.976 (pearson)
  DEP_DELAY ↔ ARR_DELAY: 0.905 (pearson)
  DEP_TIME ↔ ARR_TIME: 0.822 (spearman)
  DEP_TIME ↔ ARR_TIME: 0.723 (pearson)
  DEP_DELAY ↔ ARR_DELAY: 0.640 (spearman)

Correlation matrix shape: 6 x 6

DEP_DELAY vs ARR_DELAY:
  Pearson: 0.905
  Spearman: 0.640


In [49]:
from src.core.distributions import (
    analyze_distributions,
    get_distribution_summary,
    get_histogram,
    get_kde,
    test_normality
)

In [50]:
distributions = analyze_distributions(df, bins=50)
for col, info in distributions.items():
    print(f"\n{col}:")
    print(f"  Distribution type: {info.distribution_type}")
    print(f"  Is normal: {info.is_normal}")
    print(f"  Skewness: {info.skewness:.3f}")
    print(f"  Kurtosis: {info.kurtosis:.3f}")


DEP_DELAY:
  Distribution type: right_skewed
  Is normal: False
  Skewness: 5.693
  Kurtosis: 110.204

ARR_DELAY:
  Distribution type: right_skewed
  Is normal: False
  Skewness: 5.295
  Kurtosis: 68.233

AIR_TIME:
  Distribution type: right_skewed
  Is normal: False
  Skewness: 1.580
  Kurtosis: 3.245

DISTANCE:
  Distribution type: right_skewed
  Is normal: False
  Skewness: 1.627
  Kurtosis: 3.460

DEP_TIME:
  Distribution type: unknown
  Is normal: False
  Skewness: 0.023
  Kurtosis: -0.963

ARR_TIME:
  Distribution type: approximately_normal
  Is normal: False
  Skewness: -0.332
  Kurtosis: -0.363


In [51]:
# Summary
summary = get_distribution_summary(df)
print(f"\nNormal columns: {summary['normal_columns']}")
print(f"Skewed columns: {summary['skewed_columns']}")
print(f"Distribution types: {summary['distribution_type_counts']}")

# Histogram for specific column
hist = get_histogram(df, 'DEP_DELAY', bins=30)
print(f"\nDEP_DELAY histogram:")
print(f"  Bins: {len(hist['counts'])}")
print(f"  Total count: {hist['total_count']}")

# KDE for smooth density plot
kde = get_kde(df, 'DEP_DELAY', num_points=100)
if kde:
    print(f"  KDE computed with {len(kde['x'])} points")

# Normality tests
normality = test_normality(df, 'ARR_DELAY')
print(f"\nARR_DELAY normality tests:")
if normality.get('shapiro_wilk'):
    print(f"  Shapiro-Wilk: p={normality['shapiro_wilk']['p_value']:.4f}")
if normality.get('anderson_darling'):
    print(f"  Anderson-Darling: stat={normality['anderson_darling']['statistic']:.4f}")
print(f"  Overall normal: {normality['is_normal']}")


Normal columns: []
Skewed columns: ['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE']
Distribution types: {'right_skewed': 4, 'unknown': 1, 'approximately_normal': 1}

DEP_DELAY histogram:
  Bins: 30
  Total count: 1000000
  KDE computed with 100 points

ARR_DELAY normality tests:
  Anderson-Darling: stat=84750.5748
  Overall normal: False


In [None]:
from core.analyzer import DataAnalyzer, analyze_dataset, quick_analyze, get_data_quality_score