Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
371 changes: 371 additions & 0 deletions ADVANCED_ANALYSIS_PLAN.md

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
428 changes: 424 additions & 4 deletions src/f2a/core/analyzer.py → f2a/core/analyzer.py

Large diffs are not rendered by default.

53 changes: 52 additions & 1 deletion src/f2a/core/config.py → f2a/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,50 @@ class AnalysisConfig:
max_plot_columns: int = 20
"""Maximum columns per plot grid (prevents overly large figures)."""

# ── Advanced analysis ─────────────────────────────────
advanced: bool = True
"""Enable the Advanced analysis tab (clustering, anomaly, etc.)."""

advanced_distribution: bool = True
"""Best-fit distribution, power transform, Jarque-Bera, ECDF."""

advanced_correlation: bool = True
"""Partial correlation, MI matrix, bootstrap CI, network graph."""

clustering: bool = True
"""K-Means, DBSCAN, hierarchical clustering."""

advanced_dimreduction: bool = True
"""t-SNE, UMAP (optional), Factor Analysis."""

feature_insights: bool = True
"""Interaction, monotonic, binning, cardinality, leakage detection."""

advanced_anomaly: bool = True
"""Isolation Forest, LOF, Mahalanobis, consensus."""

statistical_tests: bool = True
"""Levene, Kruskal-Wallis, Mann-Whitney, goodness-of-fit, Grubbs."""

data_profiling: bool = True
"""Automated insights, type recommendation, health dashboard."""

# ── Advanced sub-options ──────────────────────────────
max_cluster_k: int = 10
"""Maximum k for K-Means elbow search."""

tsne_perplexity: float = 30.0
"""t-SNE perplexity parameter."""

bootstrap_iterations: int = 1000
"""Number of bootstrap resamples for correlation CI."""

max_sample_for_advanced: int = 5000
"""Max rows sampled for expensive advanced analyses (t-SNE, UMAP, etc.)."""

n_distribution_fits: int = 7
"""Number of candidate distributions to fit."""

@staticmethod
def minimal() -> "AnalysisConfig":
"""Return a config with only core analyses (descriptive + missing)."""
Expand All @@ -80,12 +124,19 @@ def minimal() -> "AnalysisConfig":
pca=False,
duplicates=False,
quality_score=False,
advanced=False,
)

@staticmethod
def fast() -> "AnalysisConfig":
    """Return a config that skips expensive analyses.

    Three switches are turned off: ``pca``, ``feature_importance`` and
    ``advanced`` (the Advanced analysis tab).  No other option is passed,
    so everything else keeps the ``AnalysisConfig`` defaults.

    Returns:
        A new ``AnalysisConfig`` instance.
    """
    return AnalysisConfig(
        pca=False,
        feature_importance=False,
        advanced=False,
    )

@staticmethod
def basic_only() -> "AnalysisConfig":
    """Build a config with the Basic analyses on and Advanced switched off.

    Only the ``advanced`` flag is overridden (set to ``False``); every
    other option is left at its ``AnalysisConfig`` default.
    """
    overrides = {"advanced": False}
    return AnalysisConfig(**overrides)
File renamed without changes.
File renamed without changes.
7 changes: 6 additions & 1 deletion src/f2a/core/schema.py → f2a/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,17 @@ def infer_schema(df: pd.DataFrame) -> DataSchema:

for col in df.columns:
n_missing = int(df[col].isna().sum())
try:
n_unique = int(df[col].nunique())
except TypeError:
# Column contains unhashable types (e.g. numpy arrays, lists)
n_unique = len(df[col].dropna())
columns.append(
ColumnInfo(
name=col,
dtype=str(df[col].dtype),
inferred_type=type_map[col],
n_unique=int(df[col].nunique()),
n_unique=n_unique,
n_missing=n_missing,
missing_ratio=round(n_missing / len(df), 4) if len(df) > 0 else 0.0,
)
Expand Down
File renamed without changes.
1,708 changes: 1,708 additions & 0 deletions f2a/report/generator.py

Large diffs are not rendered by default.

Loading