In [1]:
"""
Create Feature Dataset for ML Training
======================================

Process:
1. Load all synthetic messy files
2. For each file, extract features from every column
3. Combine with labels (what problems exist)
4. Save as final training dataset
"""

# Standard library
import json
import sys
from pathlib import Path

# Third-party
import pandas as pd
from tqdm import tqdm  # Progress bar

# Make the local ml_pipeline package importable. The path is relative to the
# notebook's working directory. The membership check keeps this cell
# idempotent: re-running it no longer appends duplicate sys.path entries.
ML_PIPELINE_DIR = '../ml_pipeline'
if ML_PIPELINE_DIR not in sys.path:
    sys.path.append(ML_PIPELINE_DIR)

# Local import -- must stay below the sys.path adjustment above
from data.feature_extractor import ColumnFeatureExtractor

print("✅ Setup complete")

✅ Setup complete


In [2]:
"""
Load all generated messy CSV files
"""

# Directory holding the synthetic messy CSVs (relative to the notebook's cwd)
MESSY_DIR = Path('../data/synthetic/messy')

# Path.glob yields entries in an arbitrary, filesystem-dependent order.
# Sorting makes the row order of the final feature dataset reproducible.
messy_files = sorted(MESSY_DIR.glob('*.csv'))
print(f"Found {len(messy_files)} messy files")

# Surface an empty result early; every later cell depends on these files
if not messy_files:
    print(f"  WARNING: no CSV files found in {MESSY_DIR}")

# Show first few
for f in messy_files[:5]:
    print(f"  - {f.name}")

Found 15 messy files
  - messy_001_titanic_v0.csv
  - messy_002_titanic_v1.csv
  - messy_003_titanic_v2.csv
  - messy_004_tips_v0.csv
  - messy_005_tips_v1.csv


In [9]:
"""
For each file, extract features from each column
This will take a few minutes...

Labels are generated per column from that column's own extracted
statistics (ACTUAL column data), not from file-level labels.
"""

# Labeling thresholds. Defined once so the logic and the summary printed at
# the end of this cell cannot drift apart.
DUPLICATE_PCT_THRESHOLD = 50   # has_duplicates when duplicate_percentage > this
MISSING_PCT_THRESHOLD = 5      # has_missing when missing_percentage > this
OUTLIER_PCT_THRESHOLD = 3      # has_outliers when outlier_percentage > this
FORMAT_CONSISTENCY_MIN = 80    # has_format_issue when format_consistency_score < this
DIGITS_PCT_THRESHOLD = 80      # has_type_issue when an object column has contains_digits_pct > this


def label_column(features):
    """Derive the five binary problem labels for a single column.

    Args:
        features: mapping of feature name -> value for one column. It must
            provide 'duplicate_percentage', 'missing_percentage',
            'outlier_percentage', 'format_consistency_score', 'dtype_object'
            and 'contains_digits_pct'.

    Returns:
        dict mapping 'has_duplicates', 'has_missing', 'has_outliers',
        'has_format_issue' and 'has_type_issue' (in that order) to 0 or 1.

    Raises:
        KeyError: if one of the required features is absent.

    Note:
        A NaN feature value makes its comparison evaluate to False, so the
        corresponding label is 0.
    """
    # An object-typed column whose values mostly contain digits is treated as
    # a numeric column that was stored as strings.
    # NOTE(review): in the recorded run of this notebook this label was 0 for
    # all 129 columns, so this heuristic likely needs revisiting.
    is_object_but_should_be_numeric = (
        features['dtype_object'] == 1
        and features['contains_digits_pct'] > DIGITS_PCT_THRESHOLD
    )

    return {
        'has_duplicates': int(features['duplicate_percentage'] > DUPLICATE_PCT_THRESHOLD),
        'has_missing': int(features['missing_percentage'] > MISSING_PCT_THRESHOLD),
        'has_outliers': int(features['outlier_percentage'] > OUTLIER_PCT_THRESHOLD),
        'has_format_issue': int(features['format_consistency_score'] < FORMAT_CONSISTENCY_MIN),
        'has_type_issue': int(bool(is_object_but_should_be_numeric)),
    }


# NOTE(review): every label is a threshold on a feature that stays in the
# dataset (e.g. has_missing is missing_percentage > 5), so a model can recover
# the labels directly from those features. Confirm this is intended.
all_features = []

for messy_file in tqdm(messy_files, desc="Processing files"):
    # Load messy CSV
    df = pd.read_csv(messy_file)

    # One feature record per column
    for col in df.columns:
        features = ColumnFeatureExtractor(df[col], col).extract_all_features()

        # File metadata, so each record can be traced back to its source
        features['filename'] = messy_file.name
        features['column_name'] = col

        # Column-level labels, computed from THIS column's own statistics
        for label_name, label_value in label_column(features).items():
            features[label_name] = label_value

        all_features.append(features)

print(f"\n✅ Extracted features from {len(all_features)} columns")
print("✅ Labels generated from ACTUAL column data (not file labels)")
print(
    f"✅ Thresholds: duplicates>{DUPLICATE_PCT_THRESHOLD}%, "
    f"missing>{MISSING_PCT_THRESHOLD}%, "
    f"outliers>{OUTLIER_PCT_THRESHOLD}%, "
    f"format<{FORMAT_CONSISTENCY_MIN}%"
)


Processing files: 100%|████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 16.97it/s]


✅ Extracted features from 129 columns
✅ Labels generated from ACTUAL column data (not file labels)
✅ Thresholds: duplicates>50%, missing>5%, outliers>3%, format<80%





In [11]:
"""
Assemble the per-column feature records into one pandas DataFrame and preview it
"""

# One row per (file, column) record collected during extraction
feature_df = pd.DataFrame(all_features)

# Overall dimensions
print(f"Feature dataset shape: {feature_df.shape}")

# Heading and value are emitted on separate lines via sep
print("\nColumns:", feature_df.columns.tolist(), sep="\n")
print("\nFirst few rows:", feature_df.head(), sep="\n")

Feature dataset shape: (129, 40)

Columns:
['total_rows', 'missing_count', 'missing_percentage', 'unique_count', 'cardinality', 'duplicate_count', 'duplicate_percentage', 'mean', 'median', 'std', 'min', 'max', 'range', 'outlier_count', 'outlier_percentage', 'skewness', 'avg_length', 'length_std', 'contains_digits_pct', 'contains_special_pct', 'all_uppercase_pct', 'all_lowercase_pct', 'format_consistency_score', 'name_contains_id', 'name_contains_name', 'name_contains_email', 'name_contains_phone', 'name_contains_date', 'name_contains_amount', 'dtype_numeric', 'dtype_object', 'dtype_datetime', 'dtype_bool', 'filename', 'column_name', 'has_duplicates', 'has_missing', 'has_outliers', 'has_format_issue', 'has_type_issue']

First few rows:
   total_rows  missing_count  missing_percentage  unique_count  cardinality  \
0         980            136           13.877551             5     0.005102   
1         980            157           16.020408             6     0.006122   
2         980     

In [12]:
"""
Save for ML training
"""

output_path = '../data/processed/feature_dataset.csv'

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists; otherwise a fresh checkout fails at this last step.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the positional index carries no information worth persisting
feature_df.to_csv(output_path, index=False)

print(f"✅ Feature dataset saved: {output_path}")
print(f"   Shape: {feature_df.shape}")
print("   Ready for ML training!")

✅ Feature dataset saved: ../data/processed/feature_dataset.csv
   Shape: (129, 40)
   Ready for ML training!


In [13]:
"""
Verify labels are properly distributed now
"""

# Label columns to check and the minority-class share (in percent) above
# which a label is reported as reasonably balanced.
LABEL_COLUMNS = ['has_duplicates', 'has_missing', 'has_outliers', 'has_format_issue', 'has_type_issue']
MIN_MINORITY_PCT = 10


def report_label_balance(frame, label_columns=LABEL_COLUMNS, min_minority_pct=MIN_MINORITY_PCT):
    """Print per-class counts and a balance verdict for each binary label.

    Args:
        frame: DataFrame containing the label columns.
        label_columns: names of the 0/1 label columns to inspect.
        min_minority_pct: minority-class share (percent of all rows) that must
            be exceeded for a label to be reported as GOOD.

    Returns:
        None. The report is written to stdout.

    Raises:
        KeyError: if a name in label_columns is not a column of frame.
    """
    total = len(frame)
    if total == 0:
        # Avoid a ZeroDivisionError in the percentage maths below
        print("\n  ❌ Feature dataset is empty - nothing to verify")
        return

    for problem in label_columns:
        counts = frame[problem].value_counts().sort_index()

        print(f"\n{problem}:")
        for class_val in [0, 1]:
            # A class that never occurs is absent from value_counts
            count = counts.get(class_val, 0)
            pct = (count / total) * 100
            print(f"  Class {class_val}: {count:4d} ({pct:5.1f}%)")

        # Check balance
        if len(counts) == 1:
            print("  ❌ STILL BROKEN: Only one class!")
        elif len(counts) == 2:
            minority_pct = (counts.min() / total) * 100
            if minority_pct > min_minority_pct:
                print(f"  ✅ GOOD: Both classes present, {minority_pct:.1f}% minority class")
            else:
                print(f"  ⚠️  IMBALANCED: Only {minority_pct:.1f}% minority class")


print("📊 LABEL VERIFICATION")
print("=" * 60)

report_label_balance(feature_df)

print("\n" + "=" * 60)
print("If you see ✅ GOOD for most problems, labels are fixed!")
print("=" * 60)

📊 LABEL VERIFICATION

has_duplicates:
  Class 0:   18 ( 14.0%)
  Class 1:  111 ( 86.0%)
  ✅ GOOD: Both classes present, 14.0% minority class

has_missing:
  Class 0:    6 (  4.7%)
  Class 1:  123 ( 95.3%)
  ⚠️  IMBALANCED: Only 4.7% minority class

has_outliers:
  Class 0:   96 ( 74.4%)
  Class 1:   33 ( 25.6%)
  ✅ GOOD: Both classes present, 25.6% minority class

has_format_issue:
  Class 0:   10 (  7.8%)
  Class 1:  119 ( 92.2%)
  ⚠️  IMBALANCED: Only 7.8% minority class

has_type_issue:
  Class 0:  129 (100.0%)
  Class 1:    0 (  0.0%)
  ❌ STILL BROKEN: Only one class!

If you see ✅ GOOD for most problems, labels are fixed!
