In [1]:
"""
MANUAL LABELING PROCESS
======================

For each CSV file:
1. Load it
2. Inspect each column
3. Mark what problems exist
4. Save labels to JSON

This becomes our "ground truth" for training ML models
"""

import pandas as pd
import json
from pathlib import Path

# Load a dataset
df = pd.read_csv('../data/raw/titanic_messy.csv')

print(f"Dataset loaded: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Dataset loaded: (891, 15)
Columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']


In [4]:
"""
For EACH column, we'll check:
- Has missing values? (Yes/No)
- Has duplicates? (Yes/No) 
- Format inconsistent? (Yes/No)
- Has outliers? (Yes/No)
- Wrong data type? (Yes/No)
"""

def analyze_column(series, column_name):
    analysis = {
        'column_name': column_name,
        'dtype': str(series.dtype),
        'total_rows': len(series),
        'problems': []
    }

    # 1Ô∏è‚É£ Missing values
    missing_count = series.isna().sum()
    if missing_count > 0:
        analysis['problems'].append({
            'type': 'missing_values',
            'count': int(missing_count),
            'percentage': round(missing_count / len(series) * 100, 2)
        })

    # 2Ô∏è‚É£ Outliers (ONLY for real numeric types)
    if series.dtype in ['int64', 'float64']:
        clean_series = series.dropna()

        if len(clean_series) > 0:
            q1 = clean_series.quantile(0.25)
            q3 = clean_series.quantile(0.75)
            iqr = q3 - q1

            outliers = clean_series[
                (clean_series < (q1 - 1.5 * iqr)) |
                (clean_series > (q3 + 1.5 * iqr))
            ]

            if len(outliers) > 0:
                analysis['problems'].append({
                    'type': 'outliers',
                    'count': int(len(outliers)),
                    'examples': outliers.head(3).tolist()
                })

    return analysis


# Analyze all columns
results = {}
for col in df.columns:
    results[col] = analyze_column(df[col], col)
    
print("‚úÖ Column analysis complete")

‚úÖ Column analysis complete


In [5]:
"""
Show findings in readable format
"""

print("=" * 60)
print("DATA QUALITY ANALYSIS RESULTS")
print("=" * 60)

for col_name, analysis in results.items():
    print(f"\nüìä Column: {col_name}")
    print(f"   Type: {analysis['dtype']}")
    print(f"   Rows: {analysis['total_rows']}")
    
    if analysis['problems']:
        print(f"   ‚ö†Ô∏è  Problems found: {len(analysis['problems'])}")
        for problem in analysis['problems']:
            if problem['type'] == 'missing_values':
                print(f"      - Missing: {problem['count']} ({problem['percentage']}%)")
            elif problem['type'] == 'outliers':
                print(f"      - Outliers: {problem['count']}")
                print(f"        Examples: {problem['examples']}")
    else:
        print(f"   ‚úÖ No issues detected")

DATA QUALITY ANALYSIS RESULTS

üìä Column: survived
   Type: int64
   Rows: 891
   ‚úÖ No issues detected

üìä Column: pclass
   Type: int64
   Rows: 891
   ‚úÖ No issues detected

üìä Column: sex
   Type: str
   Rows: 891
   ‚úÖ No issues detected

üìä Column: age
   Type: float64
   Rows: 891
   ‚ö†Ô∏è  Problems found: 2
      - Missing: 177 (19.87%)
      - Outliers: 11
        Examples: [66.0, 65.0, 71.0]

üìä Column: sibsp
   Type: int64
   Rows: 891
   ‚ö†Ô∏è  Problems found: 1
      - Outliers: 46
        Examples: [3, 4, 3]

üìä Column: parch
   Type: int64
   Rows: 891
   ‚ö†Ô∏è  Problems found: 1
      - Outliers: 213
        Examples: [1, 2, 1]

üìä Column: fare
   Type: float64
   Rows: 891
   ‚ö†Ô∏è  Problems found: 1
      - Outliers: 116
        Examples: [71.2833, 263.0, 146.5208]

üìä Column: embarked
   Type: str
   Rows: 891
   ‚ö†Ô∏è  Problems found: 1
      - Missing: 2 (0.22%)

üìä Column: class
   Type: str
   Rows: 891
   ‚úÖ No issues detected

üìä Co

In [6]:
"""
Save analysis results to JSON file
This becomes our training data labels
"""

# Save to JSON
output_file = '../data/labeled/titanic_labels.json'

with open(output_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úÖ Labels saved to: {output_file}")


‚úÖ Labels saved to: ../data/labeled/titanic_labels.json


In [7]:
"""
Generate summary statistics
"""

# Count problems by type
problem_counts = {}
for col_data in results.values():
    for problem in col_data['problems']:
        ptype = problem['type']
        problem_counts[ptype] = problem_counts.get(ptype, 0) + 1

print("\n" + "=" * 60)
print("SUMMARY REPORT")
print("=" * 60)
print(f"\nDataset: Titanic")
print(f"Total columns: {len(results)}")
print(f"Total rows: {df.shape[0]}")
print(f"\nProblems found:")

for ptype, count in problem_counts.items():
    print(f"  - {ptype}: {count} columns affected")

# Overall quality score
total_possible_problems = len(results) * 5  # 5 types of problems
actual_problems = sum(len(data['problems']) for data in results.values())
quality_score = ((total_possible_problems - actual_problems) / total_possible_problems) * 100

print(f"\nüìä Overall Quality Score: {quality_score:.1f}/100")

if quality_score > 80:
    print("   Status: ‚úÖ Good quality")
elif quality_score > 60:
    print("   Status: ‚ö†Ô∏è  Moderate quality")
else:
    print("   Status: ‚ùå Poor quality - needs cleaning!")


SUMMARY REPORT

Dataset: Titanic
Total columns: 15
Total rows: 891

Problems found:
  - missing_values: 4 columns affected
  - outliers: 4 columns affected

üìä Overall Quality Score: 89.3/100
   Status: ‚úÖ Good quality
