In [1]:
"""
SMART DATA COLLECTION STRATEGY
==============================

Instead of downloading 100 random files:
- Get 10-15 DIVERSE files from different domains
- Manually inspect each
- Document what problems they have
- Quality > Quantity for training

Why this works:
- 10 well-chosen files > 100 random files
- You understand the data deeply
- Training labels will be accurate
- Faster iteration
"""

# Candidate data sources (free, legal, messy!).
# Maps a source/provider name to the list of dataset names (or notes) we plan
# to pull from it. This is a plan only -- nothing is downloaded in this cell.
data_sources = {
    "Kaggle": [
        "messy-vs-clean-room",
        "customer-churn-dataset",
        "employee-attrition",
        "supermarket-sales",
    ],
    "Government (India)": [
        "data.gov.in - MGNREGA datasets",
        "data.gov.in - Healthcare data",
        "data.gov.in - Education statistics",
    ],
    "UCI ML Repository": [
        "Adult Income dataset",
        "Credit Approval dataset",
    ],
    "Generate Synthetic": [
        "We'll create intentionally messy data",
    ],
}

# Title of the printed plan. The leading emoji was previously stored as
# mis-decoded bytes ("üìä"); it is the bar-chart emoji U+1F4CA.
PLAN_TITLE = "📊 DATA COLLECTION PLAN"

print(PLAN_TITLE)
print("=" * 50)
# One heading per source, followed by a bulleted list of its datasets.
for source, datasets in data_sources.items():
    print(f"\n{source}:")
    for ds in datasets:
        print(f"  - {ds}")

📊 DATA COLLECTION PLAN

Kaggle:
  - messy-vs-clean-room
  - customer-churn-dataset
  - employee-attrition
  - supermarket-sales

Government (India):
  - data.gov.in - MGNREGA datasets
  - data.gov.in - Healthcare data
  - data.gov.in - Education statistics

UCI ML Repository:
  - Adult Income dataset
  - Credit Approval dataset

Generate Synthetic:
  - We'll create intentionally messy data


In [2]:
"""
OPTION 1: Download from Kaggle
Need: Kaggle account (free)

OR

OPTION 2: Use built-in datasets
Faster for learning!
"""

# Let's use a built-in messy dataset first
from pathlib import Path

import seaborn as sns

# Load Titanic dataset (famously messy!)
# NOTE: seaborn's load_dataset pulls from its online example-data repository,
# so this cell needs internet access the first time it runs.
titanic = sns.load_dataset('titanic')

print("✅ Loaded Titanic dataset")
print(f"Shape: {titanic.shape}")
print("\nFirst few rows:")
print(titanic.head())

# Save to CSV.
# to_csv does not create missing folders and raises OSError if ../data/raw is
# absent (e.g. on a fresh checkout), so make sure the directory exists first.
raw_csv_path = Path('../data/raw/titanic_messy.csv')
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
titanic.to_csv(raw_csv_path, index=False)

print("✅ Saved to data/raw/titanic_messy.csv")


✅ Loaded Titanic dataset
Shape: (891, 15)

First few rows:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
✅ Saved to data/raw/titanic_messy.csv


In [3]:
"""
MANUAL INSPECTION CHECKLIST
---------------------------
Go through each column and ask:
1. Are there missing values?
2. Are there duplicates?
3. Is the format consistent?
4. Are there outliers?
5. Is the data type correct?
"""

# Quick data-quality pass over the `titanic` DataFrame loaded in the previous
# cell. Each numbered section mirrors one item of the inspection checklist.
# The emoji in the headings were previously stored as mis-decoded bytes
# (e.g. "üìã", "1Ô∏è‚É£"); they are restored to the intended characters here.
print("📋 QUICK DATA QUALITY CHECK - Titanic Dataset")
print("=" * 60)

# 1. Missing Values -- per-column NaN counts, showing only columns that have any
print("\n1️⃣ MISSING VALUES:")
missing = titanic.isna().sum()
print(missing[missing > 0])

# 2. Duplicates -- number of rows that exactly repeat an earlier row
print("\n2️⃣ DUPLICATES:")
dupes = titanic.duplicated().sum()
print(f"Duplicate rows: {dupes}")

# 3. Data Types -- dtype of every column, to eyeball wrongly typed fields
print("\n3️⃣ DATA TYPES:")
print(titanic.dtypes)

# 4. Outliers (for numeric columns) -- summary statistics only; compare
#    min/max against the quartiles by eye, nothing is flagged automatically
print("\n4️⃣ POTENTIAL OUTLIERS:")
print(titanic[['age', 'fare']].describe())

# 5. Format Issues -- list distinct values to spot inconsistent spellings/casing
print("\n5️⃣ FORMAT ISSUES:")
print(f"Unique classes: {titanic['class'].unique()}")
print(f"Unique embark_town: {titanic['embark_town'].unique()}")

📋 QUICK DATA QUALITY CHECK - Titanic Dataset

1️⃣ MISSING VALUES:
age            177
embarked         2
deck           688
embark_town      2
dtype: int64

2️⃣ DUPLICATES:
Duplicate rows: 107

3️⃣ DATA TYPES:
survived          int64
pclass            int64
sex                 str
age             float64
sibsp             int64
parch             int64
fare            float64
embarked            str
class          category
who                 str
adult_male         bool
deck           category
embark_town         str
alive               str
alone              bool
dtype: object

4️⃣ POTENTIAL OUTLIERS:
              age        fare
count  714.000000  891.000000
mean    29.699118   32.204208
std     14.526497   49.693429
min      0.420000    0.000000
25%     20.125000    7.910400
50%     28.000000   14.454200
75%     38.000000   31.000000
max     80.000000  512.329200

5️⃣ FORMAT ISSUES:
Unique classes: ['Third', 'First', 'Second']
Categories (3, str): ['First', 'Se