<a href="https://colab.research.google.com/github/001010/Predictive-AI-Lessons/blob/main/W1_Lesson_1_3_Messy_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Lesson demo: build a deliberately messy customer table, then print a short
# report of the problems it contains (missing values, inconsistent category
# spellings, income outliers, mixed dtypes).
print("="*60)
print(" REAL-WORLD MESSY DATA EXAMPLE")
print("="*60)

# Create messy customer data (like you'd get from a real database)
# Seed the legacy global RNG so every run rebuilds the same table.
np.random.seed(42)

# NOTE: the dict values are evaluated top to bottom, so the random draws happen
# in the order Age -> Income -> City -> Subscription -> SignupDate ->
# MonthlySpend. Reordering the columns would change the generated values.
messy_data = pd.DataFrame({
    'CustomerID': range(1, 101),
    # Every 10th customer ID (10, 20, ...) has no name.
    'Name': [f'Customer{i}' if i % 10 != 0 else None for i in range(1, 101)],  # Missing names
    # Every 15th row (index 0, 15, ...) is NaN; randint's upper bound is exclusive.
    'Age': [np.random.randint(18, 70) if i % 15 != 0 else np.nan for i in range(100)],  # Missing ages
    # 95 ordinary incomes followed by 5 hand-picked extreme values in the last rows.
    'Income': np.concatenate([
        np.random.uniform(20000, 150000, 95),
        [500000, 750000, 1000000, 2000000, 5000000]  # Outliers!
    ]),
    # Same city / plan spelled with different casing on purpose.
    'City': np.random.choice(['New York', 'new york', 'NEW YORK', 'LA', 'la', 'Chicago'], 100),  # Inconsistent
    'Subscription': np.random.choice(['Basic', 'Premium', 'Enterprise', 'basic', 'PREMIUM'], 100),  # Inconsistent
    # Dates are kept as plain strings (months 01-09, days 10-28), not datetimes.
    'SignupDate': [f'2024-0{np.random.randint(1,10)}-{np.random.randint(10,29)}' for _ in range(100)],
    # Every 12th row (index 0, 12, ...) is NaN.
    'MonthlySpend': [np.random.uniform(20, 500) if i % 12 != 0 else np.nan for i in range(100)]  # Missing values
})

# Emoji are spelled as \N{...} escapes so the source stays pure ASCII and cannot
# be garbled by an encoding round-trip.
print("\n\N{BAR CHART} Our Messy Dataset:")
print(messy_data.head(15))

print("\n\N{CROSS MARK} PROBLEMS IN THIS DATA:")
print("1. Missing Values:")
print(messy_data.isnull().sum())

print("\n2. Inconsistent Categories:")
print(f"   Cities: {messy_data['City'].unique()}")
print(f"   Subscriptions: {messy_data['Subscription'].unique()}")

print("\n3. Outliers in Income:")
print(f"   Min: ${messy_data['Income'].min():,.0f}")
print(f"   Max: ${messy_data['Income'].max():,.0f}")
print(f"   Mean: ${messy_data['Income'].mean():,.0f}")
print(f"   Median: ${messy_data['Income'].median():,.0f}")
print("   (Mean is way higher than median = outliers!)")

print("\n4. Different Data Types:")
print(messy_data.dtypes)

print("\n\N{ELECTRIC LIGHT BULB} This is what REAL data looks like!")
print("   Your job: Transform this into clean, ML-ready data")

 REAL-WORLD MESSY DATA EXAMPLE

📊 Our Messy Dataset:
    CustomerID        Name   Age         Income      City Subscription  \
0            1   Customer1   NaN  127735.876190  NEW YORK      PREMIUM   
1            2   Customer2  56.0   66377.932470  NEW YORK   Enterprise   
2            3   Customer3  69.0   56521.486259  New York   Enterprise   
3            4   Customer4  46.0   90550.490811   Chicago   Enterprise   
4            5   Customer5  32.0   38320.149247        la      Premium   
5            6   Customer6  60.0  124285.607498        LA      PREMIUM   
6            7   Customer7  25.0   29691.583678  new york        Basic   
7            8   Customer8  38.0  148295.301758   Chicago        basic   
8            9   Customer9  56.0  120391.820009   Chicago        Basic   
9           10        None  36.0   45833.038599  NEW YORK      PREMIUM   
10          11  Customer11  40.0   20717.875226  New York        basic   
11          12  Customer12  28.0  126009.985699  New Yor

In [2]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
messy_data

Unnamed: 0,CustomerID,Name,Age,Income,City,Subscription,SignupDate,MonthlySpend
0,1,Customer1,,1.277359e+05,NEW YORK,PREMIUM,2024-07-14,
1,2,Customer2,56.0,6.637793e+04,NEW YORK,Enterprise,2024-05-18,487.709508
2,3,Customer3,69.0,5.652149e+04,New York,Enterprise,2024-05-10,493.381157
3,4,Customer4,46.0,9.055049e+04,Chicago,Enterprise,2024-01-24,355.117623
4,5,Customer5,32.0,3.832015e+04,la,Premium,2024-02-25,277.326256
...,...,...,...,...,...,...,...,...
95,96,Customer96,38.0,5.000000e+05,Chicago,basic,2024-07-20,135.269897
96,97,Customer97,33.0,7.500000e+05,Chicago,Premium,2024-08-15,
97,98,Customer98,62.0,1.000000e+06,Chicago,Enterprise,2024-08-17,56.414397
98,99,Customer99,35.0,2.000000e+06,Chicago,Basic,2024-04-17,81.862267


In [3]:
# Overview table: one row per data-cleaning challenge covered in this lesson.
banner = "=" * 60
print("\n" + banner)
print(" THE 4 MAIN DATA CLEANING CHALLENGES")
print(banner)

# Each tuple is (problem, example, why it matters, solution).
challenge_rows = [
    ('Missing Values', 'Age = NaN',
     'ML algorithms cannot process NaN', 'Fill or remove'),
    ('Outliers', 'Income = $5,000,000',
     'Skews statistics and predictions', 'Detect and handle'),
    ('Categorical Data', 'Color = "Red"',
     'ML needs numbers, not text', 'Encode as numbers'),
    ('Different Scales', 'Age: 18-70, Income: $20K-$500K',
     'Some features dominate others', 'Scale to same range'),
]

problems = pd.DataFrame(
    challenge_rows,
    columns=['Problem', 'Example', 'Why It Matters', 'Solution'],
)

# Plain-text rendering without the index column.
summary_text = problems.to_string(index=False)
print(summary_text)
print("\nWe'll tackle each one systematically!")


 THE 4 MAIN DATA CLEANING CHALLENGES
         Problem                        Example                   Why It Matters            Solution
  Missing Values                      Age = NaN ML algorithms cannot process NaN      Fill or remove
        Outliers            Income = $5,000,000 Skews statistics and predictions   Detect and handle
Categorical Data                  Color = "Red"       ML needs numbers, not text   Encode as numbers
Different Scales Age: 18-70, Income: $20K-$500K    Some features dominate others Scale to same range

We'll tackle each one systematically!
