In [1]:
import pandas as pd
import numpy as np

# Create a sample dirty dataset
data = {
    "CustomerID": [101, 102, 103, 104, 105, 105, None],
    "Name": ["John", "ALICE", "bob", "Charlie", "ALICE", "ALICE", "Eve"],
    "Age": [25, None, 35, -1, 29, 29, 40],
    "Email": ["john@gmail.com", "alice@GMAIL.com", "bob@", "charlie@yahoo.com", None, "alice@GMAIL.com", "eve@gmail.com"],
    "Purchase": ["1000", "2000", "1500", "2500", "3000", "3000", "4000"]
}

df = pd.DataFrame(data)
print(" Original Dirty Data:")
print(df)

# Handle Missing Values
df["CustomerID"] = df["CustomerID"].fillna(df["CustomerID"].mode()[0])  # Fill missing ID with mode
df["Age"] = df["Age"].replace(-1, np.nan)  # Replace invalid ages
df["Age"] = df["Age"].fillna(df["Age"].mean())  # Fill missing with mean
df["Email"] = df["Email"].fillna("unknown@gmail.com")

# Remove Duplicates
df = df.drop_duplicates()

#  Standardize text formats
df["Name"] = df["Name"].str.title()  # Convert names to Proper Case
df["Email"] = df["Email"].str.lower()  # Make all emails lowercase

# Convert Data Types
df["Purchase"] = df["Purchase"].astype(int)

#  Validate Emails (very basic check)
df = df[df["Email"].str.contains("@")]

print("\n✅ Cleaned Data:")
print(df)


 Original Dirty Data:
   CustomerID     Name   Age              Email Purchase
0       101.0     John  25.0     john@gmail.com     1000
1       102.0    ALICE   NaN    alice@GMAIL.com     2000
2       103.0      bob  35.0               bob@     1500
3       104.0  Charlie  -1.0  charlie@yahoo.com     2500
4       105.0    ALICE  29.0               None     3000
5       105.0    ALICE  29.0    alice@GMAIL.com     3000
6         NaN      Eve  40.0      eve@gmail.com     4000

✅ Cleaned Data:
   CustomerID     Name   Age              Email  Purchase
0       101.0     John  25.0     john@gmail.com      1000
1       102.0    Alice  31.6    alice@gmail.com      2000
2       103.0      Bob  35.0               bob@      1500
3       104.0  Charlie  31.6  charlie@yahoo.com      2500
4       105.0    Alice  29.0  unknown@gmail.com      3000
5       105.0    Alice  29.0    alice@gmail.com      3000
6       105.0      Eve  40.0      eve@gmail.com      4000
