In [2]:
import pandas as pd
import numpy as np
import random

# --- reproducibility: seed NumPy's legacy global generator before any draw
np.random.seed(42)

# Synthetic "well-behaved" population.
# The four draws below must stay in this order (age, income, spending, city):
# they all consume the same seeded global stream, so reordering them would
# change every generated value.
n_clean = 190
city_pool = ['Delhi', 'Mumbai', 'Chennai', 'Bangalore', 'Hyderabad']

age = np.random.randint(18, 60, n_clean).tolist()
income = np.random.randint(20000, 120000, n_clean).tolist()
spending = np.random.randint(0, 100, n_clean).tolist()
cities = np.random.choice(city_pool, n_clean).tolist()

# Ten hand-written noisy rows appended after the clean ones: out-of-range
# numbers plus misspelled / placeholder city labels.
age.extend([5, 120, 200, -10, 999, 15, 65, 150, 300, 400])
income.extend([1000, 500000, 900000, -5000, 250000, 50, 800000, 5, 2000000, 0])
spending.extend([500, -50, 200, 999, 150, 300, 120, -10, 600, 400])
cities.extend(['Delhii', 'Mumbaai', '---', 'None', 'blr', 'xyz', 'Chnnai', 'HYD', 'bengluru', 'Hydrabad'])

# Assemble the frame; column order follows the order of the name list.
column_names = ['Age', 'Income', 'SpendingScore', 'City']
df = pd.DataFrame(dict(zip(column_names, [age, income, spending, cities])))

# Preview the first ten rows, then report the frame's (rows, columns) shape.
print(df.head(10))
print("\nTotal rows:", df.shape)


   Age  Income  SpendingScore       City
0   56   75680             15    Chennai
1   46   66717             13    Chennai
2   32  107092             75      Delhi
3   25   70859             86  Bangalore
4   38   46309             14  Bangalore
5   56  107455             91  Hyderabad
6   36   83734             97      Delhi
7   40   90467             65    Chennai
8   28   72662             31  Bangalore
9   28  118506             86      Delhi

Total rows: (200, 4)


In [3]:
# Detect outliers using IQR
def remove_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the conventional 1.5 * IQR rule; larger values keep more
        rows, ``0`` keeps only values between Q1 and Q3.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows with their original index
        labels. Rows whose ``column`` value is NaN fail the range test and
        are therefore dropped.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower = q1 - factor * spread
    upper = q3 + factor * spread
    # between() is inclusive on both ends by default, matching >= / <=.
    in_range = values.between(lower, upper)
    # Explicit copy: callers can add or overwrite columns on the result
    # without pandas emitting SettingWithCopyWarning.
    return df[in_range].copy()

# Clean each numeric column.
# The filters run one after another, so each column's IQR fences are computed
# on the rows that survived the previous column's filter.
# NOTE(review): this rebinds `df`, so re-running the cell filters the
# already-cleaned frame again rather than the raw one.
for col in ['Age', 'Income', 'SpendingScore']:
    df = remove_outliers(df, col)

print("After outlier removal:", len(df), "rows remaining\n")

# Clean city names (simple normalization: trim whitespace, Title Case).
# .assign builds a new frame instead of writing into the filtered result
# above, which avoids pandas' SettingWithCopyWarning.
df = df.assign(City=df['City'].str.strip().str.title())

# Keep only known valid cities. Labels that are still not an exact match after
# normalization (misspellings, abbreviations, placeholders) are dropped, not
# corrected. The trailing .copy() lets later cells add columns without warnings.
valid_cities = ['Delhi', 'Mumbai', 'Chennai', 'Bangalore', 'Hyderabad']
df = df[df['City'].isin(valid_cities)].copy()

print("After cleaning categorical noise:", len(df), "rows remaining\n")

print(df.sample(10))


After outlier removal: 190 rows remaining

After cleaning categorical noise: 190 rows remaining

     Age  Income  SpendingScore       City
164   23   24014             23      Delhi
177   29   25801             11  Hyderabad
183   19   28716             29  Hyderabad
64    31   92694             88  Hyderabad
142   49   83208              1     Mumbai
115   54   23436              1  Bangalore
138   32   56212             12  Hyderabad
121   31   69377             11     Mumbai
93    50  114179             84  Hyderabad
174   49   47751             61    Chennai
