In [2]:
# Task 1: Data Cleaning and Preprocessing
# Dataset: Mall Customers

# 1. Import Libraries
import pandas as pd

# 2. Load Dataset
# Read the raw CSV from the current working directory.
df = pd.read_csv("Mall_Customers.csv")

# 3. Clean Column Names
# Trim leading/trailing whitespace from every header label.
df.columns = [label.strip() for label in df.columns]

print("Columns after cleaning:", list(df.columns))

# 4. Initial Exploration
# Print size, a preview, the dtype summary, and per-column null counts,
# then the number of fully duplicated rows.
print("\nShape:", df.shape)
print("\nFirst 5 rows:\n", df.head(5))
print("\nInfo:\n")
df.info()
print("\nMissing values:\n", df.isna().sum())
print("\nDuplicates:", df.duplicated().sum())

# 5. Handle Missing Values (none in this dataset, but example shown)
# df = df.ffill()  # or df.dropna(); fillna(method="ffill") is deprecated in pandas >= 2.1

# 6. Remove Duplicates
# Drop rows that are exact copies of an earlier row.
df = df.drop_duplicates()

# 7. Standardize Text Columns (Gender → lowercase)
# The raw file labels this column "Genre" rather than "Gender" (see the
# printed column list), so normalise the header first; otherwise the
# lowercase step and the rename below silently skip the column.
# Only rename when "Gender" is absent, to avoid creating a duplicate label.
if "Genre" in df.columns and "Gender" not in df.columns:
    df = df.rename(columns={"Genre": "Gender"})

if "Gender" in df.columns:
    df["Gender"] = df["Gender"].str.strip().str.lower()

# 8. Rename Columns to be Consistent
# Map the original headers to snake_case names. rename() ignores keys that
# are not present, so a missing header does not raise here.
df = df.rename(columns={
    "CustomerID": "customer_id",
    "Gender": "gender",
    "Age": "age",
    "Annual Income (k$)": "annual_income_k",
    "Spending Score (1-100)": "spending_score"
})

# 9. Check & Fix Data Types
# Cast each numeric field to int, one column at a time, in a fixed order.
for numeric_col in ("customer_id", "age", "annual_income_k", "spending_score"):
    df[numeric_col] = df[numeric_col].astype(int)

# 10. Save Cleaned Dataset
# Write the result next to the input; the row index is not written out.
df.to_csv("Mall_Customers_Cleaned.csv", index=False)

print("\n✅ Cleaning Done! Saved as 'Mall_Customers_Cleaned.csv'")


Columns after cleaning: ['CustomerID', 'Genre', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']

Shape: (200, 5)

First 5 rows:
    CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40

Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending 