In [9]:
import pandas as pd
# step 1: load dataset
df = pd.read_csv ("Mall_Customers.csv")
print(df)

     CustomerID  Gender   Age  Annual Income (k$)  Spending Score (1-100)
0             1    Male  19.0                15.0                    39.0
1             2    Male  21.0                15.0                    81.0
2             3  Female  20.0                16.0                     6.0
3             4  Female  23.0                16.0                    77.0
4             5  Female  31.0                17.0                    40.0
..          ...     ...   ...                 ...                     ...
195         196  Female  35.0               120.0                    79.0
196         197  Female  45.0               126.0                    28.0
197         198    Male  32.0               126.0                    74.0
198         199    Male  32.0               137.0                    18.0
199         200    Male  30.0               137.0                    83.0

[200 rows x 5 columns]


In [11]:
df = pd.read_csv("Mall_Customers.csv")  
print("Initial Data Shape:", df.shape)

# Step 2: Check for missing values
print("\nMissing Values:\n", df.isnull().sum())

# Step 3: Remove duplicate rows
duplicates = df.duplicated().sum()
df = df.drop_duplicates()
print(f"\nRemoved {duplicates} duplicate rows.")
print("Data Shape after removing duplicates:", df.shape)

# Step 4: Rename columns to lowercase and replace spaces and hyphens with underscores
df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")
print("\nRenamed columns:", df.columns.tolist())

# Step 5: Clean text/categorical data
if 'gender' in df.columns:
    df['gender'] = df['gender'].str.upper()
else:
    print("Warning: 'gender' column not found after renaming.")

# Step 6: Validate age column
if 'age' in df.columns:
    invalid_ages = df[df['age'] < 0].shape[0]
    df = df[df['age'] >= 0]
    df['age'] = df['age'].astype(int)
    print(f"\nRemoved {invalid_ages} records with negative age.")
else:
    print("Warning: 'age' column not found.")

# Step 7: Check final data types
print("\nFinal Data Types:\n", df.dtypes)

# Step 8: Save cleaned dataset
df.to_csv("Cleaned_Mall_Customers.csv", index=False)
print("\nCleaned dataset saved as 'Cleaned_Mall_Customers.csv'")

Initial Data Shape: (200, 5)

Missing Values:
 CustomerID                0
Gender                    0
Age                       3
Annual Income (k$)        1
Spending Score (1-100)    1
dtype: int64

Removed 0 duplicate rows.
Data Shape after removing duplicates: (200, 5)

Renamed columns: ['customerid', 'gender', 'age', 'annual_income_(k$)', 'spending_score_(1_100)']

Removed 0 records with negative age.

Final Data Types:
 customerid                  int64
gender                     object
age                         int64
annual_income_(k$)        float64
spending_score_(1_100)    float64
dtype: object

Cleaned dataset saved as 'Cleaned_Mall_Customers.csv'
