In [121]:
import pandas as pd
import numpy as np

Load the dataset

In [122]:
# Read the raw, tab-delimited customer file (read_table splits on tabs by default).
df = pd.read_table('Customer_Personality_Analysis.csv')

1. Handle missing values

In [123]:
# Treat a recorded income of 0 as missing, then fill every gap with the
# median of the remaining (non-missing) incomes.
income = df['Income'].replace(0, np.nan)
df['Income'] = income.fillna(income.median())

2. Remove duplicates

In [124]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): if the file carries a unique per-customer ID column, a
# full-row comparison can never match -- confirm whether duplicates should
# be detected while ignoring that column.
df = df[~df.duplicated(keep='first')]

3. Standardize text values

In [125]:
# Collapse the raw education labels into three broader levels.
education_mapping = {
    'Graduation': 'Graduate',
    **dict.fromkeys(('PhD', 'Master'), 'Postgraduate'),
    **dict.fromkeys(('2n Cycle', 'Basic'), 'Undergraduate'),
}
# Labels that are not keys of the mapping pass through unchanged.
df['Education'] = df['Education'].map(
    lambda label: education_mapping.get(label, label)
)

# Reduce the marital-status labels listed below to 'Partner' or 'Single'.
marital_status_mapping = {
    **dict.fromkeys(('Married', 'Together'), 'Partner'),
    **dict.fromkeys(('Divorced', 'Widow', 'Alone', 'Absurd', 'YOLO'), 'Single'),
}
# Same pass-through rule for labels without an entry in the mapping.
df['Marital_Status'] = df['Marital_Status'].map(
    lambda label: marital_status_mapping.get(label, label)
)


4. Convert date format

In [126]:
# Parse the Dt_Customer strings (day-month-year) into datetime values.
df = df.assign(
    Dt_Customer=lambda frame: pd.to_datetime(frame['Dt_Customer'], format='%d-%m-%Y')
)

5. Rename columns

In [127]:
# Lower-case every column label so later cells can use one naming style.
df = df.rename(columns=str.lower)


6. Create new features (BEFORE converting to categorical)

In [128]:
# Reference year for the age calculation (taken from the problem context).
current_year = 2025

# Spending columns that are added up row-wise into total_spent.
spend_columns = [
    'mntwines',
    'mntfruits',
    'mntmeatproducts',
    'mntfishproducts',
    'mntsweetproducts',
    'mntgoldprods',
]

# Derive the three features while kidhome/teenhome are still numeric.
df = df.assign(
    age=current_year - df['year_birth'],
    total_children=df['kidhome'] + df['teenhome'],
    total_spent=df[spend_columns].sum(axis=1),
)


7. Fix data types (convert to categorical AFTER creating new features)

In [129]:
# Cast the child-count columns to categoricals and year_birth to int in a
# single pass.
df = df.astype({
    'kidhome': 'category',
    'teenhome': 'category',
    'total_children': 'category',
    'year_birth': int,
})


8. Handle outliers

In [130]:
# Keep only rows whose birth year is strictly later than 1900; earlier or
# equal values are treated as invalid.
df = df.loc[df['year_birth'].gt(1900)]

9. Remove constant columns

In [131]:
# Drop the two columns this notebook treats as constant.
df = df.drop(['z_costcontact', 'z_revenue'], axis='columns')


10. Reset index

In [132]:
# Replace the gapped row labels left by the earlier filters with a fresh
# 0..n-1 index; the old labels are discarded.
df = df.set_axis(pd.RangeIndex(len(df)), axis='index')

Save cleaned data

In [133]:
# Write the cleaned frame to disk without the row index.
output_path = 'Cleaned_Customer_Personality_Analysis.csv'
df.to_csv(output_path, index=False)