In [1]:
import pandas as pd
from IPython.display import display


data = {
    'Name': ['Alice', 'Bob', 'CHARLIE', 'david', 'Eve'],
    'age': [25, None, 35, 29, 22],
    'City': ['Lagos', 'Abuja', 'port harcourt', 'lagos', 'Enugu'],
    'Income($)': [50000,  None, 70000, 65000, None],
    'signup_date': ['2022/01/15', '15-02-2022', '03/10/2022', '2022.04.12', None],
    'IsActive': ['yes', 'no', 'y', 'true', '']
}
df = pd.DataFrame(data)
print(df)

      Name   age           City  Income($) signup_date IsActive
0    Alice  25.0          Lagos    50000.0  2022/01/15      yes
1      Bob   NaN          Abuja        NaN  15-02-2022       no
2  CHARLIE  35.0  port harcourt    70000.0  03/10/2022        y
3    david  29.0          lagos    65000.0  2022.04.12     true
4      Eve  22.0          Enugu        NaN        None         


In [2]:
df.columns = df.columns.str.strip().str.lower().str.replace('[$() ]', '', regex=True).str.replace(' ', '_')
print(df.columns)

Index(['name', 'age', 'city', 'income', 'signup_date', 'isactive'], dtype='object')


In [3]:
# Check where nulls exist
print(df.isnull().sum())

# Fill the missing income with mean
df['income'] = df['income'].fillna(df['income'].mean())

# Fill missing age with median
df['age'] = df['age'].fillna(df['age'].median())

# Drop rows with missing signup_date
df = df.dropna(subset=['signup_date'])


name           0
age            1
city           0
income         2
signup_date    1
isactive       0
dtype: int64


In [4]:
# Standardize city names (capitalize first letter of each word)
df['city'] = df['city'].str.title()

# Fix name capitalization
df['name'] = df['name'].str.title()

# Convert signup_date to datetime
df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')

# Fix IsActive values to Boolean
df['isactive'] = df['isactive'].str.lower().map({'yes': True, 'y': True, 'true': True, 'no': False, '': False})

In [5]:
# Add column: years since signup
from datetime import datetime
df['years_since_signup'] = datetime.now().year - df['signup_date'].dt.year

In [7]:
display(df)

Unnamed: 0,name,age,city,income,signup_date,isactive,years_since_signup
0,Alice,25.0,Lagos,50000.0,2022-01-15,True,3.0
1,Bob,27.0,Abuja,61666.666667,NaT,False,
2,Charlie,35.0,Port Harcourt,70000.0,NaT,True,
3,David,29.0,Lagos,65000.0,NaT,True,
