In [10]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/marketing_campaign.csv', sep='\t')

In [11]:
# Rename column headers
original_columns = df.columns
df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '', regex=True)

In [12]:
# Handle missing values
# 'Income' column has missing values; fill with its median.
if 'income' in df.columns and df['income'].isnull().any():
    income_median = df['income'].median()
    df['income'].fillna(income_median, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['income'].fillna(income_median, inplace=True)


In [13]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)

In [14]:
# Convert date formats and create 'age' column
# Convert 'dt_customer' to datetime objects
if 'dt_customer' in df.columns:
    df['dt_customer'] = pd.to_datetime(df['dt_customer'], format='%d-%m-%Y')

# Calculate 'Age' from 'year_birth' and drop 'year_birth'
if 'year_birth' in df.columns:
    current_year = 2025 # Assuming current year
    df['age'] = current_year - df['year_birth']
    df.drop('year_birth', axis=1, inplace=True)

In [15]:
# Standardize text values
# Standardize 'education' and 'marital_status' to title case
if 'education' in df.columns:
    df['education'] = df['education'].str.title()
if 'marital_status' in df.columns:
    df['marital_status'] = df['marital_status'].str.title()