In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
print("\n" + "="*60)
print(" PRACTICE: CUSTOMER DATA CLEANING")
print("="*60)

# Create realistic customer data with missing values
np.random.seed(42)
customer_data = pd.DataFrame({
    'CustomerID': range(1, 301),
    'Age': [np.random.randint(18, 70) if np.random.random() > 0.08 else np.nan for _ in range(300)],
    'Income': [np.random.uniform(25000, 200000) if np.random.random() > 0.12 else np.nan for _ in range(300)],
    'CreditScore': [np.random.randint(300, 850) if np.random.random() > 0.10 else np.nan for _ in range(300)],
    'LoanAmount': [np.random.uniform(5000, 50000) if np.random.random() > 0.15 else np.nan for _ in range(300)],
    'EmploymentYears': [np.random.randint(0, 40) if np.random.random() > 0.05 else np.nan for _ in range(300)]
})

print(f"üìä Dataset: {customer_data.shape}")
print(f"\n‚ùå Missing Values:")
print(customer_data.isnull().sum())
print(f"\nPercentages:")
print((customer_data.isnull().sum() / len(customer_data) * 100).round(1))

# YOUR TASK: Clean this data!

# Step 1: Identify columns with <5% missing (can safely impute)


 PRACTICE: CUSTOMER DATA CLEANING
üìä Dataset: (300, 6)

‚ùå Missing Values:
CustomerID          0
Age                21
Income             30
CreditScore        31
LoanAmount         39
EmploymentYears    22
dtype: int64

Percentages:
CustomerID          0.0
Age                 7.0
Income             10.0
CreditScore        10.3
LoanAmount         13.0
EmploymentYears     7.3
dtype: float64


In [6]:
safe_to_impute = (customer_data.isnull().sum() / len(customer_data)) < 0.05
print(f'\n Safe to impute (<5% missing): {safe_to_impute[safe_to_impute].index.tolist()}')



 Safe to impute (<5% missing): ['CustomerID']


In [7]:
cleaned_data = customer_data.copy()

In [8]:
cleaned_data['Age'].fillna(cleaned_data['Age'].median(), inplace=True)
cleaned_data['Income'].fillna(cleaned_data['Income'].median(), inplace=True)
cleaned_data['CreditScore'].fillna(cleaned_data['CreditScore'].median(), inplace=True)

In [9]:
print(cleaned_data.isnull().sum())

CustomerID          0
Age                 0
Income              0
CreditScore         0
LoanAmount         39
EmploymentYears    22
dtype: int64


In [10]:
cleaned_data['HasLoan'] = cleaned_data['LoanAmount'].notna().astype(int)
cleaned_data['LoanAmount'].fillna(0,inplace=True)

In [11]:
cleaned_data

Unnamed: 0,CustomerID,Age,Income,CreditScore,LoanAmount,EmploymentYears,HasLoan
0,1,46.0,116542.122730,713.0,0.000000,22.0,0
1,2,25.0,146756.020572,577.0,13634.029612,21.0,1
2,3,56.0,134822.664015,573.0,15199.538100,10.0,1
3,4,40.0,182702.651836,338.0,8124.072924,26.0,1
4,5,43.0,104532.738909,577.0,8042.565362,19.0,1
...,...,...,...,...,...,...,...
295,296,62.0,119410.976166,758.0,20084.362132,33.0,1
296,297,64.0,80781.688013,601.0,37040.630915,7.0,1
297,298,38.0,180046.105941,517.0,9528.686534,17.0,1
298,299,54.0,65767.908082,587.0,11388.739539,5.0,1


In [12]:
cleaned_data['EmploymentYears'].fillna(cleaned_data['EmploymentYears'].median(), inplace=True)

In [13]:
print(f"\n‚úÖ After Cleaning:")
print(f"Missing values: {cleaned_data.isnull().sum().sum()}")
print(cleaned_data.describe())


‚úÖ After Cleaning:
Missing values: 0
       CustomerID         Age         Income  CreditScore    LoanAmount  \
count  300.000000  300.000000     300.000000   300.000000    300.000000   
mean   150.500000   42.783333  106830.564679   565.106667  23568.475621   
std     86.746758   14.402819   46787.551742   153.592026  15605.393399   
min      1.000000   18.000000   27275.336614   300.000000      0.000000   
25%     75.750000   31.750000   67708.441467   442.000000  10302.968420   
50%    150.500000   43.000000  104532.738909   577.000000  24318.078400   
75%    225.250000   53.000000  141388.441940   672.250000  37725.050457   
max    300.000000   69.000000  199411.446488   848.000000  49925.638013   

       EmploymentYears     HasLoan  
count       300.000000  300.000000  
mean         19.566667    0.870000  
std          11.011345    0.336865  
min           0.000000    0.000000  
25%          11.000000    1.000000  
50%          19.000000    1.000000  
75%          29.000000    

In [14]:
print("\nüí° Key Decision:")
print("   Created 'HasLoan' feature because LoanAmount missing")
print("   might mean 'no loan' rather than 'unknown'")
print("   This is FEATURE ENGINEERING from missing data!")


üí° Key Decision:
   Created 'HasLoan' feature because LoanAmount missing
   might mean 'no loan' rather than 'unknown'
   This is FEATURE ENGINEERING from missing data!


In [15]:
cleaned_data.shape

(300, 7)