In [5]:
import pandas as pd
import numpy as np

# Creating the DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Carlie', 'alice', 'Eve', 'Bob'],
    'Age': [28, np.nan, 35, 28, 40, np.nan],
    'salary': [50000, 60000, 55000, 50000, 70000, 100000],
    'department': ['HR', 'IT', 'FINANCE', 'HR', 'IT', 'FINANCE'],
}

df = pd.DataFrame(data)

# Store the original DataFrame for comparison
df_original = df.copy()

# Handling Missing Data: Fill missing Age with mean value (excluding 'Bob')
df['Age'] = df.apply(lambda row: df['Age'].mean() if pd.isna(row['Age']) and row['Name'] != 'Bob' else row['Age'], axis=1)

# Removing Duplicate Records
df.drop_duplicates(inplace=True)

# Handling Outliers: Cap Salary at 100,000 using np.clip
df['salary'] = np.clip(df['salary'], a_min=None, a_max=100000)

# Ensuring Consistent Formatting: Convert Employee Names to Uppercase
df['Name'] = df['Name'].str.upper()

# Display the original DataFrame
print("Original DataFrame:")
print(df_original)

# Display the cleaned DataFrame
print("\nCleaned DataFrame:")
print(df)


Original DataFrame:
     Name   Age  salary department
0   Alice  28.0   50000         HR
1     Bob   NaN   60000         IT
2  Carlie  35.0   55000    FINANCE
3   alice  28.0   50000         HR
4     Eve  40.0   70000         IT
5     Bob   NaN  100000    FINANCE

Cleaned DataFrame:
     Name   Age  salary department
0   ALICE  28.0   50000         HR
1     BOB   NaN   60000         IT
2  CARLIE  35.0   55000    FINANCE
3   ALICE  28.0   50000         HR
4     EVE  40.0   70000         IT
5     BOB   NaN  100000    FINANCE
