In [3]:
import pandas as pd
import numpy as np

# Sample dataset with multiple data issues
data = {
    'Name': [' Alice ', 'BOB', 'Charlie', 'bob', 'alice', 'Eve', 'Eve'],
    'Age': ['25', 'thirty', '35', '40', np.nan, '29', '29'],
    'Gender': ['F', 'M', 'Male', 'female', 'F', 'F', 'F'],
    'Country': ['usa', 'UK', 'India', 'INDIA', 'Usa', 'UK', 'UK'],
    'Salary': ['50000', '60000', '70000', '80000', '90000', '100000', '100000'],
    'JoinDate': ['2021-01-10', '2020/12/15', '15-11-2019', '2018-07-23', '2017-05-30', '2016-03-12', '2016-03-12'],
    'Comments': ['Good', 'Excellent', 'Average', 'Poor', 'Good', 'Excellent', 'Excellent']
}

# Create DataFrame
df = pd.DataFrame(data)

# 1. Trim whitespace from 'Name' column
df['Name'] = df['Name'].str.strip()

# 2. Standardize 'Name' to title case
df['Name'] = df['Name'].str.title()

# 3. Standardize 'Gender' entries
df['Gender'] = df['Gender'].str.lower().map({'f': 'Female', 'female': 'Female', 'm': 'Male', 'male': 'Male'})

# 4. Standardize 'Country' entries to uppercase
df['Country'] = df['Country'].str.upper()

# 5. Convert 'Age' to numeric, coerce errors to NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# 6. Handle missing 'Age' values by filling with median age
df['Age'].fillna(df['Age'].median(), inplace=True)

# 7. Convert 'Salary' to numeric
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# 8. Parse 'JoinDate' into datetime, handling different formats
df['JoinDate'] = pd.to_datetime(df['JoinDate'], errors='coerce', dayfirst=True)

# 9. Remove duplicate rows
df.drop_duplicates(inplace=True)

# 10. Detect and handle outliers in 'Salary' using IQR
Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['Salary'] >= lower_bound) & (df['Salary'] <= upper_bound)]

# 11. Final cleaned DataFrame
print("Cleaned DataFrame:")
print(df)

Cleaned DataFrame:
      Name   Age  Gender Country  Salary   JoinDate   Comments
0    Alice  25.0  Female     USA   50000 2021-10-01       Good
1      Bob  29.0    Male      UK   60000        NaT  Excellent
2  Charlie  35.0    Male   INDIA   70000        NaT    Average
3      Bob  40.0  Female   INDIA   80000        NaT       Poor
4    Alice  29.0  Female     USA   90000        NaT       Good
5      Eve  29.0  Female      UK  100000 2016-12-03  Excellent


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
