In [8]:
import pandas as pd
import numpy as np

# Sample DataFrame for demonstration
data = {
    'Name': ['Alice', 'Bob', 'Charlie', np.nan, 'Alice'],
    'Age': [25, 30, 'Unknown', 22, 25],
    'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'bob@example.com', 'alice@example.com'],
    'DateOfBirth': ['1999-01-01', 'not available', '2000-02-29', '1998-12-31', np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# 1. Cleaning Data (General cleaning process)
print("\nStep 1: General Data Cleaning")

# Drop the 'Email' column
print("\nStep 2: Dropping 'Email' Column")
df.drop(columns=['Email'], inplace=True)

# 2. Cleaning Empty Cells
print("\nStep 3: Cleaning Empty Cells")
# Fill empty cells with placeholders
df['Name'].fillna('Unknown', inplace=True)
df['DateOfBirth'].fillna('1900-01-01', inplace=True)  # Default date

# 3. Cleaning Wrong Format
print("\nStep 4: Cleaning Wrong Format")
# Convert 'Age' column to numeric, coercing errors to NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Convert 'DateOfBirth' column to datetime, invalid parsing will be set as NaT
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce')

# 4. Cleaning Wrong Data
print("\nStep 5: Cleaning Wrong Data")
# Drop rows where 'Age' or 'DateOfBirth' are NaN after conversion
df.dropna(subset=['Age', 'DateOfBirth'], inplace=True)

# 5. Removing Duplicates
print("\nStep 6: Removing Duplicates")
df.drop_duplicates(inplace=True)

print("\nCleaned DataFrame:")
print(df)


Original DataFrame:
      Name      Age                Email    DateOfBirth
0    Alice       25    alice@example.com     1999-01-01
1      Bob       30      bob@example.com  not available
2  Charlie  Unknown  charlie@example.com     2000-02-29
3      NaN       22      bob@example.com     1998-12-31
4    Alice       25    alice@example.com            NaN

Step 1: General Data Cleaning

Step 2: Dropping 'Email' Column

Step 3: Cleaning Empty Cells

Step 4: Cleaning Wrong Format

Step 5: Cleaning Wrong Data

Step 6: Removing Duplicates

Cleaned DataFrame:
      Name   Age DateOfBirth
0    Alice  25.0  1999-01-01
3  Unknown  22.0  1998-12-31
4    Alice  25.0  1900-01-01


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Name'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DateOfBirth'].fillna('1900-01-01', inplace=True)  # Default date


In [None]:
 df.corr() 