In [27]:
# Step 1 load the libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


In [28]:

# Step 2: Load the dataset

# Sample dataset
data = {
    "ID": [101, 102, 103, 104, 105, 105, 106],
    "Name": ["Alice", "Bob", "Charlie", "David", np.nan, "Eve", "Frank"],
    "Age": [25, np.nan, 35, 40, 30, 30, 29],
    "Salary": [50000, 60000, np.nan, 75000, 80000, 80000, 72000],
    "Department": ["HR", "IT", "IT", "Finance", "HR", "HR", "IT"]
}

df = pd.DataFrame(data)

In [35]:

# Step 3: Handle missing values

# Impute missing numerical values with mean
imputer = SimpleImputer(strategy='mean')
df["Age"] = imputer.fit_transform(df[["Age"]])
df["Salary"] = imputer.fit_transform(df[["Salary"]])  # Fixed column name capitalization

# Filling missing categorical values with mode
df.fillna({"Name" : df["Name"].mode()[0]}, inplace=True)


In [30]:

# Step 4: Remove Duplicates
df.drop_duplicates(inplace=True)


In [31]:
# Reset index after dropping duplicates (optional but recommended)
# df.reset_index(drop=True, inplace=True)

In [32]:

# Step 5: Fix Inconsistencies
# Convert Department names to lowercase to maintain consistency
df["Department"] = df["Department"].str.lower()

In [33]:

# Step 6: Standardize Data types
df["Age"] = df["Age"].astype(int)
df["Salary"] = df["Salary"].astype(int)

In [34]:


# Step 7: Display Cleaned Data
print(f"Cleaned Data:\n{df}")


Cleaned Data:
    ID     Name  Age  Salary Department
0  101    Alice   25   50000         hr
1  102      Bob   31   60000         it
2  103  Charlie   35   69500         it
3  104    David   40   75000    finance
4  105    Alice   30   80000         hr
5  105      Eve   30   80000         hr
6  106    Frank   29   72000         it
