In [112]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### Load the data

In [113]:
# Load CSV into DataFrame
df = pd.read_csv("dirty_data.csv")

# Display the first few rows
df.head()

Unnamed: 0,ID,Name,Age,Salary,Joining_Date,Department,Email
0,101,Alice,25.0,50000.0,2021-07-15,HR,alice@example.com
1,102,BOB,,60000.0,06/12/2021,ENGINEERING,BOB@EXAMPLE.COM
2,103,Charlie,30.0,,10 May 2021,HR,charlie@example
3,103,Charlie,30.0,,10 May 2021,HR,charlie@example
4,104,,22.0,45000.0,15-08-2020,Sales,


### Inspect Data Issues

In [114]:
# Check for missing values
print(df.isnull().sum())

ID              0
Name            2
Age             3
Salary          4
Joining_Date    0
Department      0
Email           2
dtype: int64


In [115]:
df[df.isnull().any(axis=1)]

Unnamed: 0,ID,Name,Age,Salary,Joining_Date,Department,Email
1,102,BOB,,60000.0,06/12/2021,ENGINEERING,BOB@EXAMPLE.COM
2,103,Charlie,30.0,,10 May 2021,HR,charlie@example
3,103,Charlie,30.0,,10 May 2021,HR,charlie@example
4,104,,22.0,45000.0,15-08-2020,Sales,
10,109,Ivy,120.0,,2022-10-11,legal,
12,110,John,,80000.0,2021-07-30,Engineering,john@EXAMPLE.COM
15,113,Mia,27.0,,"March 5, 2021",Product,MIA@EXAMPLE.COM
16,114,Nathan,,1000000.0,2023-08-25,Marketing,nathan@example.com
21,119,,22.0,30000.0,11-06-2022,Support,supPort@company.com


In [116]:
# Check data types
print(df.dtypes)

ID                int64
Name             object
Age             float64
Salary          float64
Joining_Date     object
Department       object
Email            object
dtype: object


In [117]:
# Check for duplicates
print(df.duplicated().sum())

2


In [118]:
# Display summary statistics
df.describe(include='all')

Unnamed: 0,ID,Name,Age,Salary,Joining_Date,Department,Email
count,22.0,20,19.0,18.0,22,22,20
unique,,17,,,19,14,17
top,,Charlie,,,10 May 2021,HR,charlie@example
freq,,3,,,3,3,3
mean,109.227273,,43.263158,146722.222222,,,
std,5.622504,,36.843275,257687.180524,,,
min,101.0,,22.0,13000.0,,,
25%,104.25,,25.5,45000.0,,,
50%,108.5,,29.0,50000.0,,,
75%,113.75,,32.5,58500.0,,,


### Handle missing values

In [119]:
# Fill missing Age with the mean age
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Fill missing Name with "Unknown"
df['Name'].fillna("Unknown", inplace=True)

# Fill missing Salary with median salary
df['Salary'].fillna(df['Salary'].median(), inplace=True)

# Drop rows with null emails
df.dropna(axis=0, subset=['Email'], how='any', inplace=True)

In [120]:
print(df.isnull().sum())

ID              0
Name            0
Age             0
Salary          0
Joining_Date    0
Department      0
Email           0
dtype: int64


### Convert Data Types

In [121]:
# Convert Joining_Date to datetime format
# df['Joining_Date'] = pd.to_datetime(df['Joining_Date'], errors='coerce')

# Convert Salary to integer type
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Convert Age to integer
df['Age'] = df['Age'].astype(int)

In [122]:
print(df.dtypes)

ID                int64
Name             object
Age               int64
Salary          float64
Joining_Date     object
Department       object
Email            object
dtype: object


### Remove Duplicates

In [123]:
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

0


### Standardize Text Formatting

In [124]:
# Trim spaces and convert names to title case
df['Name'] = df['Name'].str.strip().str.title()

# Convert Department to consistent lowercase format
df['Department'] = df['Department'].str.strip().str.lower()

# Convert Email to lowercase
df['Email'] = df['Email'].str.lower()

In [125]:
df.tail()

Unnamed: 0,ID,Name,Age,Salary,Joining_Date,Department,Email
17,115,Olivia,45,54000.0,01-14-2019,finance,olivia@example.com
18,116,Peter,99,348000.0,2024-02-28,it,peter@example.com
19,117,Quincy,31,45000.0,2021-07-04,operations,quincy@example.com
20,118,Riley,34,50000.0,2020-09-09,legal,riley@example.com
21,119,Unknown,22,30000.0,11-06-2022,support,support@company.com


### Handle Outliers

In [129]:
# Remove rows where Salary is below 20,000 or above 200,000
df = df[(df['Salary'] >= 20000) & (df['Salary'] <= 200000)]

# Check for unrealistic ages (e.g., less than 18 or greater than 65)
df = df[(df['Age'] >= 18) & (df['Age'] <= 65)]

In [134]:
print(df['Age'].max(), df['Age'].min())
print(df['Salary'].max(), df['Salary'].min())

45 22
80000.0 27000.0


### Validate and Save Clean Data

In [136]:
# Check final dataset
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 13 entries, 0 to 21
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            13 non-null     int64  
 1   Name          13 non-null     object 
 2   Age           13 non-null     int64  
 3   Salary        13 non-null     float64
 4   Joining_Date  13 non-null     object 
 5   Department    13 non-null     object 
 6   Email         13 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 832.0+ bytes
None


Unnamed: 0,ID,Name,Age,Salary,Joining_Date,Department,Email
0,101,Alice,25,50000.0,2021-07-15,hr,alice@example.com
1,102,Bob,43,60000.0,06/12/2021,engineering,bob@example.com
2,103,Charlie,30,50000.0,10 May 2021,hr,charlie@example
5,105,Eve,28,52000.0,2021-09-01,marketing,eve@example.com
7,107,George,26,47000.0,12-25-2020,it,george @ example . com


In [137]:
# Save cleaned data
df.to_csv("cleaned_data.csv", index=False)