In [12]:
import pandas as pd

### Load the data

In [13]:
# Load CSV into DataFrame
df = pd.read_csv("dirty_data.csv")

# Display the first few rows
df.head()

Unnamed: 0,ID,Name,Age,Salary,Joining_Date,Department,Email
0,101,Alice,25.0,50000,2021-07-15,HR,alice@example.com
1,102,BOB,,sixty thousand,06/12/2021,ENGINEERING,BOB@EXAMPLE.COM
2,103,Charlie,30.0,,10 May 2021,HR,charlie@example
3,104,,22.0,45000,15-08-2020,Sales,
4,105,Eve,28.0,52000,2021-09-01,Marketing,eve@example.com


### Inspect Data Issues

In [14]:
# Check for missing values
print(df.isnull().sum())

ID              0
Name            2
Age             3
Salary          3
Joining_Date    0
Department      0
Email           2
dtype: int64


In [15]:
# Check data types
print(df.dtypes)

ID                int64
Name             object
Age             float64
Salary           object
Joining_Date     object
Department       object
Email            object
dtype: object


In [17]:
# Check for duplicates
print(df.duplicated().sum())

0


In [18]:
# Display summary statistics
print(df.describe(include='all'))

               ID     Name         Age Salary Joining_Date Department  \
count    20.00000       18   17.000000     17           20         20   
unique        NaN       17         NaN     13           19         14   
top           NaN  Charlie         NaN  50000  10 May 2021         HR   
freq          NaN        2         NaN      3            2          2   
mean    109.65000      NaN   45.058824    NaN          NaN        NaN   
std       5.69649      NaN   38.653057    NaN          NaN        NaN   
min     101.00000      NaN   22.000000    NaN          NaN        NaN   
25%     104.75000      NaN   25.000000    NaN          NaN        NaN   
50%     109.50000      NaN   29.000000    NaN          NaN        NaN   
75%     114.25000      NaN   34.000000    NaN          NaN        NaN   
max     119.00000      NaN  150.000000    NaN          NaN        NaN   

                  Email  
count                18  
unique               17  
top     charlie@example  
freq               

### Handle missing values

In [19]:
# Fill missing Age with the mean age
df['Age'].fillna(df['Age'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


In [None]:
# Fill missing Salary with median salary
df['Salary'].fillna(df['Salary'].median(), inplace=True)

# Fill missing Name with "Unknown"
df['Name'].fillna("Unknown", inplace=True)

### Convert Data Types

In [None]:
# Convert Joining_Date to datetime format
df['Joining_Date'] = pd.to_datetime(df['Joining_Date'], errors='coerce')

# Convert Salary to integer type
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Convert Age to integer
df['Age'] = df['Age'].astype(int)

### Remove Duplicates

In [None]:
df.drop_duplicates(inplace=True)

### Standardize Text Formatting

In [None]:
# Trim spaces and convert names to title case
df['Name'] = df['Name'].str.strip().str.title()

# Convert Department to consistent lowercase format
df['Department'] = df['Department'].str.strip().str.lower()

# Convert Email to lowercase
df['Email'] = df['Email'].str.lower()

### Handle Outliers

In [None]:
# Remove rows where Salary is below 20,000 or above 200,000
df = df[(df['Salary'] >= 20000) & (df['Salary'] <= 200000)]

# Check for unrealistic ages (e.g., less than 18 or greater than 65)
df = df[(df['Age'] >= 18) & (df['Age'] <= 65)]

### Validate and Save Clean Data

In [None]:
# Check final dataset
print(df.info())
print(df.head())

# Save cleaned data
df.to_csv("cleaned_data.csv", index=False)