In [3]:
import pandas as pd

# Adjust display settings for better output formatting
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent wrapping

# File locations, kept together so they can be changed in one place.
# NOTE(review): INPUT_CSV is an absolute, machine-specific path; consider a
# project-relative data directory so the notebook runs on other machines.
INPUT_CSV = r"C:\Users\ACER\Downloads\data1.csv"
OUTPUT_CSV = "cleaned_data1.csv"


def clean_data(raw_df):
    """Deduplicate by ID, drop rows without an Age, and median-fill Salary.

    The steps run in this order, and the median is computed only from the rows
    that survive the first two steps. The input frame is never modified; every
    step produces a new frame, so the caller's data stays intact and no
    chained-assignment (SettingWithCopy) warning can be triggered.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns "ID", "Age" and "Salary".

    Returns
    -------
    tuple
        ``(cleaned, duplicates_removed, rows_with_missing_age,
        rows_with_missing_salary, median_salary)`` where

        * ``cleaned`` is the resulting frame (original index labels kept),
        * ``duplicates_removed`` holds the rows dropped because their ID had
          already appeared earlier in the frame,
        * ``rows_with_missing_age`` holds the de-duplicated rows dropped for a
          missing Age,
        * ``rows_with_missing_salary`` holds the surviving rows as they looked
          before their missing Salary was filled,
        * ``median_salary`` is the fill value (NaN when no surviving row has a
          Salary, in which case nothing is filled).

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    missing_columns = [
        column for column in ("ID", "Age", "Salary") if column not in raw_df.columns
    ]
    if missing_columns:
        raise KeyError(f"Input data is missing required column(s): {missing_columns}")

    # Remove duplicate IDs, keeping the first occurrence
    duplicates_removed = raw_df[raw_df.duplicated(subset="ID", keep="first")]
    deduplicated = raw_df.drop_duplicates(subset="ID", keep="first")

    # Remove rows where Age is missing
    rows_with_missing_age = deduplicated[deduplicated["Age"].isna()]
    with_age = deduplicated.dropna(subset=["Age"])

    # Fill missing Salary values with the median salary of the remaining rows
    median_salary = with_age["Salary"].median()
    rows_with_missing_salary = with_age[with_age["Salary"].isna()]
    # assign() returns a new frame (column position unchanged) instead of
    # writing into a frame that was itself derived by filtering.
    cleaned = with_age.assign(Salary=with_age["Salary"].fillna(median_salary))

    return (
        cleaned,
        duplicates_removed,
        rows_with_missing_age,
        rows_with_missing_salary,
        median_salary,
    )


# In a notebook kernel __name__ is "__main__", so this section runs exactly as
# before; the guard only keeps the file I/O from firing if the code is imported.
if __name__ == "__main__":
    # Load the CSV file. clean_data() never mutates its input, so this frame
    # remains the untouched original for comparison.
    original_df = pd.read_csv(INPUT_CSV)

    (
        df,
        duplicates_removed,
        rows_with_missing_age,
        rows_with_missing_salary,
        median_salary,
    ) = clean_data(original_df)

    # Save the cleaned data to a new CSV file
    df.to_csv(OUTPUT_CSV, index=False)

    # Print removed and modified rows with proper formatting
    print("\nRows removed due to duplicate IDs:")
    print(duplicates_removed.to_string(index=False))

    print("\nRows removed due to missing Age:")
    print(rows_with_missing_age.to_string(index=False))

    print(f"\nRows where missing Salary was filled with median value ({median_salary}):")
    print(rows_with_missing_salary.to_string(index=False))

    print("\nData cleaning completed. Processed file saved as 'cleaned_data1.csv'.")


Rows removed due to duplicate IDs:
 ID        Name  Age Gender  Salary Department  Join_Date  Performance_Score
  3 Bob Johnson  NaN   Male 45000.0    Finance 10-01-2021              -78.0

Rows removed due to missing Age:
 ID          Name  Age Gender  Salary Department  Join_Date  Performance_Score
 10 Olivia Taylor  NaN Female 60000.0  Marketing 30-06-2021               77.0

Rows where missing Salary was filled with median value (59000.0):
 ID          Name  Age Gender  Salary Department  Join_Date  Performance_Score
  4   Alice Brown 29.0 Female     NaN         HR 05-03-2023               88.0
 16 Amelia Walker 30.0 Female     NaN         HR 05-01-2023               82.0
 34   Hannah Cook 30.0 Female     NaN  Marketing 10-04-2020               79.0

Data cleaning completed. Processed file saved as 'cleaned_data1.csv'.
