In [1]:
import pandas as pd

# Toy records containing one exact duplicate row, one missing Age,
# and one Salary that is not a parseable number.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Alice'],
    Age=[25, None, 30, 25],
    Salary=['50000', '55000', 'sixty thousand', '50000'],
)

df = pd.DataFrame(data)

print("Original Data:\n", df)

# Cleaning pipeline. Every step receives the output of the previous one, so
# the Age mean and the Salary median are computed after duplicates are gone.
df = (
    df
    # 1. Drop rows that exactly repeat an earlier row.
    .drop_duplicates()
    # 2. Replace missing Age values with the mean of the remaining ages.
    .assign(Age=lambda frame: frame['Age'].fillna(frame['Age'].mean()))
    # 3. Parse Salary as numbers; unparseable entries turn into NaN.
    .assign(Salary=lambda frame: pd.to_numeric(frame['Salary'], errors='coerce'))
    # 4. Replace those NaN salaries with the median of the valid ones.
    .assign(Salary=lambda frame: frame['Salary'].fillna(frame['Salary'].median()))
)

print("\nCleaned Data:\n", df)


Original Data:
       Name   Age          Salary
0    Alice  25.0           50000
1      Bob   NaN           55000
2  Charlie  30.0  sixty thousand
3    Alice  25.0           50000

Cleaned Data:
       Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob  27.5  55000.0
2  Charlie  30.0  52500.0


In [2]:
# Keep a de-duplicated copy of df under a separate name.
# NOTE(review): df already had duplicate rows removed in the first cell, so this
# call looks redundant, and df_cleaned is not referenced by the later cells
# shown in this notebook — confirm whether it is still needed.
df_cleaned = df.drop_duplicates()


In [3]:
# Store Salary as integers instead of floats, now that the median fill in the
# first cell has replaced the NaN values produced by the numeric conversion.
# NOTE(review): astype(int) raises if any NaN is still present and discards any
# fractional part (e.g. a median ending in .5) — confirm that is acceptable
# before reusing this on other data.
df['Salary'] = df['Salary'].astype(int)


In [4]:
# Show the frame after the integer cast; Salary now prints without a trailing ".0".
print(df)

      Name   Age  Salary
0    Alice  25.0   50000
1      Bob  27.5   55000
2  Charlie  30.0   52500


In [5]:
# Tally null entries per column, then keep only the columns whose tally is
# non-zero. An empty Series means nothing is missing.
missing_counts = df.isna().sum().loc[lambda counts: counts > 0]
print(missing_counts)


Series([], dtype: int64)
