# Data Cleaning & Preprocessing

In [17]:
import pandas as pd
import numpy as np

# Sample dataset with missing values and duplicates
data = {
    'ID': [1, 2, 2, 3, 4, 5, np.nan, 6],
    'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'George'],
    'Age': [25, np.nan, 32, 28, 29, np.nan, 35, 40],
    'Salary': [50000, 60000, 60000, 70000, np.nan, 80000, 90000, np.nan]
}

df = pd.DataFrame(data)

print("Original DataFrame")
print(df)

# Handling missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

# Dropping duplicate rows
df.drop_duplicates(inplace=True)

# Dropping rows with missing 'ID'
df.dropna(subset=['ID'], inplace=True)

# Convert 'ID' to integer
df['ID'] = df['ID'].astype(int)

print("\n Cleaned DataFrame")
print(df)


Original DataFrame
    ID     Name   Age   Salary
0  1.0    Alice  25.0  50000.0
1  2.0      Bob   NaN  60000.0
2  2.0      Bob  32.0  60000.0
3  3.0  Charlie  28.0  70000.0
4  4.0    David  29.0      NaN
5  5.0     Emma   NaN  80000.0
6  NaN    Frank  35.0  90000.0
7  6.0   George  40.0      NaN

 Cleaned DataFrame
   ID     Name   Age        Salary
0   1    Alice  25.0  50000.000000
1   2      Bob  30.5  60000.000000
2   2      Bob  32.0  60000.000000
3   3  Charlie  28.0  70000.000000
4   4    David  29.0  68333.333333
5   5     Emma  30.5  80000.000000
7   6   George  40.0  68333.333333


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


to remove duplicated ID, you need to use 

df.drop_duplicates(subset=['ID'], inplace=True)

# Data Aggregation & Grouping

In [19]:
# Sample DataFrame
df['Department'] = np.random.choice(['HR', 'IT', 'Sales'], size=len(df))

# Group by 'Department' and compute mean salary & age
grouped = df.groupby('Department').agg({'Age': 'mean', 'Salary': 'mean'})

print("\n Average Age and Salary by Department:")
print(grouped)

print(df)


 Average Age and Salary by Department:
             Age        Salary
Department                    
HR          28.6  65666.666667
IT          40.0  68333.333333
Sales       32.0  60000.000000
   ID     Name   Age        Salary Department
0   1    Alice  25.0  50000.000000         HR
1   2      Bob  30.5  60000.000000         HR
2   2      Bob  32.0  60000.000000      Sales
3   3  Charlie  28.0  70000.000000         HR
4   4    David  29.0  68333.333333         HR
5   5     Emma  30.5  80000.000000         HR
7   6   George  40.0  68333.333333         IT
