# Data Cleaning and Preprocessing

In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv("C:\\Users\\USER\\Downloads\\sample_employee_data.csv")
data.head()

Unnamed: 0,Name,Age,Gender,Department,Salary,JoiningDate
0,Alice,25.0,F,HR,50000.0,2015-03-01
1,Bob,,M,Finance,60000.0,2018-07-15
2,Charlie,35.0,M,IT,80000.0,2016-08-21
3,David,45.0,M,IT,,2017-11-10
4,Eve,22.0,F,HR,45000.0,2019-01-20


In [4]:
data.shape

(10, 6)

In [6]:
data[data.isnull()].any()

Name           False
Age            False
Gender         False
Department     False
Salary         False
JoiningDate    False
dtype: bool

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         10 non-null     object 
 1   Age          8 non-null      float64
 2   Gender       10 non-null     object 
 3   Department   10 non-null     object 
 4   Salary       8 non-null      float64
 5   JoiningDate  10 non-null     object 
dtypes: float64(2), object(4)
memory usage: 612.0+ bytes


In [12]:
# Fill missing values with mean
data['Age'].fillna(data['Age'].mean(), inplace=True)

# Drop rows with missing values
data.dropna(subset=['Salary'], inplace=True)

# Check after cleaning
print(data.isnull().sum())

Name           0
Age            0
Gender         0
Department     0
Salary         0
JoiningDate    0
dtype: int64


 # Encoding Categorical Variables

In [16]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label Encoding
label_encoder= LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])

# One-Hot Encoding
encoded = pd.get_dummies(data['Department'], prefix='Dept')
data = pd.concat([data, encoded], axis=1)

data.head()

Unnamed: 0,Name,Age,Gender,Department,Salary,JoiningDate,Dept_Finance,Dept_HR,Dept_IT,Dept_Marketing,...,Dept_IT.1,Dept_Marketing.1,Dept_Finance.1,Dept_HR.1,Dept_IT.2,Dept_Marketing.2,Dept_Finance.2,Dept_HR.2,Dept_IT.3,Dept_Marketing.3
0,Alice,25.0,0,HR,50000.0,2015-03-01,False,True,False,False,...,False,False,False,True,False,False,False,True,False,False
1,Bob,33.25,1,Finance,60000.0,2018-07-15,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,Charlie,35.0,1,IT,80000.0,2016-08-21,False,False,True,False,...,True,False,False,False,True,False,False,False,True,False
4,Eve,22.0,0,HR,45000.0,2019-01-20,False,True,False,False,...,False,False,False,True,False,False,False,True,False,False
5,Frank,33.0,1,Marketing,70000.0,2015-05-30,False,False,False,True,...,False,True,False,False,False,True,False,False,False,True


# Feature Scaling and Normalization

In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardization
scaler = StandardScaler()
data[['Age', 'Salary']] = scaler.fit_transform(data[['Age', 'Salary']])

# Normalization
minmax = MinMaxScaler()
data[['Age', 'Salary']] = minmax.fit_transform(data[['Age', 'Salary']])

data.head()

Unnamed: 0,Name,Age,Gender,Department,Salary,JoiningDate,Dept_Finance,Dept_HR,Dept_IT,Dept_Marketing,...,Dept_IT.1,Dept_Marketing.1,Dept_Finance.1,Dept_HR.1,Dept_IT.2,Dept_Marketing.2,Dept_Finance.2,Dept_HR.2,Dept_IT.3,Dept_Marketing.3
0,Alice,0.166667,0,HR,0.142857,2015-03-01,False,True,False,False,...,False,False,False,True,False,False,False,True,False,False
1,Bob,0.625,1,Finance,0.428571,2018-07-15,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,Charlie,0.722222,1,IT,1.0,2016-08-21,False,False,True,False,...,True,False,False,False,True,False,False,False,True,False
4,Eve,0.0,0,HR,0.0,2019-01-20,False,True,False,False,...,False,False,False,True,False,False,False,True,False,False
5,Frank,0.611111,1,Marketing,0.714286,2015-05-30,False,False,False,True,...,False,True,False,False,False,True,False,False,False,True
