In [2]:
# Loading Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the raw employee CSV into the notebook-wide DataFrame `df`;
# every later cell works on this variable.
df = pd.read_csv("/content/emp.csv")

# Preview the leading rows, then report the (rows, columns) dimensions.
head_rows = df.head()
print("Initial Data:\n", head_rows)
dims = df.shape
print("\nShape of data: ", dims)

Initial Data:
    employee_id         department     region         education gender  \
0        65438  Sales & Marketing   region_7  Master's & above      f   
1        65141         Operations  region_22        Bachelor's      m   
2         7513  Sales & Marketing  region_19        Bachelor's      m   
3         2542  Sales & Marketing  region_23        Bachelor's      m   
4        48945         Technology  region_26        Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   35                   5.0   
1               other                1   30                   5.0   
2            sourcing                1   34                   3.0   
3               other                2   39                   1.0   
4               other                1   45                   3.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  \
0                  8              1            0           

In [3]:
# Checking missing values
# Tally the null entries per column and print the resulting Series.
null_counts = df.isna().sum()
print("Missing Values: \n", null_counts)

Missing Values: 
 employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64


In [4]:
# Imputation
# Split the columns by dtype once, up front. `categorical_cols` is reused by
# the label-encoding cell, so both names are kept exactly as before.
numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(exclude=[np.number]).columns

# Numeric columns → fill with mean
# Columns without missing values are skipped, so they are not rewritten.
for col in numeric_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].mean())

# Categorical columns → fill with mode
for col in categorical_cols:
    if not df[col].isna().any():
        continue
    modes = df[col].mode()
    if modes.empty:
        # A column that is entirely missing has no mode; `mode()` returns an
        # empty Series and indexing it with [0] would raise. Leave the column
        # untouched and report it instead of crashing the cell.
        print(f"Skipping imputation for '{col}': no non-missing values to take a mode from")
        continue
    # On ties `mode()` returns several values in sorted order; take the first.
    df[col] = df[col].fillna(modes.iloc[0])

In [5]:
# Label Encoding for categorical columns
# One encoder is fitted per column and kept in `label_encoders` (keyed by the
# whitespace-stripped column name), so the integer codes can later be mapped
# back with `inverse_transform` or applied to new data. Refitting a single
# shared encoder would discard every mapping except the last column's.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col.strip()] = le

# Remove unnecessary spaces in column names
# NOTE: `categorical_cols` was computed before this rename, so it still holds
# the original (unstripped) names.
df.columns = df.columns.str.strip()

# Remove duplicates, then reset index after cleaning.
# Chained and reassigned instead of using inplace=True.
df = df.drop_duplicates().reset_index(drop=True)

In [6]:
# Final processed data
# Preview the cleaned frame before it is written out.
processed_preview = df.head()
print("Processed Data:\n")
print(processed_preview)

# Write the cleaned frame to CSV without the row index.
output_csv = "/content/sample_data/emp_processed.csv"
df.to_csv(output_csv, index=False)
print("\nProcessed file saved as emp_processed.csv")

Processed Data:

   employee_id  department  region  education  gender  recruitment_channel  \
0        65438           7      31          2       0                    2   
1        65141           4      14          0       1                    0   
2         7513           7      10          0       1                    2   
3         2542           7      15          0       1                    0   
4        48945           8      18          0       1                    0   

   no_of_trainings  age  previous_year_rating  length_of_service  \
0                1   35                   5.0                  8   
1                1   30                   5.0                  4   
2                1   34                   3.0                  7   
3                2   39                   1.0                 10   
4                1   45                   3.0                  2   

   KPIs_met >80%  awards_won?  avg_training_score  is_promoted  
0              1            0           