<a href="https://colab.research.google.com/github/E-SaiAnurath/ADM-LAB/blob/main/Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the dataset
data = pd.read_csv("employee_data.csv")

# Display the dataset before preprocessing
print("Dataset before preprocessing:")
print(data.head())

# Step 2: Handle missing values
# Impute missing values for numerical columns using mean or median
numerical_cols = ['Age', 'Salary', 'Job_Satisfaction']
for col in numerical_cols:
    data[col].fillna(data[col].mean(), inplace=True)  # Impute with mean

# Impute missing values for categorical columns using mode
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Step 3: Feature scaling
# Min-Max Scaling (for Age and Salary)
min_max_scaler = MinMaxScaler()
data[['Age_MinMax', 'Salary_MinMax']] = min_max_scaler.fit_transform(data[['Age', 'Salary']])

# Manual Min-Max Scaling
for col in ['Age', 'Salary']:
    data[f"{col}_MinMax_Manual"] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())

# Standardization (Z-score normalization) (for Job_Satisfaction, Work_Hours_Per_Week)
standard_scaler = StandardScaler()
data[['Job_Satisfaction_Z', 'Work_Hours_Z']] = standard_scaler.fit_transform(data[['Job_Satisfaction', 'Work_Hours_Per_Week']])

# Manual Standardization
for col in ['Job_Satisfaction', 'Work_Hours_Per_Week']:
    data[f"{col}_Z_Manual"] = (data[col] - data[col].mean()) / data[col].std()

# Display the dataset after preprocessing
print("\nDataset after preprocessing:")
print(data.head())

# Save the processed dataset to a new CSV file
data.to_csv("processed_employee_data.csv", index=False)
print("Processed dataset saved as 'processed_employee_data.csv'")

Dataset before preprocessing:
   Employee_ID   Age    Salary  Job_Satisfaction  Work_Hours_Per_Week
0         1001  50.0  108953.0               9.0                   36
1         1002  36.0   82995.0               8.0                   59
2         1003  29.0   70757.0               2.0                   30
3         1004  42.0   39692.0               1.0                   30
4         1005  40.0   75758.0               7.0                   54

Dataset after preprocessing:
   Employee_ID   Age    Salary  Job_Satisfaction  Work_Hours_Per_Week  \
0         1001  50.0  108953.0               9.0                   36   
1         1002  36.0   82995.0               8.0                   59   
2         1003  29.0   70757.0               2.0                   30   
3         1004  42.0   39692.0               1.0                   30   
4         1005  40.0   75758.0               7.0                   54   

   Age_MinMax  Salary_MinMax  Age_MinMax_Manual  Salary_MinMax_Manual  \
0    0.7

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)  # Impute with mean
