In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler

# Read the raw table from disk into a DataFrame.
file_path = "ass3.csv"
df = pd.read_csv(file_path)

# Partition the column labels by storage dtype. These two indexes are computed
# once, on the freshly loaded frame, and are reused by the later cells:
#   * numerical_cols   -> columns stored as int64 / float64
#   * categorical_cols -> columns stored as generic Python objects
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = df.select_dtypes(include="object").columns

In [2]:
# Replace every object-typed column with integer codes.
# One LabelEncoder is created per column and kept in `label_encoders`
# (keyed by column name) so the fitted encoders remain available afterwards.
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for col, le in label_encoders.items():
    # Fit on the column's current values and overwrite it with the codes.
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values

In [3]:
# Standardize the columns listed in `numerical_cols`, in place on `df`.
# The fitted scaler is kept in `scaler`; MinMaxScaler() is the alternative
# mentioned by the original author.
scaler = StandardScaler()
numeric_block = df[numerical_cols]
df[numerical_cols] = scaler.fit(numeric_block).transform(numeric_block)

In [4]:
# Derive a column as the element-wise difference Age - BusinessTravel and
# store it under the label 'Education'.
# NOTE(review): if 'Age' was a numeric column in the CSV it has already been
# standardized by the scaling cell, and if 'BusinessTravel' was an object
# column it now holds label-encoder codes -- confirm this difference is the
# intended feature.
# NOTE(review): if the CSV already contains an 'Education' column, this
# assignment overwrites it -- confirm that is intended.
df['Education'] = df['Age'].sub(df['BusinessTravel'])

In [5]:
# Reduce right-skew in the numeric columns with a log transform.
#
# The numeric columns were standardized by the scaling cell before this point,
# so every value at or below a column's mean is <= 0. The previous rule
# ("log1p(x) if x > 0 else 0") therefore overwrote all of those values -- and
# any NaN -- with 0, discarding the lower part of each skewed column.
# Instead, each selected column is shifted so its minimum is 0 and log1p is
# applied to the whole column. log1p is monotonic, so row ordering within a
# column is preserved, and NaN stays NaN rather than silently becoming 0.
#
# Only right-skewed columns (skewness > threshold) are transformed: a log
# compresses the right tail, so applying it to a left-skewed column would
# increase the skew rather than reduce it.
#
# The transformed columns are not re-standardized here.
SKEW_THRESHOLD = 0.5  # skewness above this marks a column as right-skewed

column_skew = df[numerical_cols].skew()
skewed_cols = column_skew[column_skew > SKEW_THRESHOLD].index

for col in skewed_cols:
    # min() skips NaN, so the shift is based on the observed values only.
    df[col] = np.log1p(df[col] - df[col].min())

In [7]:
# Write the processed frame to CSV in the working directory.
# index=False leaves the row index out of the file.
output_path = "processed_employee_attrition.csv"
df.to_csv(output_path, index=False)