Step 1: Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
# NOTE: this is an absolute, machine-specific location; change DATA_PATH to
# point at your own copy of the CSV before running the notebook elsewhere.
DATA_PATH = "C:/Users/devpa/Desktop/synthetic_employee_performance.csv"
df = pd.read_csv(DATA_PATH)

# Inspect structure
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe(include='all'))

In [None]:
# Handle missing values
# Numerical columns: fill the gaps in each column with that column's own median
num_cols = ['age', 'experience', 'salary', 'performance_score', 'training_hours', 'leave_days']
# median() yields one value per column; fillna() aligns them by column label
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

In [None]:
# Categorical columns: Impute with mode
# NOTE: the mode is computed on the values as loaded; the title-casing of
# 'department' and 'gender' happens in a later cell, so differently-cased
# spellings are counted as distinct values here.
cat_cols = ['department', 'gender', 'promotion_eligibility']
for col in cat_cols:
    modes = df[col].mode()
    # mode() ignores missing values, so it is empty when the whole column is
    # missing; there is nothing to impute with in that case, so leave it as is
    # instead of failing on modes[0].
    if modes.empty:
        continue
    # On ties mode() returns every tied value in sorted order; take the first.
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Remove duplicates: rows sharing an employee_id collapse to the first one seen
key_columns = ['employee_id']
df = df.drop_duplicates(subset=key_columns, keep='first')

In [None]:
# Treat outliers using IQR for numerical columns
def treat_outliers(df, column, factor=1.5):
    """Cap the values of one column at its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. Values beyond
    a fence are replaced by that fence; values inside the fences and missing
    values are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str
        Name of the numeric column to cap.
    factor : float, default 1.5
        Multiple of the IQR that places the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``column`` has been capped.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Work on a copy so the caller's frame is never altered behind its back.
    capped = df.copy()
    capped[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return capped

# Cap each numerical column in turn; every call hands back the updated frame
for numeric_col in num_cols:
    df = treat_outliers(df, column=numeric_col)


In [None]:
# Standardize categorical values: title-case the department and gender labels
for label_col in ('department', 'gender'):
    df[label_col] = df[label_col].str.title()

Step 2: Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics
print(df.describe())

# Frequency distributions for categorical variables
for categorical in ('department', 'gender'):
    print(df[categorical].value_counts())

In [None]:
# Visualizations
# Histogram of salary with a kernel-density overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='salary', kde=True, ax=ax)
ax.set_title('Salary Distribution')
plt.show()

# Box plot of performance score.
# The series is passed as x= explicitly: the meaning of seaborn's first
# positional argument differs between releases, so a bare positional call
# gives a version-dependent orientation (and a warning on some versions).
fig, ax = plt.subplots()
sns.boxplot(x=df['performance_score'], ax=ax)
ax.set_title('Performance Score Distribution')
plt.show()

Bivariate Analysis

In [None]:
# Correlation matrix: pairwise correlations of the numerical columns
corr_matrix = df[num_cols].corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Scatter plot: Salary vs Performance Score, coloured by department
scatter_ax = sns.scatterplot(data=df, x='salary', y='performance_score', hue='department')
scatter_ax.set_title('Salary vs Performance Score')
plt.show()

In [None]:
# Box plot: Department vs Salary
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='department', y='salary', ax=ax)
ax.set_title('Salary Distribution by Department')
# Tilt the department names so long labels do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()

Multivariate Analysis

In [None]:
# Pair plot of the numerical columns, with KDE curves on the diagonal
sns.pairplot(df[num_cols], diag_kind='kde')
plt.show()

# Grouped comparison: Average performance by department and gender
# as_index=False keeps the grouping keys as ordinary columns for plotting
grouped = df.groupby(['department', 'gender'], as_index=False)['performance_score'].mean()
bar_ax = sns.barplot(data=grouped, x='department', y='performance_score', hue='gender')
bar_ax.set_title('Average Performance by Department and Gender')
bar_ax.tick_params(axis='x', labelrotation=45)
plt.show()

Key Findings
Data Cleaning Insights:

Missing values in age, performance_score, and gender were imputed (median for numerical columns, mode for categorical columns).

Duplicate employee records (e.g., EMP0129) were removed.

Outliers in salary and training_hours were capped using IQR.

EDA Highlights:

Salary: IT and Finance departments have higher median salaries.

Performance: Sales has the highest variance in performance scores.

Correlation: experience and salary show a moderate positive correlation ($r = 0.58$).

Gender: Female employees in HR have higher average performance scores than male employees in HR.