# Employee Performance Analytics (NumPy)

This project analyzes employee productivity and performance using pure NumPy.
The dataset is synthetically generated to simulate real-world HR data.


In [2]:
import numpy as np

In [3]:
# Seed NumPy's global random generator so the synthetic data is reproducible.
np.random.seed(seed=42)

In [4]:
# Number of synthetic employee records to generate.
n_employees = 200

In [5]:
# Sequential employee IDs: 1001, 1002, ... (one per employee).
employee_id = np.arange(n_employees) + 1001

In [6]:
# Sanity check: the ID array should be 1-D with one entry per employee.
employee_id.shape

(200,)

In [7]:
# Random department per employee; `high` is exclusive, so IDs are 1..4.
department_id = np.random.randint(low=1, high=5, size=n_employees)

In [8]:
# Hours worked per employee; `high` is exclusive, so values span 120..239.
hours_worked = np.random.randint(low=120, high=240, size=n_employees)

In [9]:
# Tasks completed per employee; `high` is exclusive, so values span 30..150.
tasks_completed = np.random.randint(low=30, high=151, size=n_employees)

In [10]:
# Quality score per employee, drawn uniformly from [50, 100).
quality_score = np.random.uniform(low=50, high=100, size=n_employees)

In [11]:
# Salary per employee, drawn uniformly from [30000, 120000).
salary = np.random.uniform(low=30000, high=120000, size=n_employees)

In [12]:
# Absenteeism value per employee; `high` is exclusive, so values span 0..10.
absenteeism = np.random.randint(low=0, high=11, size=n_employees)

In [13]:
# Combine the per-employee arrays into one 2-D matrix, one row per employee.
# Integer and float inputs are promoted to a common float dtype, which is why
# IDs and department numbers print as floats further down.
employee_data = np.column_stack([
    employee_id,      # column 0
    department_id,    # column 1
    hours_worked,     # column 2
    tasks_completed,  # column 3
    quality_score,    # column 4
    salary,           # column 5
    absenteeism,      # column 6
])

In [14]:
# Confirm the matrix dimensions (rows, columns).
print("Dataset Shape:", employee_data.shape)

Dataset Shape: (200, 7)


In [15]:
# Preview the first five employee rows of the combined matrix.
print("First 5 Rows:\n", employee_data[:5])

First 5 Rows:
 [[1.00100000e+03 3.00000000e+00 1.82000000e+02 9.50000000e+01
  9.57606864e+01 7.32333074e+04 1.00000000e+00]
 [1.00200000e+03 4.00000000e+00 2.15000000e+02 8.30000000e+01
  7.21176115e+01 1.18675745e+05 4.00000000e+00]
 [1.00300000e+03 1.00000000e+00 2.22000000e+02 6.40000000e+01
  6.19893680e+01 6.39065073e+04 7.00000000e+00]
 [1.00400000e+03 3.00000000e+00 2.32000000e+02 1.09000000e+02
  5.46936645e+01 9.74620470e+04 1.00000000e+01]
 [1.00500000e+03 3.00000000e+00 1.71000000e+02 9.00000000e+01
  5.91432999e+01 6.53690504e+04 8.00000000e+00]]


In [16]:
# Pull hours (column 2) and tasks (column 3) back out of the combined matrix.
# NOTE: this rebinds both names to float columns of employee_data.
hours_worked = employee_data[:, 2]
tasks_completed = employee_data[:, 3]

In [17]:
# Productivity = tasks completed per hour worked (element-wise).
productivity_score = tasks_completed / hours_worked

In [18]:
# Spot-check the first few productivity values and the array shape.
print("Productivity Score (first 5):\n", productivity_score[:5])
print("Shape:", productivity_score.shape)
      

Productivity Score (first 5):
 [0.52197802 0.38604651 0.28828829 0.46982759 0.52631579]
Shape: (200,)


In [19]:
# Quality score is column 4 of the combined matrix.
quality_score = employee_data[:, 4]

In [20]:
# Rescale quality from a 0-100 scale to a 0-1 fraction.
quality_normalized = quality_score / 100

In [21]:
# Efficiency weights productivity by the normalized quality fraction.
efficiency_score = quality_normalized * productivity_score

In [22]:
# Spot-check the first few efficiency values and the array shape.
print("Efficiency Score (first 5):\n", efficiency_score[:5])
print("Shape:", efficiency_score.shape)

Efficiency Score (first 5):
 [0.49984974 0.27840752 0.17870809 0.25696592 0.31128053]
Shape: (200,)


In [23]:
# Absenteeism is column 6 of the combined matrix.
absenteeism = employee_data[:, 6]

In [24]:
# Scale absenteeism by 10 (the largest value the generator above can produce),
# giving a penalty between 0 and 1.
absenteeism_penalty = absenteeism / 10

In [25]:
# Performance index = efficiency minus the absenteeism penalty.
# The result can be negative when the penalty exceeds the efficiency score.
performance_index = efficiency_score - absenteeism_penalty

In [26]:
# Spot-check the first few performance index values and the array shape.
print("Performance Index (first 5):\n", performance_index[:5])
print("Shape:", performance_index.shape)

Performance Index (first 5):
 [ 0.39984974 -0.12159248 -0.52129191 -0.74303408 -0.48871947]
Shape: (200,)


In [27]:
# Cut-offs for the categories: the 20th percentile marks the at-risk boundary,
# the 80th percentile marks the top-performer boundary.
bottom_threshold = np.percentile(performance_index, 20)
top_threshold = np.percentile(performance_index, 80)

In [28]:
# Report both percentile cut-offs.
print("Top Performance Threshold:", top_threshold)
print("At-Risk Threshold:", bottom_threshold)

Top Performance Threshold: 0.20491997810877816
At-Risk Threshold: -0.4553838373974282


In [29]:
# Label each employee: 2 = at or above the top threshold, 0 = at or below the
# bottom threshold, 1 = everyone in between. The top check takes precedence.
is_top = performance_index >= top_threshold
is_at_risk = performance_index <= bottom_threshold
performance_category = np.where(is_top, 2, np.where(is_at_risk, 0, 1))

In [30]:
# Preview the first twenty category labels and the array shape.
print("First 20 categories:\n", performance_category[:20])
print("Shape:", performance_category.shape)

First 20 categories:
 [2 1 0 0 0 2 2 1 2 1 1 0 0 0 1 1 1 1 1 1]
Shape: (200,)


In [31]:
# Distinct category labels (sorted) and how many employees fall in each.
categories, counts = np.unique(performance_category, return_counts=True)

In [32]:
# One line per category with its head count.
for cat, count in zip(categories, counts):
    print(f"Category {cat}: {count} employees")

Category 0: 40 employees
Category 1: 120 employees
Category 2: 40 employees


In [33]:
# Department is column 1 of the combined matrix (stored there as floats).
department_id = employee_data[:, 1]

In [34]:
# Sorted list of the distinct department IDs present in the data.
departments = np.unique(department_id)

In [35]:
# Show which departments exist and how many there are.
print("Departments:", departments)
print("Number of departments:", len(departments))

Departments: [1. 2. 3. 4.]
Number of departments: 4


In [36]:
# Mean performance index per department, in the same order as `departments`.
# Each department's employees are selected with a boolean mask on department_id.
dept_avg_performance = np.array([
    performance_index[department_id == d].mean()
    for d in departments
])

In [37]:
# Report each department's average, casting the float ID back to int for display.
for d, avg in zip(departments, dept_avg_performance):
    print(f"Department {int(d)} -> Avg Performance Index: {avg:.3f}")

Department 1 -> Avg Performance Index: -0.099
Department 2 -> Avg Performance Index: -0.167
Department 3 -> Avg Performance Index: -0.176
Department 4 -> Avg Performance Index: -0.097


In [38]:
# Position of the highest department average (first one wins on ties).
best_dept_index = np.argmax(dept_avg_performance)

In [39]:
# Look up the winning department's average and its ID by position.
best_score = dept_avg_performance[best_dept_index]
best_department = departments[best_dept_index]

In [40]:
# Department IDs are stored as floats, so cast to int for display.
print(f"Best Performing Department: {int(best_department)}")

Best Performing Department: 4


In [41]:
# Average performance index of the best department, to three decimals.
print(f"Average Performance Index: {best_score:.3f}")

Average Performance Index: -0.097


In [42]:
# Start with an empty list of per-department top-performer counts.
dept_top_performers = list()

In [43]:
# Count top performers (category 2) in each department, in the same order as
# `departments`.
# The counts are built in one self-contained expression instead of being
# appended to a list created in another cell. The old loop ended by rebinding
# the name to an ndarray, so re-running this cell on its own raised
# AttributeError (ndarray has no `append`).
dept_top_performers = np.array([
    np.sum((department_id == d) & (performance_category == 2))
    for d in departments
])

In [44]:
# Report the number of top performers in each department.
for d, cnt in zip(departments, dept_top_performers):
    print(f"Department {int(d)}-> Top Performers:{cnt}")

Department 1-> Top Performers:10
Department 2-> Top Performers:8
Department 3-> Top Performers:12
Department 4-> Top Performers:10


## Summary

- The dataset contains 200 employees across 4 departments.
- Employees were categorized using a percentile-based performance index:
  - 20% Top Performers
  - 60% Average Performers
  - 20% At-Risk Employees
- Department 4 shows the highest average performance index overall.
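- Department 3 has the highest number of top performers (12) even though it has the lowest average performance index (-0.176), which suggests strong individual performers in a department whose average is otherwise low. These are raw counts and are not adjusted for department size.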
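- Percentile-based categorization ranks employees relative to one another: the thresholds are the 20th and 80th percentiles of the performance index itself, so the split stays roughly 20/60/20 however the index values are distributed.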
