✅ TASK 1: LOAD & EXPLORE DATA

1️⃣ Load CSV using pd.read_csv()

2️⃣ Show first 5 rows

3️⃣ Show last 5 rows

4️⃣ Display:

    Shape

    Column names

    Data types

5️⃣ Summary statistics

6️⃣ Count null values per column

✅ TASK 2: DATA CLEANING

1️⃣ Fill missing numeric values → mean

2️⃣ Fill missing categorical values → mode

3️⃣ Remove duplicates

4️⃣ Strip whitespace from Name & Department

5️⃣ Convert:

    Gender, Department → category type

6️⃣ Verify cleaned dataset

✅ TASK 3: FILTERING & SORTING

1️⃣ Employees with Salary > 1,00,000

2️⃣ Employees younger than 30

3️⃣ Employees with Experience > 5 years

4️⃣ Female employees in Finance

5️⃣ Sort employees by Salary (descending)

6️⃣ Sort by Department, then by Salary within each department

✅ TASK 4: NEW DERIVED COLUMNS

1️⃣ Salary_per_Year_Exp = Salary / Experience_Years

2️⃣ Seniority column:

    Senior → Experience > 10

    Mid → 5–10

    Junior → <5

3️⃣ Eligible_for_Promotion:

    Yes → Experience > 7 AND Salary < 120000

    No → otherwise

✅ TASK 5: GROUPING & AGGREGATION

1️⃣ Average salary per department

2️⃣ Highest salary per gender

3️⃣ Total salary budget per department

4️⃣ Department-wise average age

5️⃣ Count of employees by gender

✅ TASK 6: ADVANCED ANALYSIS

1️⃣ Highest paid employee (full row)

2️⃣ Lowest salary employee (full row)

3️⃣ Department with highest total salary

4️⃣ Youngest employee

5️⃣ Oldest employee

In [29]:
import pandas as pd
import numpy as np
# Read the employee CSV into the DataFrame `df` that the cells below operate on.
df = pd.read_csv("employees.csv")


In [None]:
# ✅ TASK 1: LOAD & EXPLORE DATA

# 1️⃣ Load the employee records from the CSV file
df = pd.read_csv("employees.csv")

# 2️⃣ Preview the first five rows
first_rows = df.head(5)
print(first_rows)

# 3️⃣ Preview the last five rows
last_rows = df.tail(5)
print(last_rows)

# 4️⃣ Structural overview, printed in order: shape, column names, data types
for overview in (df.shape, df.columns, df.dtypes):
    print(overview)

# 5️⃣ Summary statistics
summary = df.describe()
print(summary)

# 6️⃣ Number of null values in each column
null_counts = df.isna().sum()
print(null_counts)


In [None]:
# ✅ TASK 2: DATA CLEANING

# 4️⃣ Strip whitespace from Name & Department
# Done first on purpose: values such as " HR" and "HR" must compare equal
# before the mode is computed and before duplicates are detected, otherwise
# padded variants skew the mode and survive drop_duplicates().
df["Name"] = df["Name"].str.strip()
df["Department"] = df["Department"].str.strip()

# 1️⃣ Fill missing numeric values → mean (column-wise)
df = df.fillna(df.mean(numeric_only=True))

# 2️⃣ Fill missing categorical values → mode
cat_cols = df.select_dtypes(include="object").columns

for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when the whole column is null; skip instead of crashing.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# 3️⃣ Remove duplicates
df = df.drop_duplicates()

# 5️⃣ Convert:
#     Gender, Department → category type
df["Gender"] = df["Gender"].astype("category")
df["Department"] = df["Department"].astype("category")

# 6️⃣ Verify cleaned dataset
# Show the data, then the checks that confirm the cleaning worked:
# remaining nulls per column, remaining duplicate rows, and final dtypes.
print(df)
print(df.isnull().sum())
print(df.duplicated().sum())
print(df.dtypes)



In [None]:
# ✅ TASK 3: FILTERING & SORTING

# 1️⃣ Employees with Salary > 1,00,000 (i.e. 100000)
emps = df[df["Salary"] > 100000]

# 2️⃣ Employees younger than 30
emp_yng = df[df["Age"] < 30]

# 3️⃣ Employees with Experience > 5 years
exp_emp = df[df["Experience_Years"] > 5]

# 4️⃣ Female employees in Finance
# Both conditions are required; filtering on Gender alone returned women
# from every department.
# NOTE(review): assumes Gender is coded "F" and the department is spelled
# "Finance" in the CSV — confirm against the data.
female_emp = df[(df["Gender"] == "F") & (df["Department"] == "Finance")]

# 5️⃣ Sort employees by Salary (descending)
sorted_sal_emp = df.sort_values(by="Salary", ascending=False)

# 6️⃣ Sort by Department → Salary
# A two-key row sort (Department first, Salary within each department),
# not a per-department aggregate.
sorted_dep_sal = df.sort_values(by=["Department", "Salary"])





In [None]:
# ✅ TASK 4: NEW DERIVED COLUMNS

# 1️⃣ Salary_per_Year_Exp = Salary / Experience_Years
# Zero experience is mapped to NaN first so the ratio is NaN rather than
# inf, which would otherwise distort any later mean/max on this column.
df["Salary_per_Year_Exp"] = df["Salary"] / df["Experience_Years"].replace(0, np.nan)

# 2️⃣ Seniority column:
#     Senior → Experience > 10
#     Mid → 5–10
#     Junior → <5
# Bins are left-closed (right=False). The Mid/Senior edge is the next float
# above 10 so that exactly 10 years is still "Mid" and only values strictly
# greater than 10 become "Senior".
df["Seniority"] = pd.cut(
    df["Experience_Years"],
    bins=[0, 5, np.nextafter(10, np.inf), np.inf],
    labels=["Junior", "Mid", "Senior"],
    right=False,
)

# 3️⃣ Eligible_for_Promotion:
#     Yes → Experience > 7 AND Salary < 120000
#     No → otherwise
df["Eligible_for_Promotion"] = np.where(
    (df["Experience_Years"] > 7) & (df["Salary"] < 120000),
    "Yes",
    "No",
)




In [None]:

# ✅ TASK 5: GROUPING & AGGREGATION

# Department and Gender are categorical columns, so `observed` is passed
# explicitly and identically on every groupby: this avoids the pandas
# FutureWarning about the changing default and keeps all results consistent.

# 1️⃣ Average salary per department
avg_sal_per_dep = df.groupby("Department", observed=False)["Salary"].mean()

# 2️⃣ Highest salary per gender
highest_sal_per_gen = df.groupby("Gender", observed=False)["Salary"].max()

# 3️⃣ Total salary budget per department
total_sal_per_dep = df.groupby("Department", observed=False)["Salary"].sum()

# 4️⃣ Department-wise average age
dep_wise_avg_age = df.groupby("Department", observed=False)["Age"].mean()

# 5️⃣ Count of employees by gender (non-null Emp_ID values per gender)
count = df.groupby("Gender", observed=False)["Emp_ID"].count()

print(count)


In [None]:
# ✅ TASK 6: ADVANCED ANALYSIS

# 1️⃣ Highest paid employee (full row)
highest_paid_emp = df.nlargest(1, "Salary")

# 2️⃣ Lowest salary employee (full row)
lowest_paid_emp = df.nsmallest(1, "Salary")

# 3️⃣ Department with highest total salary
# `observed` is explicit because Department is categorical (avoids the
# pandas FutureWarning about the changing default).
dep_with_highest_total_sal = (
    df.groupby("Department", observed=False)["Salary"].sum().nlargest(1)
)

# 4️⃣ Youngest employee
# Select the full row, as for the salary lookups above; taking only the Age
# column would return the age value without identifying the employee.
yng_emp = df.nsmallest(1, "Age")

# 5️⃣ Oldest employee (full row)
old_emp = df.nlargest(1, "Age")

