In [1]:
# (Optional) Install/upgrade core libs in Colab
# !pip -q install -U pandas

import pandas as pd
import numpy as np


## 0) Create stand‑in CSV files (so examples run)

If you already have the chapter datasets, you may skip this cell. Otherwise, it creates minimal CSVs that satisfy the columns referenced in the examples.

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np

# File 1: Human_Resources.csv
hr_path = Path('Human_Resources.csv')


# File 2: Case_Study_1 Solved_Employee Data.csv (with a "messy" first row)
case_path = Path('Case_Study_1 Solved_Employee Data.csv')


# File 3: ATTRITION DATA_Sheet1.csv
attr_path = Path('ATTRITION DATA_Sheet1.csv')


print('Stand-in files ready:', [p for p in [hr_path, case_path, attr_path] if p.exists()])


Stand-in files ready: [PosixPath('Human_Resources.csv'), PosixPath('Case_Study_1 Solved_Employee Data.csv'), PosixPath('ATTRITION DATA_Sheet1.csv')]


## 1) Series examples

In [4]:
import pandas as pd

# Create a Series of employee ages
employee_ages = pd.Series([25, 32, 45, 28, 38, 41, 29])
print('Employee ages:')
employee_ages


Employee ages:


Unnamed: 0,0
0,25
1,32
2,45
3,28
4,38
5,41
6,29


In [5]:
# Create a Series with custom index (employee names)
employee_names = ['أحمد', 'فاطمة', 'محمد', 'عائشة', 'خالد', 'زينب', 'عمر']
employee_ages_named = pd.Series([25, 32, 45, 28, 38, 41, 29], index=employee_names)
print('Employee ages (named):')
employee_ages_named


Employee ages (named):


Unnamed: 0,0
أحمد,25
فاطمة,32
محمد,45
عائشة,28
خالد,38
زينب,41
عمر,29


## 2) DataFrame from multiple Series

In [6]:
# Four Series, then combine into a DataFrame
employee_names_series = pd.Series(['عائشة حسن', 'محمد أحمد', 'فاطمة علي', 'أحمد محمد'])
ages = pd.Series([25, 32, 45, 28])
departments = pd.Series(['التسويق', 'الموارد البشرية', 'المالية', 'تقنية المعلومات'])
salaries = pd.Series([5000, 6500, 8000, 5500])

employee_df = pd.DataFrame([employee_names_series, ages, departments, salaries]).T
employee_df.columns = ['EmployeeName', 'Age', 'Department', 'Salary']
print('Employee table:')
employee_df


Employee table:


Unnamed: 0,EmployeeName,Age,Department,Salary
0,عائشة حسن,25,التسويق,5000
1,محمد أحمد,32,الموارد البشرية,6500
2,فاطمة علي,45,المالية,8000
3,أحمد محمد,28,تقنية المعلومات,5500


## 3) Load an HR dataset (CSV) and inspect

In [7]:
# Load HR dataset
hr_dataset = pd.read_csv('Human_Resources.csv')
hr_dataset.head()


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1.0,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2.0,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4.0,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5.0,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7.0,...,4,80,1,6,3,3,2,2,2,2


In [8]:
# Basic shape information
print('Number of rows:', hr_dataset.shape[0])
print('Number of columns:', hr_dataset.shape[1])


Number of rows: 1470
Number of columns: 35


In [11]:
import pprint as pp
pp.PrettyPrinter(width=80)

print('Column names:')
pp.pprint(hr_dataset.columns.tolist())

print('Dtypes:')
print(hr_dataset.dtypes)


Column names:
['Age',
 'Attrition',
 'BusinessTravel',
 'DailyRate',
 'Department',
 'DistanceFromHome',
 'Education',
 'EducationField',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'Over18',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']
Dtypes:
Age                           int64
Attrition                    object
BusinessTravel               object
DailyRate                     int64
Department                   object
DistanceFromHome              int64
Education                     int64
EducationField               object
EmployeeCount                 int64
E

## 4) Selecting columns and basic EDA

In [12]:
# Access the Age column (Series)
ages_series = hr_dataset['Age']
print('Ages (head):')
print(ages_series.head())
print('Type:', type(ages_series))


Ages (head):
0    41
1    49
2    37
3    33
4    27
Name: Age, dtype: int64
Type: <class 'pandas.core.series.Series'>


In [13]:
# Select multiple columns
basic_info = hr_dataset[['Age', 'Gender', 'Department', 'MonthlyIncome']]
print('Basic info (head):')
print(basic_info.head())


Basic info (head):
   Age  Gender              Department  MonthlyIncome
0   41  Female                   Sales         5993.0
1   49    Male  Research & Development            NaN
2   37    Male  Research & Development         2090.0
3   33  Female  Research & Development         2909.0
4   27    Male  Research & Development         3468.0


In [15]:
# Summary statistics and missing values
print('Describe (numeric):')
print(hr_dataset.describe())

print('Missing values per column:')
print(hr_dataset.isnull().sum())


Describe (numeric):
               Age    DailyRate  DistanceFromHome    Education  EmployeeCount  \
count  1470.000000  1470.000000       1470.000000  1470.000000         1470.0   
mean     36.923810   802.485714          9.192517     2.912925            1.0   
std       9.135373   403.509100          8.106864     1.024165            0.0   
min      18.000000   102.000000          1.000000     1.000000            1.0   
25%      30.000000   465.000000          2.000000     2.000000            1.0   
50%      36.000000   802.000000          7.000000     3.000000            1.0   
75%      43.000000  1157.000000         14.000000     4.000000            1.0   
max      60.000000  1499.000000         29.000000     5.000000            1.0   

       EmployeeNumber  EnvironmentSatisfaction   HourlyRate  JobInvolvement  \
count     1469.000000              1470.000000  1470.000000     1470.000000   
mean      1025.556161                 2.721769    65.891156        2.729932   
std        60

In [16]:
# Unique departments and counts
unique_departments = hr_dataset['Department'].unique()
print('Departments:')
for dept in unique_departments:
    print('-', dept)

dept_counts = hr_dataset['Department'].value_counts()
print('Employees per department:')
print(dept_counts)


Departments:
- Sales
- Research & Development
- nan
- Human Resources
Employees per department:
Department
Research & Development    960
Sales                     446
Human Resources            63
Name: count, dtype: int64


In [17]:
# Mean/min/max age
average_age = hr_dataset['Age'].mean()
min_age = hr_dataset['Age'].min()
max_age = hr_dataset['Age'].max()
print(f'Average age: {average_age:.1f}')
print(f'Min age: {min_age}')
print(f'Max age: {max_age}')


Average age: 36.9
Min age: 18
Max age: 60


In [18]:
# Average monthly income by department
salary_by_dept = hr_dataset.groupby('Department')['MonthlyIncome'].mean()
print('Average monthly income by department:')
for dept, salary in salary_by_dept.items():
    print(f'{dept}: {salary:.0f}')


Average monthly income by department:
Human Resources: 6655
Research & Development: 6290
Sales: 6956


In [19]:
# Duplicate check and missing values for important columns
duplicated_employees = hr_dataset['EmployeeNumber'].duplicated().sum()
print('Duplicated EmployeeNumber:', duplicated_employees)

important_columns = ['Age', 'Department', 'MonthlyIncome']
for col in important_columns:
    missing_count = hr_dataset[col].isnull().sum()
    print(f'Missing in {col}: {missing_count}')


Duplicated EmployeeNumber: 0
Missing in Age: 0
Missing in Department: 1
Missing in MonthlyIncome: 3


In [20]:
# Quick HR snapshot report
print('=== HR Snapshot ===')
print('Total employees:', len(hr_dataset))
print('Departments:', hr_dataset['Department'].nunique())
print(f"Average age: {hr_dataset['Age'].mean():.1f}")
print(f"Average monthly income: {hr_dataset['MonthlyIncome'].mean():.0f}")
print(f"% Male: {(hr_dataset['Gender'] == 'Male').mean() * 100:.1f}%")
print(f"% Female: {(hr_dataset['Gender'] == 'Female').mean() * 100:.1f}%")


=== HR Snapshot ===
Total employees: 1470
Departments: 3
Average age: 36.9
Average monthly income: 6505
% Male: 60.0%
% Female: 39.9%


## 5) Indexing and selection (iloc / loc) with a case dataset

In [31]:
import pandas as pd

# First read (as in the chapter narrative): likely messy headers
employee_data = pd.read_csv('Case_Study_1 Solved_Employee Data.csv')
print('First 5 rows (raw):')
print(employee_data.head())


First 5 rows (raw):
  Unnamed: 0            Unnamed: 1       Unnamed: 2 Unnamed: 3 Unnamed: 4  \
0       S.no         Employee Name  Employee Number      State        Zip   
1          1            Brown, Mia       1103024456         MA       1450   
2          2  LaRotonda, William         1106026572         MA       1460   
3          3      Steans, Tyrone         1302053333         MA       2703   
4          4       Howard, Estelle       1211050782         MA       2170   

  Unnamed: 5 Unnamed: 6   Unnamed: 7   Unnamed: 8           Unnamed: 9  \
0        Age        Sex  MaritalDesc  CitizenDesc         Date of Hire   
1         32     Female      Married   US Citizen  2020-07-02 00:00:00   
2         33       Male     Divorced   US Citizen  2020-07-08 00:00:00   
3         31       Male       Single   US Citizen  2020-07-14 00:00:00   
4         32     Female      Married   US Citizen  2020-07-15 00:00:00   

     Unnamed: 10               Unnamed: 11 Unnamed: 12         Unnamed: 

In [33]:
# Read again using skiprows=1 to use the first data row as header
employee_data = pd.read_csv('Case_Study_1 Solved_Employee Data.csv', skiprows=1)
print('First 5 rows (cleaner header via skiprows):')
print(employee_data.head())
print('Shape:', employee_data.shape)

clean_data = employee_data


First 5 rows (cleaner header via skiprows):
   S.no         Employee Name  Employee Number State   Zip  Age     Sex  \
0     1            Brown, Mia       1103024456    MA  1450   32  Female   
1     2  LaRotonda, William         1106026572    MA  1460   33    Male   
2     3      Steans, Tyrone         1302053333    MA  2703   31    Male   
3     4       Howard, Estelle       1211050782    MA  2170   32  Female   
4     5           Singh, Nan        1307059817    MA  2330   29  Female   

  MaritalDesc CitizenDesc         Date of Hire     Department  \
0     Married  US Citizen  2020-07-02 00:00:00  Admin Offices   
1    Divorced  US Citizen  2020-07-08 00:00:00  Admin Offices   
2      Single  US Citizen  2020-07-14 00:00:00  Admin Offices   
3     Married  US Citizen  2020-07-15 00:00:00  Admin Offices   
4      Single  US Citizen  2020-07-12 00:00:00  Admin Offices   

                   Position  Pay Rate        Manager Name  \
0              Accountant I     28.50  Brandon R. LeB

In [34]:
# iloc examples
first_employee = clean_data.iloc[0]
print('First employee (row 0):')
print(first_employee)

first_three_employees = clean_data.iloc[0:3]
print('First 3 employees:')
print(first_three_employees)

subset_data = clean_data.iloc[0:5, 0:6]
print('Subset (first 5 rows, first 6 cols):')
print(subset_data)


First employee (row 0):
S.no                                   1
Employee Name                 Brown, Mia
Employee Number               1103024456
State                                 MA
Zip                                 1450
Age                                   32
Sex                               Female
MaritalDesc                      Married
CitizenDesc                   US Citizen
Date of Hire         2020-07-02 00:00:00
Department                 Admin Offices
Position                    Accountant I
Pay Rate                            28.5
Manager Name          Brandon R. LeBlanc
Employee Source                 Internal
Performance Score            Fully Meets
Name: 0, dtype: object
First 3 employees:
   S.no         Employee Name  Employee Number State   Zip  Age     Sex  \
0     1            Brown, Mia       1103024456    MA  1450   32  Female   
1     2  LaRotonda, William         1106026572    MA  1460   33    Male   
2     3      Steans, Tyrone         1302053333    MA 

In [36]:
# اختيار موظفين محددين باستخدام فهارس الصفوف
specific_employees = clean_data.loc[0:2]
print("الموظفون من الفهرس 0 إلى 2:")
print(specific_employees[['Employee Name', 'Department', 'Position']])

# اختيار عمود محدد لجميع الموظفين
all_names = clean_data.loc[:, 'Employee Name']
print("\nأسماء جميع الموظفين:")
print(all_names.head(10))



الموظفون من الفهرس 0 إلى 2:
          Employee Name     Department      Position
0            Brown, Mia  Admin Offices  Accountant I
1  LaRotonda, William    Admin Offices  Accountant I
2      Steans, Tyrone    Admin Offices  Accountant I

أسماء جميع الموظفين:
0              Brown, Mia
1    LaRotonda, William  
2        Steans, Tyrone  
3         Howard, Estelle
4             Singh, Nan 
5        Smith, Leigh Ann
6     LeBlanc, Brandon  R
7             Quinn, Sean
8       Boutwell, Bonalyn
9       Foster-Baker, Amy
Name: Employee Name, dtype: object


## 6) Column selection and filtering

In [37]:
# Single column
employee_names = clean_data['Employee Name']
print(employee_names.head())

# Multiple columns
basic_info = clean_data[['Employee Name', 'Department', 'Position', 'Pay Rate']]
print('Basic info:')
print(basic_info.head())

# Demographic columns
demographic_info = clean_data[['Employee Name', 'Age', 'Sex', 'MaritalDesc', 'State']]
print('Demographic info:')
print(demographic_info.head())


0              Brown, Mia
1    LaRotonda, William  
2        Steans, Tyrone  
3         Howard, Estelle
4             Singh, Nan 
Name: Employee Name, dtype: object
Basic info:
          Employee Name     Department                  Position  Pay Rate
0            Brown, Mia  Admin Offices              Accountant I     28.50
1  LaRotonda, William    Admin Offices              Accountant I     23.00
2      Steans, Tyrone    Admin Offices              Accountant I     29.00
3       Howard, Estelle  Admin Offices  Administrative Assistant     21.50
4           Singh, Nan   Admin Offices  Administrative Assistant     16.56
Demographic info:
          Employee Name  Age     Sex MaritalDesc State
0            Brown, Mia   32  Female     Married    MA
1  LaRotonda, William     33    Male    Divorced    MA
2      Steans, Tyrone     31    Male      Single    MA
3       Howard, Estelle   32  Female     Married    MA
4           Singh, Nan    29  Female      Single    MA


In [39]:
# Filter: Admin Offices
admin_employees = clean_data[clean_data['Department'] == 'Admin Offices']
print('Admin Offices employees:')
print(admin_employees[['Employee Name', 'Position', 'Pay Rate']])

print('Available departments:')
print(clean_data['Department'].unique())


Admin Offices employees:
          Employee Name                  Position  Pay Rate
0            Brown, Mia              Accountant I     28.50
1  LaRotonda, William                Accountant I     23.00
2      Steans, Tyrone                Accountant I     29.00
3       Howard, Estelle  Administrative Assistant     21.50
4           Singh, Nan   Administrative Assistant     16.56
5      Smith, Leigh Ann  Administrative Assistant     20.50
6   LeBlanc, Brandon  R   Shared Services Manager     55.00
7           Quinn, Sean   Shared Services Manager     55.00
8     Boutwell, Bonalyn            Sr. Accountant     34.95
9     Foster-Baker, Amy            Sr. Accountant     34.95
Available departments:
['Admin Offices' 'Executive Office' 'IT/IS']


## 7) Type conversion and conditional filters

In [41]:
# Convert Pay Rate to numeric
clean_data['Pay Rate'] = pd.to_numeric(clean_data['Pay Rate'], errors='coerce')

# High earners (> 25)
high_earners = clean_data[clean_data['Pay Rate'] > 25]
print('High earners (>25):')
print(high_earners[['Employee Name', 'Department', 'Position', 'Pay Rate']].sort_values('Pay Rate', ascending=False))

# Low earners (< 20)
low_earners = clean_data[clean_data['Pay Rate'] < 20]
print('Low earners (<20):')
print(low_earners[['Employee Name', 'Department', 'Position', 'Pay Rate']])


High earners (>25):
               Employee Name        Department                 Position  \
10               King, Janet  Executive Office          President & CEO   
25               Foss, Jason             IT/IS              IT Director   
11          Zamora, Jennifer             IT/IS                      CIO   
29             Dougall, Eric             IT/IS     IT Manager - Support   
28             Monroe, Peter             IT/IS       IT Manager - Infra   
43  Ait Sidi, Karthikeyan                IT/IS                  Sr. DBA   
26                Roup,Simon             IT/IS          IT Manager - DB   
6        LeBlanc, Brandon  R     Admin Offices  Shared Services Manager   
7                Quinn, Sean     Admin Offices  Shared Services Manager   
42             Turpin, Jumil             IT/IS         Network Engineer   
13             Goble, Taisha             IT/IS   Database Administrator   
40           Shepard, Anita              IT/IS         Network Engineer   
22   

In [42]:
# Convert Age to numeric and create filtered subsets
clean_data['Age'] = pd.to_numeric(clean_data['Age'], errors='coerce')

female_employees = clean_data[clean_data['Sex'] == 'Female']
print('Female employees:')
print(female_employees[['Employee Name', 'Age', 'Department', 'Position']])

young_employees = clean_data[clean_data['Age'] < 30]
print('Young employees (<30):')
print(young_employees[['Employee Name', 'Age', 'Department']])


Female employees:
        Employee Name  Age        Department                  Position
0          Brown, Mia   32     Admin Offices              Accountant I
3     Howard, Estelle   32     Admin Offices  Administrative Assistant
4         Singh, Nan    29     Admin Offices  Administrative Assistant
5    Smith, Leigh Ann   30     Admin Offices  Administrative Assistant
8   Boutwell, Bonalyn   30     Admin Offices            Sr. Accountant
9   Foster-Baker, Amy   38     Admin Offices            Sr. Accountant
10        King, Janet   63  Executive Office           President & CEO
11   Zamora, Jennifer   38             IT/IS                       CIO
12       Bansal, Renu   31             IT/IS    Database Administrator
13      Goble, Taisha   46             IT/IS    Database Administrator
15      Horton, Jayne   33             IT/IS    Database Administrator
16   Johnson, Noelle    31             IT/IS    Database Administrator
19  Petrowsky, Thelma   33             IT/IS    Database Ad

In [43]:
# Compound filters
female_admin = clean_data[(clean_data['Sex'] == 'Female') & (clean_data['Department'] == 'Admin Offices')]
print('Female employees in Admin Offices:')
print(female_admin[['Employee Name', 'Position', 'Pay Rate']])

young_male_high_earners = clean_data[(clean_data['Sex'] == 'Male') & (clean_data['Age'] < 35) & (clean_data['Pay Rate'] > 20)]
print('Young male high earners:')
print(young_male_high_earners[['Employee Name', 'Age', 'Department', 'Pay Rate']])

excellent_married = clean_data[(clean_data['MaritalDesc'] == 'Married') & (clean_data['Performance Score'] == 'Exceeds')]
print('Married employees with Exceeds:')
print(excellent_married[['Employee Name', 'Age', 'Department', 'Performance Score']])


Female employees in Admin Offices:
       Employee Name                  Position  Pay Rate
0         Brown, Mia              Accountant I     28.50
3    Howard, Estelle  Administrative Assistant     21.50
4        Singh, Nan   Administrative Assistant     16.56
5   Smith, Leigh Ann  Administrative Assistant     20.50
8  Boutwell, Bonalyn            Sr. Accountant     34.95
9  Foster-Baker, Amy            Sr. Accountant     34.95
Young male high earners:
           Employee Name  Age     Department  Pay Rate
1   LaRotonda, William     33  Admin Offices     23.00
2       Steans, Tyrone     31  Admin Offices     29.00
6    LeBlanc, Brandon  R   33  Admin Offices     55.00
7            Quinn, Sean   33  Admin Offices     55.00
14     Hernandez, Daniff   31          IT/IS     40.10
17        Murray, Thomas   29          IT/IS     35.50
18      Pearson, Randall   33          IT/IS     41.00
21          Rogers, Ivan   31          IT/IS     42.20
22         Salter, Jason   30          IT/IS  

## 8) Performance levels

In [44]:
print('Performance Score counts:')
print(clean_data['Performance Score'].value_counts())

top_performers = clean_data[clean_data['Performance Score'] == 'Exceeds']
print('Top performers (Exceeds):')
print(top_performers[['Employee Name', 'Department', 'Position', 'Pay Rate']])

needs_improvement = clean_data[clean_data['Performance Score'] == 'PIP']
if len(needs_improvement) > 0:
    print('Employees on PIP:')
    print(needs_improvement[['Employee Name', 'Department', 'Manager Name']])
else:
    print('No PIP employees in this dataset.')


Performance Score counts:
Performance Score
Fully Meets                 22
N/A- too early to review     8
90-day meets                 7
Exceptional                  4
Exceeds                      2
Needs Improvement            1
Name: count, dtype: int64
Top performers (Exceeds):
        Employee Name Department              Position  Pay Rate
29      Dougall, Eric      IT/IS  IT Manager - Support      64.0
32  Lindsay, Leonara       IT/IS            IT Support      26.0
No PIP employees in this dataset.


## 9) Attrition case study dataset: load, inspect, filter

In [45]:
import pandas as pd

data = pd.read_csv('ATTRITION DATA_Sheet1.csv')
print('Head:')
print(data.head())


Head:
   e_code  status   hire_date service_agreement  job_level  \
0   45631       0  1990-09-17                 N          3   
1   45632       0  1995-10-23                 N          2   
2   45633       0  1996-03-10                 N          2   
3   45634       0  1998-07-13                 N          1   
4   45635       0  1991-09-16                 N          2   

   performance_rating_2018  performance_rating_2017  year_of_birth gender  \
0                        4                        3           1967      M   
1                        1                        2           1971      M   
2                        4                        4           1972      M   
3                        3                        3           1968      F   
4                        3                        3           1961      M   

   distance_from_home  ...  potential_rating  bonus  no_courses_taken  \
0                0.52  ...                 5      0                 8   
1           

In [46]:
print('Info:')
print(data.info())

print('Describe:')
print(data.describe())


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686 entries, 0 to 685
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   e_code                   686 non-null    int64  
 1   status                   686 non-null    int64  
 2   hire_date                686 non-null    object 
 3   service_agreement        686 non-null    object 
 4   job_level                686 non-null    int64  
 5   performance_rating_2018  686 non-null    int64  
 6   performance_rating_2017  686 non-null    int64  
 7   year_of_birth            686 non-null    int64  
 8   gender                   686 non-null    object 
 9   distance_from_home       686 non-null    float64
 10  manager_changes          686 non-null    int64  
 11  no_of_promotions         686 non-null    int64  
 12  risk_of_attrition        686 non-null    int64  
 13  potential_rating         686 non-null    int64  
 14  bonus               

In [47]:
# Gender subsets
male_employees = data[data['gender'] == 'M']
female_employees = data[data['gender'] == 'F']
print('Male count:', len(male_employees))
print('Female count:', len(female_employees))


Male count: 507
Female count: 179


In [48]:
# High performers (>=4)
high_performers = data[data['performance_rating_2018'] >= 4]
print('High performers count:', len(high_performers))
print(high_performers[['e_code', 'performance_rating_2018', 'salary_2018']].head())


High performers count: 198
    e_code  performance_rating_2018  salary_2018
0    45631                        4       124173
2    45633                        4       115500
6    45638                        5       159212
9    45643                        5       165000
13   45649                        5        57803


In [49]:
# High salary (> 50000)
high_salary_employees = data[data['salary_2018'] > 50000]
print('High salary count (>50000):', len(high_salary_employees))
print('Avg salary (high salary group):', high_salary_employees['salary_2018'].mean())


High salary count (>50000): 682
Avg salary (high salary group): 119554.2302052786


In [50]:
# Male + high performance + high salary
high_performing_males = data[(data['gender'] == 'M') & (data['performance_rating_2018'] >= 4) & (data['salary_2018'] > 50000)]
print('High-performing, high-salary males:', len(high_performing_males))


High-performing, high-salary males: 140


In [51]:
# Awards or bonus
awarded_or_bonus = data[(data['awards'] == 'Y') | (data['bonus'] == 1)]
print('Awarded or bonus count:', len(awarded_or_bonus))


Awarded or bonus count: 60


In [52]:
# Selected job levels (3,4,5)
selected_job_levels = data[data['job_level'].isin([3, 4, 5])]
print('Count in job levels 3/4/5:', len(selected_job_levels))
print('Distribution:')
print(selected_job_levels['job_level'].value_counts())


Count in job levels 3/4/5: 129
Distribution:
job_level
3    85
4    33
5    11
Name: count, dtype: int64


In [53]:
# Employees born in the 1980s
eighties_born = data[(data['year_of_birth'] >= 1980) & (data['year_of_birth'] <= 1989)]
current_year = 2024
eighties_born_ages = current_year - eighties_born['year_of_birth']
print('Born in 1980s count:', len(eighties_born))
print('Average age (born in 1980s):', eighties_born_ages.mean())


Born in 1980s count: 151
Average age (born in 1980s): 41.04635761589404


## 10) Missing values and query

In [55]:
print('Missing values per column:')
print(data.isnull().sum())

missing_distance = data[data['distance_from_home'].isnull()]
complete_distance_data = data[data['distance_from_home'].notnull()]
print('Missing distance_from_home count:', len(missing_distance))
print('Complete distance_from_home count:', len(complete_distance_data))


Missing values per column:
e_code                     0
status                     0
hire_date                  0
service_agreement          0
job_level                  0
performance_rating_2018    0
performance_rating_2017    0
year_of_birth              0
gender                     0
distance_from_home         0
manager_changes            0
no_of_promotions           0
risk_of_attrition          0
potential_rating           0
bonus                      0
no_courses_taken           0
time_in_position           0
awards                     0
signon                     0
manager_sat                0
employee_sat               0
salary_2017                0
salary_2018                0
dtype: int64
Missing distance_from_home count: 0
Complete distance_from_home count: 686


In [56]:
# Distribution of risk_of_attrition
print(data['risk_of_attrition'].value_counts())


risk_of_attrition
2    301
3    285
4     66
1     34
Name: count, dtype: int64


In [57]:
# Advanced filtering with query
high_risk_employees = data.query('risk_of_attrition == 4 and performance_rating_2018 >= 4 and salary_2018 < 90000')
print('High risk group count:', len(high_risk_employees))

if len(high_risk_employees) > 0:
    print('Avg distance_from_home:', high_risk_employees['distance_from_home'].mean())
    print('Avg manager_changes:', high_risk_employees['manager_changes'].mean())
    print('Avg employee_sat:', high_risk_employees['employee_sat'].mean())


High risk group count: 7
Avg distance_from_home: 2.2471428571428573
Avg manager_changes: 3.2857142857142856
Avg employee_sat: 80.85714285714286


## 11) groupby + lambda analyses

In [58]:
# Average salary by gender
print('Average salary_2018 by gender:')
print(data.groupby('gender')['salary_2018'].mean().sort_values(ascending=False))


Average salary_2018 by gender:
gender
M    120158.838264
F    116236.826816
Name: salary_2018, dtype: float64


In [59]:
# Awards rate by gender (lambda)
awards_rate = data.groupby('gender')['awards'].apply(lambda x: (x == 'Y').mean() * 100)
print('Awards rate (%) by gender:')
print(awards_rate.sort_values(ascending=False))


Awards rate (%) by gender:
gender
F    8.938547
M    8.678501
Name: awards, dtype: float64


In [60]:
# Mean attrition risk by gender
print('Mean risk_of_attrition by gender:')
print(data.groupby('gender')['risk_of_attrition'].mean().sort_values(ascending=False))


Mean risk_of_attrition by gender:
gender
M    2.568047
F    2.530726
Name: risk_of_attrition, dtype: float64


In [61]:
# Compare low-risk vs high-risk groups
low_risk = data[data['risk_of_attrition'] == 1]
high_risk = data[data['risk_of_attrition'] == 4]

print('Low-risk avg salary:', low_risk['salary_2018'].mean())
print('High-risk avg salary:', high_risk['salary_2018'].mean())
print('Low-risk avg performance (2018):', low_risk['performance_rating_2018'].mean())
print('High-risk avg performance (2018):', high_risk['performance_rating_2018'].mean())


Low-risk avg salary: 197062.5
High-risk avg salary: 124440.56060606061
Low-risk avg performance (2018): 3.0
High-risk avg performance (2018): 2.8636363636363638


## 12) Feature engineering and correlations

In [62]:
from datetime import datetime

current_year = datetime.now().year
data['Age'] = current_year - data['year_of_birth']

data['hire_date'] = pd.to_datetime(data['hire_date'])
current_date = pd.to_datetime(datetime.now())
data['TotalWorkingYears'] = (current_date - data['hire_date']).dt.days // 365

print(data[['year_of_birth','Age','hire_date','TotalWorkingYears']].head())


   year_of_birth  Age  hire_date  TotalWorkingYears
0           1967   59 1990-09-17                 35
1           1971   55 1995-10-23                 30
2           1972   54 1996-03-10                 29
3           1968   58 1998-07-13                 27
4           1961   65 1991-09-16                 34


In [63]:
# Correlations
age_income_correlation = data['Age'].corr(data['salary_2018'])
experience_income_correlation = data['TotalWorkingYears'].corr(data['salary_2018'])

print('Correlation (Age vs salary_2018):', age_income_correlation)
print('Correlation (TotalWorkingYears vs salary_2018):', experience_income_correlation)


Correlation (Age vs salary_2018): 0.24028693983805566
Correlation (TotalWorkingYears vs salary_2018): -0.11862436176458498
