# Descriptives, Frequencies, and Averages

# Outline
- Descriptive statistics
- Frequencies
- Mean, standard deviation, and median

In [1]:
import pandas as pd
import numpy as np
# Load the employee attrition data set; the relative path means the CSV is
# looked up in the current working directory.
df = pd.read_csv('employee_attrition.csv')

## Descriptive Statistics

In [2]:
##  Show floats with two decimal places so the summary table is easier to
##  read.  Raise the precision if a later analysis needs more exact figures.
pd.options.display.float_format = "{:.2f}".format
# Summary statistics for the numeric columns of the frame.
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92,802.49,9.19,2.72,65.89,2.73,2.06,2.73,6502.93,2.69,...,3.15,2.71,0.79,11.28,2.8,2.76,7.01,4.23,2.19,4.12
std,9.14,403.51,8.11,1.09,20.33,0.71,1.11,1.1,4707.96,2.5,...,0.36,1.08,0.85,7.78,1.29,0.71,6.13,3.62,3.22,3.57
min,18.0,102.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,0.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,48.0,2.0,1.0,2.0,2911.0,1.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,66.0,3.0,2.0,3.0,4919.0,2.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,83.75,3.0,3.0,4.0,8379.0,4.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,4.0,100.0,4.0,5.0,4.0,19999.0,9.0,...,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [3]:
# Same summary (count, mean, std, min, quartiles, max) restricted to the
# single column Age.
df['Age'].describe()

count   1470.00
mean      36.92
std        9.14
min       18.00
25%       30.00
50%       36.00
75%       43.00
max       60.00
Name: Age, dtype: float64

## Frequencies

In [4]:
# Number of non-missing entries in Age (Series.count skips NaN).
df['Age'].count()

1470

In [5]:
# Frequency table: how many rows carry each distinct Age, listed from the
# most common value to the least common one.
df['Age'].value_counts()

35    78
34    77
31    69
36    69
29    68
32    61
30    60
33    58
38    58
40    57
37    50
27    48
28    48
42    46
39    42
45    41
41    40
26    39
46    33
44    33
43    32
50    30
24    26
25    26
47    24
49    24
55    22
48    19
51    19
53    19
52    18
54    18
22    16
56    14
58    14
23    14
21    13
20    11
59    10
19     9
18     8
60     5
57     4
Name: Age, dtype: int64

In [6]:
###  Categorical features unique values

# Collect every low-cardinality text column (at most 30 distinct values) in
# ``object_col`` and, for each one, print its distinct values followed by its
# frequency table.
object_col = []
for column in df.columns:
    series = df[column]
    # Accept both the legacy ``object`` dtype and pandas' dedicated string
    # dtype, so text columns are still found when strings are not stored as
    # ``object``.
    is_text = (pd.api.types.is_object_dtype(series.dtype)
               or pd.api.types.is_string_dtype(series.dtype))
    if not is_text:
        continue
    distinct = series.unique()  # computed once, reused for the check and print
    if len(distinct) <= 30:
        object_col.append(column)
        print(f"{column} : {distinct}")
        print(series.value_counts())
        print("====================================")
# 'Attrition' is dropped from the collected names (presumably because it is
# the target variable -- confirm).  The membership check keeps the cell from
# raising ValueError when that column was not collected above.
if 'Attrition' in object_col:
    object_col.remove('Attrition')

Attrition : ['Yes' 'No']
No     1233
Yes     237
Name: Attrition, dtype: int64
BusinessTravel : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: BusinessTravel, dtype: int64
Gender : ['Female' 'Male']
Male      882
Female    588
Name: Gender, dtype: int64
MaritalStatus : ['Single' 'Married' 'Divorced']
Married     673
Single      470
Divorced    327
Name: MaritalStatus, dtype: int64
OverTime : ['Yes' 'No']
No     1054
Yes     416
Name: OverTime, dtype: int64


In [7]:
##  Number of unique values
# DataFrame.nunique() yields one distinct-value count per column; walk the
# resulting (column name, count) pairs and report each on its own line.
for column, n_unique in df.nunique().items():
    print(f"{column}: Number of unique values {n_unique}")

Age: Number of unique values 43
Attrition: Number of unique values 2
BusinessTravel: Number of unique values 3
DailyRate: Number of unique values 886
DistanceFromHome: Number of unique values 29
EnvironmentSatisfaction: Number of unique values 4
Gender: Number of unique values 2
HourlyRate: Number of unique values 71
JobInvolvement: Number of unique values 4
JobLevel: Number of unique values 5
JobSatisfaction: Number of unique values 4
MaritalStatus: Number of unique values 3
MonthlyIncome: Number of unique values 1349
NumCompaniesWorked: Number of unique values 10
OverTime: Number of unique values 2
PercentSalaryHike: Number of unique values 15
PerformanceRating: Number of unique values 2
RelationshipSatisfaction: Number of unique values 4
StockOptionLevel: Number of unique values 4
TotalWorkingYears: Number of unique values 40
TrainingTimesLastYear: Number of unique values 7
WorkLifeBalance: Number of unique values 4
YearsAtCompany: Number of unique values 37
YearsInCurrentRole: Numb

## Mean, standard deviation, and median

In [8]:
# Arithmetic mean of Age at full precision; describe() shows the same value
# rounded to two decimals (36.92).
np.mean(df['Age'])

36.923809523809524

In [9]:
# ddof=0 is NumPy's default, spelled out here on purpose: this is the
# population standard deviation (divide by N).  describe() reports the sample
# standard deviation (divide by N - 1), which is why its std row shows 9.14
# while this cell gives 9.1323.
np.std(df['Age'], ddof=0)

9.13226569061539

In [10]:
# Median of Age; agrees with the 50% row of describe().
# NOTE: per the NumPy documentation np.median does not skip NaN, so this is
# only meaningful because Age has no missing values (count is 1470).
np.median(df['Age'])

36.0

In [11]:
# NaN-ignoring variant of the median (per the NumPy documentation); it gives
# the same 36.0 as np.median here.
np.nanmedian(df['Age'])

36.0