In [1]:
import pandas as pd
import numpy as np
import eda  

In [2]:
data = eda.file2df(file_ = "Absenteeism_data.csv")
raw_data_copy = data.copy() #Safety dataset copy
data.head(5)

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [3]:
data.drop(["ID"], axis = 1, inplace = True) #ID not important information 
data.rename(columns = {"Reason for Absence":"reason", "Transportation Expense":"transp exp", "Distance to work":"distance", "Daily Work Load Average":"avg work load", "Body Mass Index":"BMI", "Children":"kids", "Absenteeism Time in Hours":"absence/h"}, inplace = True)
data.rename(str.lower, axis = 1, inplace = True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   reason            700 non-null    int64  
 1   date              700 non-null    object 
 2   transp exp        700 non-null    int64  
 3   distance to work  700 non-null    int64  
 4   age               700 non-null    int64  
 5   avg work load     700 non-null    float64
 6   bmi               700 non-null    int64  
 7   education         700 non-null    int64  
 8   kids              700 non-null    int64  
 9   pets              700 non-null    int64  
 10  absence/h         700 non-null    int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 60.3+ KB


In [4]:
data.describe() #reason categorical variable, from 0 to 28 classes. Statistical analysis not important

Unnamed: 0,reason,transp exp,distance to work,age,avg work load,bmi,education,kids,pets,absence/h
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,19.411429,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,8.356292,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,0.0,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,13.0,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,23.0,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,27.0,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0
max,28.0,388.0,52.0,58.0,378.884,38.0,4.0,4.0,8.0,120.0


In [5]:
reasons = pd.get_dummies(data["reason"], drop_first = True)
data.drop(columns = ["reason"], axis = 1, inplace = True)

#reasons grouping in 4 major categories:
data["reason_1"]= reasons.loc[:, 1:14].max(axis = 1) 
data["reason_2"] = reasons.loc[:, 15:17].max(axis = 1) 
data["reason_3"] = reasons.loc[:, 18:21].max(axis = 1) 
data["reason_4"] = reasons.loc[:, 22:].max(axis = 1)
data.head()

Unnamed: 0,date,transp exp,distance to work,age,avg work load,bmi,education,kids,pets,absence/h,reason_1,reason_2,reason_3,reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [6]:
data_until_reasons = data.copy() #check point until reason classification

In [7]:
data["date"] = pd.to_datetime(data["date"], format = "%d/%m/%Y")
data["weekday"] = data["date"].apply(eda.date2weekday)
data["month"] = data["date"].apply(eda.date2month)
data.drop(["date"], axis = 1, inplace = True)
data.head()

Unnamed: 0,transp exp,distance to work,age,avg work load,bmi,education,kids,pets,absence/h,reason_1,reason_2,reason_3,reason_4,weekday,month
0,289,36,33,239.554,30,1,2,1,4,0,0,0,1,1,7
1,118,13,50,239.554,31,1,1,0,0,0,0,0,0,1,7
2,179,51,38,239.554,31,1,0,0,2,0,0,0,1,2,7
3,279,5,39,239.554,24,1,2,0,4,1,0,0,0,3,7
4,289,36,33,239.554,30,1,2,1,2,0,0,0,1,3,7


In [8]:
print(data["education"].unique())
data["education"].value_counts() 

[1 3 2 4]


1    583
3     73
2     40
4      4
Name: education, dtype: int64

In [9]:
#4 ranges of education, make sence to group in 2 categories with or without higher education
data["education"] = data["education"].map({1:0, 2:1, 3:1, 4:1}) #Now 0 != [Higher Education] & 1 == [Higher Education]
print(data["education"].unique())
data["education"].value_counts() 

[0 1]


0    583
1    117
Name: education, dtype: int64

In [10]:
eda.df2file(data)