In [60]:
# importing library

import pandas as pd
import numpy as np

In [61]:
# Load the raw absenteeism records, then print the column dtypes and non-null counts.
raw_data = pd.read_csv(filepath_or_buffer='Absenteeism_data.csv')
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [62]:
# Work on an independent copy so the loaded frame stays available unchanged.
df = raw_data.copy(deep=True)

In [4]:
# Remove the 'ID' column from the working frame.
df = df.drop(columns=['ID'])

In [5]:
# Parse the 'Date' strings (day/month/year) into datetime values.
date_format = '%d/%m/%Y'
df['Date'] = pd.to_datetime(df['Date'], format=date_format)

In [6]:
# Inspect the parsed column to confirm it now holds datetime values.
df['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
         ...    
695   2018-05-23
696   2018-05-23
697   2018-05-24
698   2018-05-24
699   2018-05-31
Name: Date, Length: 700, dtype: datetime64[ns]

In [7]:
# Relevant functions
def to_month(n):
    """Return the calendar month number read from ``n``'s ``month`` attribute."""
    month_value = n.month
    return month_value
def to_weekday(n):
    """Return ``n.weekday()``; for date/datetime objects Monday is 0 and Sunday is 6."""
    weekday_index = n.weekday()
    return weekday_index
    

In [8]:
# Derive the calendar features directly from the datetime column. The
# vectorised ``.dt`` accessor yields the same month numbers (1-12) and weekday
# indices (Monday=0) as calling a helper on every element via ``apply``,
# without a Python-level call per row. Note: recent pandas releases return
# these as int32 rather than int64; the values themselves are unchanged.
df['Month value'] = df['Date'].dt.month
df["Day of the week"] = df['Date'].dt.weekday

In [9]:
# The month and weekday features have been extracted, so drop the source column.
df = df.drop(columns=['Date'])

In [10]:
# Education stage: the column is recoded to 0 (lower education) / 1 (higher education).
# Start the stage from an independent copy of the frame.
df_with_education = df.copy(deep=True)

In [12]:
# Collapse the four education codes into a binary flag: code 1 -> 0, codes 2-4 -> 1.
# The codes are read from the upstream frame `df` (which df_with_education was
# copied from) rather than from the column being overwritten, so re-running this
# cell cannot remap already-converted values (1 would become 0 and 0 would become NaN).
# Any code outside 1-4 still maps to NaN, as before.
education_flag = {1: 0, 2: 1, 3: 1, 4: 1}
df_with_education['Education'] = df['Education'].map(education_flag)

In [15]:
# Absenteeism stage: 'Absenteeism Time in Hours' is turned into a binary label.
# Hours above the column median count as major absenteeism, otherwise minor.
df_with_absenteeism = df_with_education.copy(deep=True)

In [22]:
# Median of the hours column; the binary target in a later cell uses it as the cut-off.
df_with_absenteeism['Absenteeism Time in Hours'].median()

3.0

In [23]:
# Binary target: 1 where the hours strictly exceed the column median, else 0.
hours_absent = df_with_absenteeism['Absenteeism Time in Hours']
absenteeism = np.where(hours_absent > hours_absent.median(), 1, 0)

In [25]:
# Attach the 0/1 target array to the frame as the 'Absenteeism' column (in place).
df_with_absenteeism['Absenteeism'] = absenteeism

In [26]:
# The hours are now represented by 'Absenteeism', so remove the source column.
df_with_absenteeism = df_with_absenteeism.drop(columns=['Absenteeism Time in Hours'])

In [27]:
# Display the frame now that the hours column has been replaced by the binary target.
df_with_absenteeism

Unnamed: 0,Reason for Absence,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month value,Day of the week,Absenteeism
0,26,289,36,33,239.554,30,0,2,1,7,1,1
1,0,118,13,50,239.554,31,0,1,0,7,1,0
2,23,179,51,38,239.554,31,0,0,0,7,2,0
3,7,279,5,39,239.554,24,0,2,0,7,3,1
4,23,289,36,33,239.554,30,0,2,1,7,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,179,22,40,237.656,22,1,2,0,5,2,1
696,6,225,26,28,237.656,24,0,1,2,5,2,0
697,10,330,16,28,237.656,25,1,0,0,5,3,1
698,23,235,16,32,237.656,25,1,0,0,5,3,0


In [28]:
# Reason-for-absence stage: start from an independent copy of the frame.
df_with_reason = df_with_absenteeism.copy(deep=True)

In [30]:
# One indicator column per distinct reason code. dtype=int pins the indicators
# to 0/1 integers regardless of the installed pandas release (recent releases
# otherwise return booleans), so the row-wise max taken over these columns in a
# later cell stays numeric and the saved CSV holds 0/1 rather than True/False.
reasons = pd.get_dummies(df_with_reason['Reason for Absence'], dtype=int)

In [32]:
# List the distinct reason codes that received an indicator column.
reasons.columns.unique()

# The reasons have been divided into four categories

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28],
           dtype='int64')

In [33]:
# Collapse the per-code indicators into four group indicators. Each group is the
# row-wise maximum over the columns whose labels fall in an inclusive label range:
# 1-14, 15-17, 18-21 and 22-28. Column 0 is not part of any group.
reason_type1, reason_type2, reason_type3, reason_type4 = (
    reasons.loc[:, first_code:last_code].max(axis='columns')
    for first_code, last_code in [(1, 14), (15, 17), (18, 21), (22, 28)]
)

In [42]:
# Append the four group indicators as extra columns on the right of the frame.
# The series carry no names, so the new columns are labelled 0-3.
reason_groups = [reason_type1, reason_type2, reason_type3, reason_type4]
df_with_reasons = pd.concat([df_with_reason, *reason_groups], axis='columns')

In [44]:
# The grouped indicators replace the raw reason code, so remove the original column.
df_with_reasons = df_with_reasons.drop(columns=['Reason for Absence'])

In [45]:
# List the current column labels; the four appended reason series show up as 0-3.
df_with_reasons.columns.values

array(['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month value', 'Day of the week',
       'Absenteeism', 0, 1, 2, 3], dtype=object)

In [46]:
# Full replacement header: the existing feature/target names in their current
# positions, followed by names for the four grouped reason indicators.
existing_names = [
    'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets', 'Month value', 'Day of the week',
    'Absenteeism',
]
column_rename = existing_names + ['reason_type%d' % group for group in range(1, 5)]

In [49]:
# Overwrite every column label by position with the names in column_rename.
# The list must have one entry per column and follow the frame's current column order.
df_with_reasons.columns = column_rename

In [51]:
# Confirm the new labels are in place.
df_with_reasons.columns.values

array(['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month value', 'Day of the week',
       'Absenteeism', 'reason_type1', 'reason_type2', 'reason_type3',
       'reason_type4'], dtype=object)

In [56]:
# Reorder the columns: reason indicators first, then the calendar and remaining
# features, with the 'Absenteeism' target last.
column_order = [
    'reason_type1', 'reason_type2', 'reason_type3', 'reason_type4',
    'Month value', 'Day of the week',
    'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets',
    'Absenteeism',
]
df_preprocessed = df_with_reasons[column_order]

In [59]:
# Persist the preprocessed table. index=False keeps the positional row index out
# of the file; otherwise it is written as an unnamed leading column that reappears
# as a spurious 'Unnamed: 0' feature when the CSV is read back.
df_preprocessed.to_csv('df_preprocessed.csv', index=False)