In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/absenteeism_data.csv


In [2]:
raw_csv_data = pd.read_csv("/kaggle/input/absenteeism_data.csv")

raw_csv_data.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07-07-2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14-07-2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15-07-2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16-07-2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23-07-2015,289,36,33,239.554,30,1,2,1,2


## Preprocessing the data

In [3]:
df = raw_csv_data.copy()

# Setting maximum number of rows displayed in output
pd.options.display.max_rows = 3

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         699 non-null    int64  
 1   Reason for Absence         699 non-null    int64  
 2   Date                       699 non-null    object 
 3   Transportation Expense     699 non-null    int64  
 4   Distance to Work           699 non-null    int64  
 5   Age                        699 non-null    int64  
 6   Daily Work Load Average    699 non-null    float64
 7   Body Mass Index            699 non-null    int64  
 8   Education                  699 non-null    int64  
 9   Children                   699 non-null    int64  
 10  Pets                       699 non-null    int64  
 11  Absenteeism Time in Hours  699 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.7+ KB


Dependent variables in dataset - reason for absence, transportation expense, distance to work, age, daily work load average, BMI, education, children, pets
Independent variable - absenteeism time in hrs

#### Dropping the irrelevant variables (ID) to our analysis

In [5]:
df = df.drop(['ID'], axis = 1)

display(df)

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07-07-2015,289,36,33,239.554,30,1,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...
698,28,31-05-2018,291,31,40,237.656,25,1,1,1,2


#### Analyzing the reason for absence

In [6]:
display(df['Reason for Absence'].min())
display(df['Reason for Absence'].max())

# Checking all unique values of 'Reason of Absence'
display(pd.unique(df['Reason for Absence']))

# Counting the length of array returned
display(len(pd.unique(df['Reason for Absence'])))

0

28

array([26,  0, 23,  7, 22, 19,  1, 11, 14, 21, 10, 13, 28, 18, 25, 24,  6,
       27, 17,  8, 12,  5,  9, 15,  4,  3,  2, 16])

28

In [7]:
# Sorting array to find out the missing value
sorted(df['Reason for Absence'].unique())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

#### Create dummy variables for 'Reason for Absence'

In [8]:
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first = True)

In [9]:
# Checking if an employee was absent from work for more than one reason
reason_columns['check'] = reason_columns.sum(axis = 1)

reason_columns

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,19,21,22,23,24,25,26,27,28,check
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [10]:
# Calculating sum of check column
display(reason_columns['check'].sum(axis = 0))

# Displaying unique values in check column
display(reason_columns['check'].unique())

661

array([1, 0])

In [11]:
# Dropping the check column
reason_columns = reason_columns.drop(['check'], axis = 1)

# Dropping 'Reason for Absence' to avoid multicollinearity
df = df.drop(['Reason for Absence'], axis = 1)

#### Classifying the 28 reasons into 4 groups

In [12]:
reason_type_1 = reason_columns.iloc[:, 1:14].max(axis = 1)
reason_type_2 = reason_columns.iloc[:, 15:17].max(axis = 1)
reason_type_3 = reason_columns.iloc[:, 18:21].max(axis = 1)
reason_type_4 = reason_columns.iloc[:, 22:].max(axis = 1)

#### Concatenating reason types and df

In [13]:
df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis = 1)

display(df)

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07-07-2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,31-05-2018,291,31,40,237.656,25,1,1,1,2,0,0,0,1


In [14]:
# Renaming the columns 0, 1, 2, 4
column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4'] 
df.columns = column_names

# Reordering the columns
column_names_reordered = [ 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours'] 
df = df[column_names_reordered]

#### Creating a checkpoint

In [15]:
df_reasons_mod = df.copy()


#### Analyzing the 'Date' column

In [16]:
type(df_reasons_mod['Date'][0])

str

In [17]:
# Converting the column values to timestamp
df_reasons_mod['Date'] = pd.to_datetime(df_reasons_mod['Date'], format = '%d-%m-%Y')

df_reasons_mod['Date']

0     2015-07-07
         ...    
698   2018-05-31
Name: Date, Length: 699, dtype: datetime64[ns]

In [18]:
# Extracting the month values from 'Date' column
list_months = []
df_reasons_rows_length = df_reasons_mod.shape[0]
for i in range(df_reasons_rows_length):
    list_months.append(df_reasons_mod['Date'][i].month)
    
# Printing the length of the list_months
display(len(list_months))    

699

In [19]:
# Adding the list_months to df_reasons_mod
df_reasons_mod['Month Value'] = list_months

# Extracting the day of the week from 'Date'
def date_to_weekday(date_value):
    return date_value.weekday()

# Adding 'Day of the Week' column to the dataset by calling the date_to_weekday function
df_reasons_mod['Day of the Week'] = df_reasons_mod['Date'].apply(date_to_weekday)

In [20]:
# Removing the 'Date' column
df_reasons_mod.drop(['Date'], axis = 1)

# Reordering the columns
column_names_reordered = [ 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours'] 
df_reasons_mod = df_reasons_mod[column_names_reordered]

# Printing df_reasons_mod
display(df_reasons_mod)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,0,0,0,1,5,3,291,31,40,237.656,25,1,1,1,2


#### Creating a checkpoint

In [21]:
df_date_reasons_mod = df_reasons_mod.copy()

#### Analyzing 'Education' column

In [22]:
# Calculating the number of unique values in the 'Education' column
df_date_reasons_mod['Education'].unique()

array([1, 3, 2, 4])

In [23]:
# Setting maximum number of rows displayed in output
pd.options.display.max_rows = 4

# Counting the number of each value present in the column
df_date_reasons_mod['Education'].value_counts()

1    582
3     73
2     40
4      4
Name: Education, dtype: int64

As the number of 2s, 3s, and 4s are less compared to 1s, we will group the last three values into one

In [24]:
# Grouping values 2, 3, and 4 to one group
# Mapping 1 to 0 & 2, 3, 4 to 1
df_date_reasons_mod['Education'] = df_date_reasons_mod['Education'].map({1:0, 2:1, 3:1, 4:1})

#### Final Checkpoint

In [25]:
df_preprocessed = df_date_reasons_mod.copy()

df_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697,0,0,0,0,5,3,235,16,32,237.656,25,1,0,0,2
698,0,0,0,1,5,3,291,31,40,237.656,25,0,1,1,2


Creating the targets

In [26]:
# Calculating the mean of the 'Absenteeism Time in Hours' column values
df_preprocessed_median = df_preprocessed['Absenteeism Time in Hours'].median()

In [27]:
# Classifying data into two groups, for 'Absenteeism Time in Hours' > median - class 1 otherwise 0
targets = np.where(df_preprocessed['Absenteeism Time in Hours'] > df_preprocessed_median, 1, 0)

# Adding targets to the dataframe 
df_preprocessed['Excessive Absenteeism'] = targets

In [28]:
# Check if the groups have been divded almost equally, proceed if value is between 0.40 to 0.50 for accurate model creation
targets.sum() / targets.shape[0]

0.4563662374821173

In [29]:
# Dropping 'Absenteeism Time in Hours' column, and assigning the returned dataframe
data_with_targets = df_preprocessed.drop(['Absenteeism Time in Hours', 'Distance to Work', 'Daily Work Load Average', 'Age'], axis = 1)

### Select inputs for regression

In [30]:
# Selecting first 14 columns as inputs
unscaled_inputs = data_with_targets.iloc[:, :-1]

# Printing the data
display(unscaled_inputs)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,30,0,2,1
1,0,0,0,0,7,1,118,31,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
697,0,0,0,0,5,3,235,25,1,0,0
698,0,0,0,1,5,3,291,25,0,1,1


Standardize the data

In [31]:
# Importing StandardScaler, BaseEstimator, and TransformerMixin from sklean
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy = True, with_mean = True, with_std = True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
    
    def fit(self, X, y = None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y = None, copy = None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns = self.columns)
        X_not_scaled = X.iloc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]
    
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit] 
        
# Creating an empty scaler object
absenteeism_scaler = CustomScaler(columns_to_scale)

In [32]:
# Calculating mean and standard deviation of each unscaled input
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Day of the Week',
                      'Transportation Expense', 'Body Mass Index', 'Education',
                      'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [33]:
# Creating scaled inputs using absenteeism_scaler object
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

# Printing scaled inputs
display(scaled_inputs)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.183294,-0.682176,1.004498,0.768869,-0.448365,0.879058,0.267518
1,0,0,0,0,0.183294,-0.682176,-1.574973,1.004073,-0.448365,-0.020593,-0.590258
...,...,...,...,...,...,...,...,...,...,...,...
697,0,0,0,0,-0.387407,0.670564,0.189928,-0.407147,2.230327,-0.920243,-0.590258
698,0,0,0,1,-0.387407,0.670564,1.034667,-0.407147,-0.448365,-0.020593,0.267518


Shuffle and divide the data into train & test

In [34]:
# Importing train_test_split
from sklearn.model_selection import train_test_split

In [35]:
# Shuffling and dividing the data
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, shuffle = True, random_state = 5)

# Checking shapes train and test data
print(x_train.shape, y_train.shape, "&", x_test.shape, y_test.shape)

(559, 11) (559,) & (140, 11) (140,)


## **Logistic Regression using Machine Learning library sklearn**

In [36]:
# Importing LogisticRegression and matrics from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

Training the model

In [37]:
# Creating object of LogisticRegression
regression = LogisticRegression()

# Creating the model using machine learning
regression.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
# Calculating accuracy of the model
print("Accuracy: ", regression.score(x_train, y_train))

# Calculationg accuracy manually
model_output = regression.predict(x_train)
correct_predictions = np.sum(model_output == y_train) # Calculating sum of correctly predicted outputs
manual_accuracy = correct_predictions / model_output.shape[0]
print("Accuracy by manual calculation: ", manual_accuracy)

Accuracy:  0.7513416815742398
Accuracy by manual calculation:  0.7513416815742398


In [39]:
# Finding intercepts and coefficiants
print("Intercepts", regression.intercept_)
print("Coefficiants", regression.coef_)

# Creating table to map coefficiants with indepdent variable values
feature_names = unscaled_inputs.columns.values 

# Creating summary
summary = pd.DataFrame(columns = ['Feature Names'], data = feature_names)
summary['Coefficients'] = np.transpose(regression.coef_)

# Adding intercept to the summary
summary.index = summary.index + 1
summary.loc[0] = ['Intercept', regression.intercept_[0]]

# Sorting my index
summary = summary.sort_index()
display(summary)

Intercepts [-0.80301785]
Coefficiants [[ 1.82954524 -0.29577508  2.21488453 -0.18015217  0.09219541 -0.27972192
   0.45541654  0.09492564  0.11584819  0.43397297 -0.36155211]]


Unnamed: 0,Feature Names,Coefficients
0,Intercept,-0.803018
1,Reason_1,1.829545
...,...,...
10,Children,0.433973
11,Pets,-0.361552


#### Studying the model coefficiants

In [40]:
# Adding odds ratio column
summary['Odds Ratio'] = np.exp(summary['Coefficients'])

# Setting maximum number of rows displayed in output
pd.options.display.max_rows = None

# Sorting summary table
summary.sort_values('Odds Ratio', ascending = False)

Unnamed: 0,Feature Names,Coefficients,Odds Ratio
3,Reason_3,2.214885,9.160351
1,Reason_1,1.829545,6.231052
7,Transportation Expense,0.455417,1.57683
10,Children,0.433973,1.543377
9,Education,0.115848,1.122825
8,Body Mass Index,0.094926,1.099577
5,Month Value,0.092195,1.096579
4,Reason_4,-0.180152,0.835143
6,Day of the Week,-0.279722,0.755994
2,Reason_2,-0.295775,0.743955


## Testing the model

In [41]:
# Testing the accuracy with the testing data
display(regression.score(x_test, y_test))

0.6857142857142857

In [42]:
# Finding estimates for possible outputs with x_test as input
predicted_proba = regression.predict_proba(x_test)

display(predicted_proba[:,1])

array([0.78866967, 0.40377287, 0.93723839, 0.71712195, 0.69153393,
       0.64377733, 0.22855656, 0.17590453, 0.17532485, 0.17211057,
       0.15912522, 0.8938425 , 0.55840089, 0.6460777 , 0.28817229,
       0.1466337 , 0.24165084, 0.87694228, 0.07667595, 0.3580118 ,
       0.40386486, 0.16869974, 0.23928916, 0.64656505, 0.15829953,
       0.84897568, 0.54537747, 0.69045794, 0.31003333, 0.81724814,
       0.24374028, 0.76242583, 0.2202764 , 0.5115144 , 0.1544825 ,
       0.15416738, 0.15608692, 0.2411538 , 0.40890341, 0.90920237,
       0.56486606, 0.31237339, 0.80721754, 0.598641  , 0.8312926 ,
       0.82776821, 0.32628336, 0.36524827, 0.70628642, 0.8087439 ,
       0.63397683, 0.67088781, 0.11412049, 0.86735033, 0.64978742,
       0.76318701, 0.29340836, 0.25858088, 0.80721754, 0.8529218 ,
       0.54972459, 0.79985717, 0.09364933, 0.84897706, 0.1466337 ,
       0.59068584, 0.47598312, 0.18701644, 0.4742865 , 0.55611114,
       0.59931252, 0.81568061, 0.2878127 , 0.19089448, 0.20480