Importing Libraries

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import tensorflow as tf 
from sklearn import metrics
import pickle 

Importing dataset 

In [3]:
raw_data = pd.read_csv('Absenteeism-data.csv')
data = raw_data.copy()

In [4]:
print(data.shape)
print(data.columns)

(700, 12)
Index(['ID', 'Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours'],
      dtype='object')


 We have 12 columns and 700 rows of data 

Dropping unwanted ID and age column

In [5]:
data = data.drop(['ID'],axis=1)
data = data.drop(['Age'],axis=1)

In [6]:
data.isnull().sum()

Reason for Absence           0
Date                         0
Transportation Expense       0
Distance to Work             0
Daily Work Load Average      0
Body Mass Index              0
Education                    0
Children                     0
Pets                         0
Absenteeism Time in Hours    0
dtype: int64

There is no missing data in the dataset

In [7]:
data.describe()

Unnamed: 0,Reason for Absence,Transportation Expense,Distance to Work,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,19.411429,222.347143,29.892857,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,8.356292,66.31296,14.804446,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,0.0,118.0,5.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,13.0,179.0,16.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,23.0,225.0,26.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,27.0,260.0,50.0,294.217,31.0,1.0,2.0,1.0,8.0
max,28.0,388.0,52.0,378.884,38.0,4.0,4.0,8.0,120.0


In [9]:
data.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,239.554,30,1,2,1,2


## Modifying Reason column

In [10]:
data['Reason for Absence'].unique()

array([26,  0, 23,  7, 22, 19,  1, 11, 14, 21, 10, 13, 28, 18, 25, 24,  6,
       27, 17,  8, 12,  5,  9, 15,  4,  3,  2, 16], dtype=int64)

Creating dummy variables for reason column

Reducing the number of dummy variables by combining variables with diseases in same category

In [11]:
reason_column = pd.get_dummies(data['Reason for Absence'], drop_first=True)

reason_type_1 = reason_column.loc[:,1:14].max(axis=1)
reason_type_2 = reason_column.loc[:,15:17].max(axis=1)
reason_type_3 = reason_column.loc[:,18:21].max(axis=1)
reason_type_4 = reason_column.loc[:,22:28].max(axis=1)

Adding the four reason types in our dataset and dropping the orignal reason column as it may introduce multicollinearity

In [12]:
data = pd.concat([data,reason_type_1,reason_type_2,reason_type_3,reason_type_4],axis=1)

data = data.drop(['Reason for Absence'],axis=1)

Changing the names of the added columns to the relevant ones so that it is easy to identify them 

In [13]:
cols = ['Date', 'Transportation Expense', 'Distance to Work',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'reason_type_1', 'reason_type_2', 'reason_type_3', 'reason_type_4'] 

data.columns = cols

Reorder the Column names

In [14]:
reorder_columns = ['reason_type_1', 'reason_type_2', 'reason_type_3', 'reason_type_4',
                   'Date', 'Transportation Expense', 'Distance to Work','Daily Work Load Average', 
                   'Body Mass Index', 'Education','Children', 'Pets', 'Absenteeism Time in Hours'] 

data = data[reorder_columns]

In [15]:
data.head()

Unnamed: 0,reason_type_1,reason_type_2,reason_type_3,reason_type_4,Date,Transportation Expense,Distance to Work,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,239.554,30,1,2,1,2


Creating a checkpoint after modifying reason column 

In [16]:
df_checkpoint = data.copy()

## Modifying Date column 

Changing Date column to time stamp for further processing

In [17]:
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

Extracting the months from the time stamp column and adding a new month column to the dataframe

In [18]:
month = []

for i in range (data.shape[0]):
    month.append(data['Date'][i].month)
    
data['Month'] = month

Extracting the Weekdays from the time stamp column and adding a new Weekday column to the dataframe

In [19]:
weekday = []

for i in range (data.shape[0]):
    weekday.append(data['Date'][i].weekday())
    
data['Weekday'] = weekday

In [20]:
## Dropping the date column from the dataframe

data = data.drop(['Date'],axis=1)

In [21]:
## Reordering the columns so that month and date columns coms where date used to be

cols_names = ['reason_type_1', 'reason_type_2', 'reason_type_3', 'reason_type_4', 'Month', 'Weekday',
       'Transportation Expense', 'Distance to Work', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours']

data = data[cols_names]

Cresting Checkpoint after modifying reason and date column 

In [9]:
data_reason_date_mod = data.copy()

## Modifying the Education column 

The Education column have 4 distinct values going from high school qualified up to the post graduate or even higher

Since most of the people have high school education (1) only so its better to group them as one and the rest as remaining 

In [23]:
print(data['Education'].value_counts())

data['Education'] = data['Education'].map({1:0, 2:1, 3:1, 4:1})

print(data['Education'].value_counts())

1    583
3     73
2     40
4      4
Name: Education, dtype: int64
0    583
1    117
Name: Education, dtype: int64


Final Check Point and saving the preprocessed dataset as a .csv file 

In [24]:
df_preprocessed = data.copy()

df_preprocessed.to_csv('Absenteeism_preprocessed.csv',index=False)

## Lets scale and split the data first

In [25]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

Defining the targets based on the median of the absenteeism data. 

**NOTE-** Median is a good parameter as it devides the data into almost 2 equal parts

In [26]:
absent_time_median = data_preprocessed['Absenteeism Time in Hours'].median()

targets = []

for i in range (data_preprocessed.shape[0]):
    if data_preprocessed['Absenteeism Time in Hours'][i] <= absent_time_median :
        targets.append(0)
    else:
        targets.append(1)

Adding column 'Excessive Absenteeism' in the dataframe which is equal to the targets 

In [27]:
data_preprocessed['Excessive Absenteeism'] = targets
targets = data_preprocessed['Excessive Absenteeism']

In [28]:
## Dropping 'Absenteeism Time in Hours' column from the dataframe 

data_preprocessed = data_preprocessed.drop(['Absenteeism Time in Hours'], axis=1)

Defining the inputs and targets for the machine learning part 

In [29]:
unscaled_inputs = data_preprocessed.iloc[:, :-1]
targets = data_preprocessed['Excessive Absenteeism']

Standarizing the data 

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(unscaled_inputs)
scaled_inputs = scaler.transform(unscaled_inputs)

Splitting the data using train_test_split

In [31]:
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size=0.2, random_state=20)

In [32]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(560, 13) (560,)
(140, 13) (140,)


## Logistic Regression With SKlearn

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

reg = LogisticRegression()
reg.fit(x_train, y_train)

LogisticRegression()

In [34]:
reg.score(x_train,y_train)

0.775

Creating the Summary table with Feature names and intercepts

In [35]:
features_names = unscaled_inputs.columns.values
intercept = reg.intercept_

summary_table = pd.DataFrame(columns = ['Feature Names'], data=features_names)
summary_table['intercepts'] = np.transpose(reg.coef_)
summary_table.append({'Feature Names':'intercept', 'intercepts':np.transpose(reg.intercept_)}, ignore_index=True)

Unnamed: 0,Feature Names,intercepts
0,reason_type_1,2.05558
1,reason_type_2,0.32201
2,reason_type_3,1.54297
3,reason_type_4,1.32257
4,Month,0.193569
5,Weekday,-0.0832674
6,Transportation Expense,0.725748
7,Distance to Work,-0.00922534
8,Daily Work Load Average,-0.00537205
9,Body Mass Index,0.213681


calculating the odds ratio 

In [36]:
summary_table['odds Ratio'] = np.exp(summary_table['intercepts'])
summary_table.sort_values('odds Ratio',ascending = False)

Unnamed: 0,Feature Names,intercepts,odds Ratio
0,reason_type_1,2.055582,7.811383
2,reason_type_3,1.542973,4.678477
3,reason_type_4,1.322569,3.75305
6,Transportation Expense,0.725748,2.066277
11,Children,0.344023,1.410611
1,reason_type_2,0.32201,1.379899
9,Body Mass Index,0.213681,1.238228
4,Month,0.193569,1.213573
8,Daily Work Load Average,-0.005372,0.994642
7,Distance to Work,-0.009225,0.990817


## Testing the model

In [37]:
## Checking the accuracy of model in case of test data

reg.score(x_test,y_test)

0.7071428571428572

In [38]:
## Cheking the output of the model 
## checking the accuracy of the model manually

model_output = reg.predict(x_test)
np.sum(model_output==y_test) / y_test.shape[0]

0.7071428571428572

In [39]:
## Calculating the probablity of wach output

predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.74146157, 0.25853843],
       [0.5445401 , 0.4554599 ],
       [0.38616124, 0.61383876],
       [0.8160007 , 0.1839993 ],
       [0.06129188, 0.93870812],
       [0.31296195, 0.68703805],
       [0.31851423, 0.68148577],
       [0.07220814, 0.92779186],
       [0.83125527, 0.16874473],
       [0.76100144, 0.23899856],
       [0.48146841, 0.51853159],
       [0.17978915, 0.82021085],
       [0.04876915, 0.95123085],
       [0.73214144, 0.26785856],
       [0.13081363, 0.86918637],
       [0.59623387, 0.40376613],
       [0.54985654, 0.45014346],
       [0.58212355, 0.41787645],
       [0.23177289, 0.76822711],
       [0.03074164, 0.96925836],
       [0.72815912, 0.27184088],
       [0.80739885, 0.19260115],
       [0.39488063, 0.60511937],
       [0.42207322, 0.57792678],
       [0.21845725, 0.78154275],
       [0.79042404, 0.20957596],
       [0.45563249, 0.54436751],
       [0.88874886, 0.11125114],
       [0.14059648, 0.85940352],
       [0.79849418, 0.20150582],
       [0.

Now the left side of the predicted_proba have the probablity when the output is zero (not absent) while the 
right side have probablity when the output is one (is absent) 
We are only interested in the prbablity of being absent therefore slice of the right side 

In [40]:
predicted_proba[:,1]

array([0.25853843, 0.4554599 , 0.61383876, 0.1839993 , 0.93870812,
       0.68703805, 0.68148577, 0.92779186, 0.16874473, 0.23899856,
       0.51853159, 0.82021085, 0.95123085, 0.26785856, 0.86918637,
       0.40376613, 0.45014346, 0.41787645, 0.76822711, 0.96925836,
       0.27184088, 0.19260115, 0.60511937, 0.57792678, 0.78154275,
       0.20957596, 0.54436751, 0.11125114, 0.85940352, 0.20150582,
       0.43057003, 0.6625198 , 0.69413143, 0.54581699, 0.19260115,
       0.49229685, 0.17750735, 0.79105437, 0.4929229 , 0.61136794,
       0.20994615, 0.39647478, 0.18430655, 0.10047391, 0.87407236,
       0.64064693, 0.6797718 , 0.26847512, 0.24149055, 0.16627182,
       0.575604  , 0.07333263, 0.67123151, 0.34380075, 0.89131478,
       0.53952082, 0.92408775, 0.22334806, 0.07489618, 0.07509744,
       0.7111641 , 0.66130308, 0.28590585, 0.83312546, 0.17152878,
       0.25962775, 0.0118102 , 0.19373783, 0.84065528, 0.31350102,
       0.17675679, 0.11667399, 0.92549531, 0.50496717, 0.61697

## Save the model

We will save the model (i.e the reg object) using the pickle module

Pickle module is used to convert the python object into a character stream

In simple terms we will save the reg variable into a file and this file we will load into the new notebook and thus 

we will be able to use the machine algorithm

In [41]:
import pickle 

with open ('model','wb') as file:
    pickle.dump(reg,file)
    
## In the same way we can save the scaler information also as it can be used to scale new data

with open ('scaler','wb') as file:
    pickle.dump(scaler,file)



## USING TENSORFLOW TO DEPLOY MACHINE LEARNING MODEL 



In [52]:
## splitting the x_train and y_train into further train and validation data to perform machine learning model

x_train_2 , x_validation, y_train_2, y_validation = train_test_split(x_train, y_train, random_state=360, test_size=0.1)

In [53]:
## Creating the Model

input_size = 13
output_size = 2
hidden_layer_size = 50 

model_absent = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax'),
])

Compiling the model

In [54]:
model_absent.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

To convert panda series to numpy array to fit the model

In [75]:
y_train_2 = y_train_2.to_numpy()
y_validation = y_validation.to_numpy()

Fitting the model

In [85]:
batch_size = 1
num_epochs = 10
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model_absent.fit(x_train_2,y_train_2,
                batch_size = batch_size,
                epochs = num_epochs,
                callbacks = [early_stopping],
                validation_data=(x_validation,y_validation),
                verbose=2,
                validation_steps=10)

Train on 504 samples, validate on 56 samples
Epoch 1/10
504/504 - 2s - loss: 0.3959 - accuracy: 0.8254 - val_loss: 0.0959 - val_accuracy: 0.8000
Epoch 2/10
504/504 - 2s - loss: 0.3803 - accuracy: 0.8373 - val_loss: 0.1011 - val_accuracy: 0.8000
Epoch 3/10
504/504 - 2s - loss: 0.3625 - accuracy: 0.8433 - val_loss: 0.0952 - val_accuracy: 0.8000
Epoch 4/10
504/504 - 2s - loss: 0.3529 - accuracy: 0.8353 - val_loss: 0.0908 - val_accuracy: 0.8000
Epoch 5/10
504/504 - 2s - loss: 0.3402 - accuracy: 0.8353 - val_loss: 0.1178 - val_accuracy: 0.7000
Epoch 6/10
504/504 - 2s - loss: 0.3346 - accuracy: 0.8333 - val_loss: 0.1005 - val_accuracy: 0.8000


<tensorflow.python.keras.callbacks.History at 0x24e95de4c48>

Testing the model

In [86]:
test_loss, test_accuracy = model_absent.evaluate(x_test,y_test)



In [87]:
print("The loss is {0:.2f} and the accuracy of the model is {1:.2f}".format(test_loss, test_accuracy*100))

The loss is 0.57 and the accuracy of the model is 75.00
