### Creating a logistic regression to predict absenteeism

### Import relevant libraries

In [2]:
import pandas as pd
import numpy as np

### Load data

In [3]:
# Read the preprocessed absenteeism table from its CSV file
# (the path is relative to the notebook's working directory).
preprocessed_csv_path = 'Data/Absenteeism_preprocessed.csv'
data_preprocessed = pd.read_csv(preprocessed_csv_path)

In [4]:
# Preview the first five rows of the loaded table.
data_preprocessed.head(n=5)

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Create the targets
Categorize people into two classes: excessively absent and moderately absent.
Use the median of `Absenteeism Time in Hours` as the cutoff: values strictly above the median are labelled 1 (excessive), all others 0 (moderate).

In [9]:
# Binary labels: 1 when an observation's absence hours are strictly above the
# median of the column, 0 otherwise.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_absence_hours = absence_hours.median()
targets = np.where(absence_hours > median_absence_hours, 1, 0)

In [13]:
# Attach the binary labels as a new 'Excessive Absenteeism' column.
# Note: this modifies data_preprocessed in place.
data_preprocessed['Excessive Absenteeism'] = targets

In [15]:
# Build a new frame without the raw hours column, which the binary label
# now stands in for; data_preprocessed itself is left untouched by drop().
data_with_targets = data_preprocessed.drop(
    columns=['Absenteeism Time in Hours']
)

In [16]:
# Preview the first five rows to confirm the label column replaced the hours column.
data_with_targets.head(n=5)

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [24]:
# Model inputs: every column except the label.
# The label is removed by name rather than by position (previously
# `iloc[:, :-1]`), so a change in column order cannot silently drop a feature
# and leave the label among the inputs.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

### Standardize data

In [20]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Scaler that subtracts each column's mean and divides by its standard
# deviation; the arguments below are the defaults, written out explicitly.
absenteeism_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [25]:
# Learn each column's mean and standard deviation from the inputs. fit()
# returns the scaler itself, which is why the cell echoes its repr.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting on the training rows only.
absenteeism_scaler.fit(X=unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [28]:
# Apply the learned per-column mean and standard deviation to standardize the inputs.
scaled_inputs = absenteeism_scaler.transform(X=unscaled_inputs)

### Shuffle the data and split it into train and test sets

#### Import relevant modules

In [33]:
from sklearn.model_selection import train_test_split

#### Split

In [35]:
# Shuffle and split the scaled inputs and their labels into train and test sets.
# test_size=0.25 and shuffle=True are scikit-learn's defaults, spelled out here.
# random_state pins the shuffle so that re-running the notebook reproduces the
# same partition; without it every run produces a different split.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [36]:
# Report the dimensions of the training inputs and the training targets.
train_inputs_shape, train_targets_shape = x_train.shape, y_train.shape
print(train_inputs_shape, train_targets_shape)

(525, 14) (525,)


In [37]:
# Report the dimensions of the test inputs and the test targets.
test_inputs_shape, test_targets_shape = x_test.shape, y_test.shape
print(test_inputs_shape, test_targets_shape)

(175, 14) (175,)
