In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.options.display.max_columns = 999

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import RobustScaler, OneHotEncoder

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [12]:
df = pd.read_csv('absence_clean.csv')

In [13]:
df = df.drop(columns=['Unnamed: 0'])

In [14]:
df.head()

Unnamed: 0,ID,Month of absence,Day of the week,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Education,Son,Social drinker,Absenteeism time in hours,Reason
0,11,7,Tuesday,289,36,13,33,239.554,High School,2,Yes,4,Without ICD
1,3,7,Wednesday,179,51,18,38,239.554,High School,0,Yes,2,Without ICD
2,7,7,Thursday,279,5,14,39,239.554,High School,2,Yes,4,ICD
3,11,7,Thursday,289,36,13,33,239.554,High School,2,Yes,2,Without ICD
4,3,7,Friday,179,51,18,38,239.554,High School,0,Yes,2,Without ICD


In [15]:
df['Reason'] = df['Reason'].replace({'Without ICD' : 1, 'ICD' : 0})

In [26]:
df['Reason'].value_counts()/len(df['Reason'])

1    0.624103
0    0.375897
Name: Reason, dtype: float64

### Dikarenakan Data Classification Balance, maka difokuskan pada Accuracy Score

In [19]:
# Splitting Data

In [17]:
X = df.drop(columns=['Reason'])
y = df['Reason']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 42)

# Evaluation Matrix

In [5]:
# Evaluation Matrix Classification
def Eva_Matrix1(Model, X_train, y_train, X_test, y_test, Name):
    y_pred_train = Model.predict(X_train)
    acc_train = accuracy_score(y_train, y_pred_train)
    rec_train = recall_score(y_train, y_pred_train)
    prec_train = precision_score(y_train, y_pred_train)
    f1_train = f1_score(y_train, y_pred_train)
    y_pred_test = Model.predict(X_test)
    acc_test = accuracy_score(y_test, y_pred_test)
    rec_test = recall_score(y_test, y_pred_test)
    prec_test = precision_score(y_test, y_pred_test)
    f1_test = f1_score(y_train, y_pred_train)
    data = {
        f"Training {Name}" : [acc_train, rec_train, prec_train, f1_train],
        f"Test {Name}" : [acc_test, rec_test, prec_test, f1_test]
    }
    df = pd.DataFrame(data=data, index=['Accuracy', 'Recall', 'Precision', 'F1'])
    return df

In [20]:
X_train.head()

Unnamed: 0,ID,Month of absence,Day of the week,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Education,Son,Social drinker,Absenteeism time in hours
218,36,6,Tuesday,118,13,18,50,377.55,High School,1,Yes,1
495,6,10,Monday,189,29,13,33,284.853,High School,2,No,8
199,3,5,Tuesday,179,51,18,38,378.884,High School,0,Yes,8
302,34,11,Wednesday,118,10,10,37,284.031,High School,0,No,3
163,20,3,Friday,260,50,11,36,343.253,High School,4,Yes,4


# Pipeline

In [21]:
X_train.columns

Index(['ID', 'Month of absence', 'Day of the week', 'Transportation expense',
       'Distance from Residence to Work', 'Service time', 'Age',
       'Work load Average/day ', 'Education', 'Son', 'Social drinker',
       'Absenteeism time in hours'],
      dtype='object')

In [22]:
num_columns = ['ID', 'Month of absence', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Son', 'Absenteeism time in hours']

cat_columns = ['Day of the week', 'Education', 'Social drinker']

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
    #('poly', PolynomialFeatures(degree=3, include_bias=False)),
    #('power', PowerTransformer(method='yeo-johnson'))
])

categoric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('numeric', numeric_pipeline, num_columns),
    ('categoric', categoric_pipeline, cat_columns)
])

pipeSVM = Pipeline([
    ("prep", preprocessor),
    ("algo", SVC(max_iter=400, probability=True))
])

pipeLR = Pipeline([
    ("prep", preprocessor),
    ("algo", LogisticRegression())
])

pipeKNN = Pipeline([
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier())
])
               
pipeDT = Pipeline([
    ("prep", preprocessor),
    ("algo", DecisionTreeClassifier())
])
            
pipeRF = Pipeline([
    ("prep", preprocessor),
    ("algo", RandomForestClassifier())
])

## Base Model (KNN)

In [23]:
pipeKNN.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['ID', 'Month of absence',
                                                   'Transportation expense',
                                                   'Distance from Residence to '
                                                   'Work',
                                                   'Service time', 'Age',
                                                   'Work load Average/day ',
                                                   'Son',
                                                   'Absenteeism time in '
           

In [24]:
y_KNN_test = pipeKNN.predict(X_test)

In [25]:
df_KNN = Eva_Matrix1(pipeKNN, X_train, y_train, X_test, y_test, "Pipeline KNN")
df_KNN

Unnamed: 0,Training Pipeline KNN,Test Pipeline KNN
Accuracy,0.806104,0.728571
Recall,0.925287,0.91954
Precision,0.79703,0.720721
F1,0.856383,0.856383


In [30]:
print(classification_report(y_test, y_KNN_test))

              precision    recall  f1-score   support

           0       0.76      0.42      0.54        53
           1       0.72      0.92      0.81        87

    accuracy                           0.73       140
   macro avg       0.74      0.67      0.67       140
weighted avg       0.74      0.73      0.71       140



## Base Model (LogisticRegression)

In [27]:
pipeLR.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['ID', 'Month of absence',
                                                   'Transportation expense',
                                                   'Distance from Residence to '
                                                   'Work',
                                                   'Service time', 'Age',
                                                   'Work load Average/day ',
                                                   'Son',
                                                   'Absenteeism time in '
           

In [28]:
y_LR_test = pipeLR.predict(X_test)

In [29]:
df_LR = Eva_Matrix1(pipeLR, X_train, y_train, X_test, y_test, "Pipeline LR")
df_LR

Unnamed: 0,Training Pipeline LR,Test Pipeline LR
Accuracy,0.766607,0.764286
Recall,0.899425,0.91954
Precision,0.767157,0.754717
F1,0.828042,0.828042


In [31]:
print(classification_report(y_test, y_LR_test))

              precision    recall  f1-score   support

           0       0.79      0.51      0.62        53
           1       0.75      0.92      0.83        87

    accuracy                           0.76       140
   macro avg       0.77      0.71      0.72       140
weighted avg       0.77      0.76      0.75       140



## Base Model (SVM)

In [32]:
pipeSVM.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['ID', 'Month of absence',
                                                   'Transportation expense',
                                                   'Distance from Residence to '
                                                   'Work',
                                                   'Service time', 'Age',
                                                   'Work load Average/day ',
                                                   'Son',
                                                   'Absenteeism time in '
           

In [33]:
y_SVM_test = pipeSVM.predict(X_test)

In [34]:
df_SVM = Eva_Matrix1(pipeSVM, X_train, y_train, X_test, y_test, "Pipeline SVM")
df_SVM

Unnamed: 0,Training Pipeline SVM,Test Pipeline SVM
Accuracy,0.800718,0.742857
Recall,0.925287,0.942529
Precision,0.791155,0.725664
F1,0.85298,0.85298


In [35]:
print(classification_report(y_test, y_SVM_test))

              precision    recall  f1-score   support

           0       0.81      0.42      0.55        53
           1       0.73      0.94      0.82        87

    accuracy                           0.74       140
   macro avg       0.77      0.68      0.69       140
weighted avg       0.76      0.74      0.72       140



## Base Model (DecisionTreeClassifier)

In [36]:
pipeDT.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['ID', 'Month of absence',
                                                   'Transportation expense',
                                                   'Distance from Residence to '
                                                   'Work',
                                                   'Service time', 'Age',
                                                   'Work load Average/day ',
                                                   'Son',
                                                   'Absenteeism time in '
           

In [37]:
y_DT_test = pipeDT.predict(X_test)

In [38]:
df_DT = Eva_Matrix1(pipeDT, X_train, y_train, X_test, y_test, "Pipeline DT")
df_DT

Unnamed: 0,Training Pipeline DT,Test Pipeline DT
Accuracy,0.996409,0.714286
Recall,0.994253,0.816092
Precision,1.0,0.747368
F1,0.997118,0.997118


In [39]:
print(classification_report(y_test, y_DT_test))

              precision    recall  f1-score   support

           0       0.64      0.55      0.59        53
           1       0.75      0.82      0.78        87

    accuracy                           0.71       140
   macro avg       0.70      0.68      0.69       140
weighted avg       0.71      0.71      0.71       140



## Base Model (RandomForestClassifier)

In [40]:
pipeRF.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['ID', 'Month of absence',
                                                   'Transportation expense',
                                                   'Distance from Residence to '
                                                   'Work',
                                                   'Service time', 'Age',
                                                   'Work load Average/day ',
                                                   'Son',
                                                   'Absenteeism time in '
           

In [41]:
y_RF_test = pipeRF.predict(X_test)

In [42]:
df_RF = Eva_Matrix1(pipeRF, X_train, y_train, X_test, y_test, "Pipeline RF")
df_RF

Unnamed: 0,Training Pipeline RF,Test Pipeline RF
Accuracy,0.996409,0.785714
Recall,1.0,0.862069
Precision,0.994286,0.806452
F1,0.997135,0.997135


In [43]:
print(classification_report(y_test, y_RF_test))

              precision    recall  f1-score   support

           0       0.74      0.66      0.70        53
           1       0.81      0.86      0.83        87

    accuracy                           0.79       140
   macro avg       0.78      0.76      0.77       140
weighted avg       0.78      0.79      0.78       140



# Improvement Algoritma (Hyper Parameter Tuning)

## KNN (Hyper Parameter Tuning)

In [44]:
param_KNN = {
    "algo__n_neighbors" : np.arange(1, 51, 2),
    "algo__p" : [1, 2],
    "algo__weights" : ['uniform', 'distance']
}

In [45]:
skf = StratifiedKFold(n_splits = 3, random_state = 42)

In [46]:
KNN_GS = GridSearchCV(pipeKNN, param_KNN, cv = skf, scoring = 'balanced_accuracy', n_jobs = -1, verbose = 1)

In [47]:
KNN_GS.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   17.9s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          RobustScaler())]),
                                                                         ['ID',
                                                                          'Month '
                                                                          'of '
                                                                          'absence',
                                             

In [48]:
KNN_Tuned = KNN_GS.best_estimator_

In [49]:
y_train_KNN_Tuned = KNN_Tuned.predict(X_train)
y_test_KNN_Tuned = KNN_Tuned.predict(X_test)

In [50]:
df_KNN_Tuned = Eva_Matrix1(KNN_Tuned, X_train, y_train, X_test, y_test, "Pipeline KNN Tuned")
df_KNN_Tuned

Unnamed: 0,Training Pipeline KNN Tuned,Test Pipeline KNN Tuned
Accuracy,0.996409,0.685714
Recall,0.994253,0.816092
Precision,1.0,0.717172
F1,0.997118,0.997118


In [51]:
print(classification_report(y_test, y_test_KNN_Tuned))

              precision    recall  f1-score   support

           0       0.61      0.47      0.53        53
           1       0.72      0.82      0.76        87

    accuracy                           0.69       140
   macro avg       0.66      0.64      0.65       140
weighted avg       0.68      0.69      0.68       140



## LogisticRegression (Hyper Parameter Tuning)

In [52]:
param_LR = {
    "algo__C" : np.logspace(-3, 3, 7),
    "algo__penalty" : ['l1', 'l2', 'elasticnet']
}

In [53]:
skf = StratifiedKFold(n_splits = 3, random_state = 42)

In [54]:
LR_GS = GridSearchCV(pipeLR, param_LR, cv = skf, scoring = 'balanced_accuracy', n_jobs = -1, verbose = 1)

In [55]:
LR_GS.fit(X_train, y_train)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    1.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          RobustScaler())]),
                                                                         ['ID',
                                                                          'Month '
                                                                          'of '
                                                                          'absence',
                                             

In [56]:
LR_Tuned = LR_GS.best_estimator_

In [57]:
y_train_LR_Tuned = LR_Tuned.predict(X_train)
y_test_LR_Tuned = LR_Tuned.predict(X_test)

In [58]:
df_LR_Tuned = Eva_Matrix1(LR_Tuned, X_train, y_train, X_test, y_test, "Pipeline LR Tuned")
df_LR_Tuned

Unnamed: 0,Training Pipeline LR Tuned,Test Pipeline LR Tuned
Accuracy,0.764811,0.75
Recall,0.896552,0.896552
Precision,0.766585,0.75
F1,0.82649,0.82649


In [59]:
print(classification_report(y_test, y_test_LR_Tuned))

              precision    recall  f1-score   support

           0       0.75      0.51      0.61        53
           1       0.75      0.90      0.82        87

    accuracy                           0.75       140
   macro avg       0.75      0.70      0.71       140
weighted avg       0.75      0.75      0.74       140



## SVM (Hyper Parameter Tuning)

In [60]:
param_SVM = {
    'algo__C' : np.logspace(-3,3,7),
    'algo__gamma' : np.arange(10, 101, 10),
    'algo__class_weight' : [{0 : 0.4, 1 : 0.6}, 'balanced', {0 : 0.3, 1 : 0.7}]
}

In [61]:
skf = StratifiedKFold(n_splits = 3, random_state = 42)

In [62]:
SVM_GS = GridSearchCV(pipeSVM, param_SVM, cv = skf, scoring = 'balanced_accuracy', n_jobs = -1, verbose = 1)

In [63]:
SVM_GS.fit(X_train, y_train)

Fitting 3 folds for each of 210 candidates, totalling 630 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 630 out of 630 | elapsed:   50.4s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          RobustScaler())]),
                                                                         ['ID',
                                                                          'Month '
                                                                          'of '
                                                                          'absence',
                                             

In [64]:
SVM_Tuned = SVM_GS.best_estimator_

In [65]:
y_train_SVM_Tuned = SVM_Tuned.predict(X_train)
y_test_SVM_Tuned = SVM_Tuned.predict(X_test)

In [66]:
df_SVM_Tuned = Eva_Matrix1(SVM_Tuned, X_train, y_train, X_test, y_test, "Pipeline SVM Tuned")
df_SVM_Tuned

Unnamed: 0,Training Pipeline SVM Tuned,Test Pipeline SVM Tuned
Accuracy,0.994614,0.6
Recall,0.994253,0.91954
Precision,0.997118,0.620155
F1,0.995683,0.995683


In [67]:
print(classification_report(y_test, y_test_SVM_Tuned))

              precision    recall  f1-score   support

           0       0.36      0.08      0.12        53
           1       0.62      0.92      0.74        87

    accuracy                           0.60       140
   macro avg       0.49      0.50      0.43       140
weighted avg       0.52      0.60      0.51       140



## DecisionTreeClassifier (Hyper Parameter Tuning)

In [68]:
param_DT = {
    "algo__max_depth" : [None, 5, 10, 15],
    "algo__min_samples_leaf" : np.arange(1, 20, 5),
    "algo__max_features" : np.arange(0, 1.1, 0.3)
}

In [69]:
skf = StratifiedKFold(n_splits = 3, random_state = 42)

In [70]:
DT_GS = GridSearchCV(pipeDT, param_DT, cv = skf, scoring = 'balanced_accuracy', n_jobs = -1, verbose = 1)

In [71]:
DT_GS.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:    5.9s finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          RobustScaler())]),
                                                                         ['ID',
                                                                          'Month '
                                                                          'of '
                                                                          'absence',
                                             

In [72]:
DT_GS.best_params_

{'algo__max_depth': 10,
 'algo__max_features': 0.8999999999999999,
 'algo__min_samples_leaf': 16}

In [73]:
DT_Tuned = DT_GS.best_estimator_

In [74]:
y_train_DT_Tuned = DT_Tuned.predict(X_train)
y_test_DT_Tuned = DT_Tuned.predict(X_test)

In [75]:
df_DT_Tuned = Eva_Matrix1(DT_Tuned, X_train, y_train, X_test, y_test, "Pipeline DT Tuned")
df_DT_Tuned

Unnamed: 0,Training Pipeline DT Tuned,Test Pipeline DT Tuned
Accuracy,0.807899,0.685714
Recall,0.853448,0.816092
Precision,0.84136,0.717172
F1,0.847361,0.847361


In [76]:
print(classification_report(y_test, y_test_DT_Tuned))

              precision    recall  f1-score   support

           0       0.61      0.47      0.53        53
           1       0.72      0.82      0.76        87

    accuracy                           0.69       140
   macro avg       0.66      0.64      0.65       140
weighted avg       0.68      0.69      0.68       140



## RandomForestClassifier (Hyper Parameter Tuning)

In [77]:
param_RF = {
    "algo__n_estimators" : np.arange(100, 301, 100), # ada 3
    "algo__max_depth" : [None, 5, 10], # ada 4
    "algo__min_samples_leaf" : np.arange(1, 12, 5), # ada 4
    "algo__max_features" : [0.3, 0.5, 0.7, 0.8] # ada 4
}

In [78]:
skf = StratifiedKFold(n_splits = 3, random_state = 42)

In [79]:
RF_GS = GridSearchCV(pipeRF, param_RF, cv = skf, scoring = 'balanced_accuracy', n_jobs = -1, verbose = 1)

In [80]:
RF_GS.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          RobustScaler())]),
                                                                         ['ID',
                                                                          'Month '
                                                                          'of '
                                                                          'absence',
                                             

In [81]:
RF_Tuned = RF_GS.best_estimator_

In [82]:
y_train_RF_Tuned = RF_Tuned.predict(X_train)
y_test_RF_Tuned = RF_Tuned.predict(X_test)

In [83]:
df_RF_Tuned = Eva_Matrix1(RF_Tuned, X_train, y_train, X_test, y_test, "Pipeline RF Tuned")
df_RF_Tuned

Unnamed: 0,Training Pipeline RF Tuned,Test Pipeline RF Tuned
Accuracy,0.800718,0.785714
Recall,0.876437,0.862069
Precision,0.817694,0.806452
F1,0.846047,0.846047


In [84]:
print(classification_report(y_test, y_test_RF_Tuned))

              precision    recall  f1-score   support

           0       0.74      0.66      0.70        53
           1       0.81      0.86      0.83        87

    accuracy                           0.79       140
   macro avg       0.78      0.76      0.77       140
weighted avg       0.78      0.79      0.78       140



### Penjelasan Classification Report
- Precision 0 dan Recall 1 searah, jika nilai Precision 0 naik maka nilai Recall 1 juga akan naik, begitu juga sebaliknya.
- Precision 1 dan Recall 0 searah, jika nilai Precision 1 naik maka nilai Recall 0 juga akan naik, begitu juga sebaliknya.
- Precision 0 dan Precision 1 berbanding terbalik, jika nilai Precision 0 naik maka nilai Precision 1 akan turun, begitu juga sebaliknya.
- Recall 0 dan Recall 1 berbanding terbalik, jika nilai Recall 0 naik maka nilai Recall 1 akan turun, begitu juga sebaliknya.
- Accuracy dapat digunakan ketika dataset Balance.
- Accuracy merupakan perbandingan antara Jumlah Seluruh Tebakan / Prediksi yang Benar (True) dibandingkan dengan Jumlah Seluruh Data / Tebakan / Prediksi.
- Precision menargetkan False Positive sekecil mungkin. (Ini diabaikan)
- Recall menargetkan False Negative sekecil mungkin. (Ini diabaikan)

Fokus Model ini pada **Accuracy** dikarenakan pada Kolom Target **(Reason)** valuenya masing-masing **balance**. Model terbaik menghasilkan **Nilai Accuracy 0.79** pada perbandingan y_test dengan y_test_RF_Tuned dengan menggunakan **(Model Random Forest Classifier dengan Hyper Parameter Tuning) dan tidak terlihat Overfit dari segi Nilai data Train & Test**.

In [85]:
import joblib
joblib.dump(RF_Tuned, 'Model_RF_Tuned')

['Model_RF_Tuned']

# Conclusion
- Untuk Alasan Karyawan Absen/Tidak Masuk Kerja yang termasuk golongan Without ICD seharusnya dapat ditoleransi oleh Karyawan tersebut, sehingga karyawan tidak perlu Absen/Tidak masuk Kerja. Absen/Tidak Masuk kerja yang wajar adalah untuk Karyawan yang sedang sakit dirawat atau terkena Disease, atau ada Sanak/Saudara yang meninggal. 
- Ada sebesar 62,4% Karyawan yang melakukan Absen/Tidak Masuk Kerja dengan Without ICD, dan 37,6% Karyawan yang melakukan Absen/Tidak Masuk Kerja dengan ICD.
- Paling banyak Karyawan Absen/tidak masuk kerja selama 8 jam.
- Karyawan yang rumah tinggalnya jaraknya dekat ke kantor paling jarang melakukan Absen/Tidak Masuk Kerja.
- Karyawan yang berlatar belakang pendidikan High School paling sering melakukan Absen/Tidak Masuk Kerja.
- Season, Hit target, Social smoker, Pet, Weight, Height, BMI dari data tidak mempengaruhi Karyawan untuk melakukan Absen/Tidak Masuk Kerja.
- Model terbaik yang digunakan untuk Machine Learning ini adalah Model Random Forest Classifier dengan Hyper Parameter Tuning yang menggunakan best_estimator_ dengan nilai Accuracy untuk data Train 0.800718 dan untuk data Test 0.785714.

# Recommendation
- Dengan adanya Machine Learning, Perusahaan dapat melakukan prediksi jika si Karyawan melakukan Absen/Tidak Masuk Kerja, apabila si Karyawan tetap melakukan Absen/Tidak Masuk Kerja dengan Reason **Without ICD** maka perusahaan dapat melakukan tindakan kepada si Karyawan tersebut.
- Dikarenakan banyaknya Karyawan yang sering Absen/Tidak Masuk Kerja dengan Reason **Without ICD** maka Perusahaan perlu mempertimbangkan untuk menyediakan Dokter Umum di Kantor, agar bagi para pekerja yang merasa untuk butuh Medical Consultation (Alasan Without ICD paling tinggi dikarenakan mereka Medical Consultation) dapat berkonsultasi dengan Dokter Umum di Kantor sehingga tidak perlu Absen/Tidak Masuk Kerja.
- Untuk Kategori yang termasuk **Without ICD** seperti Patient Follow Up, Medical Consultation, Blood Donation, Laboratory Examination, Physiotherapy, Dental Consultation bagi Perusahaan perlu disediakan Dokter/Tenaga Medis yang dapat menangani hal tersebut.
- Perusahaan harus memberikan penghargaan bagi Karyawan yang paling sedikit untuk Absen/Tidak Masuk Kerja, sehingga para Karyawan akan semakin semangat bekerja dan berlomba-lomba untuk menjadi yang paling sedikit Absen/Tidak Masuk Kerja.