In [14]:
from time import perf_counter
from time import time

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [8]:
# Validation split: features first, then labels, both indexed by AppointmentID
val_features, val_labels = (
    pd.read_csv(csv_path, index_col='AppointmentID')
    for csv_path in (
        '../../data/split_data/val_features.csv',
        '../../data/split_data/val_labels.csv',
    )
)

In [9]:
# Peek at the first rows of the validation feature table
val_features.head()

Unnamed: 0_level_0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,AppDay_weekend,DayDiff,MissedApp
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
108480,0,21,0,0,0,0,0,0,0,0,0
109126,0,35,0,0,0,0,0,1,0,14,0
8311,1,20,0,0,0,0,0,1,0,8,0
7578,1,17,0,0,0,0,0,1,0,17,0
5019,0,79,0,1,0,0,0,0,0,2,0


In [10]:
# Peek at the first rows of the validation labels (single 'No-show' column)
val_labels.head()

Unnamed: 0_level_0,No-show
AppointmentID,Unnamed: 1_level_1
108480,0
109126,0
8311,0
7578,0
5019,0


#### Evaluate different models on the validation data set

* Random forest, logistic regression, Gradient boosting

In [11]:
# Load the best model of each type from the training stage, keyed by short name
# (lr = logistic regression, rf = random forest, gb = gradient boosting)
models = {
    short_name: joblib.load(f'../../model/{short_name}_model.pkl')
    for short_name in ['lr', 'rf', 'gb']
}

In [12]:
# Show the loaded estimators together with their hyperparameters
models

{'lr': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'rf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=8, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=64,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'gb': GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                  

In [15]:
def evaluate_models(name, model, features, labels):
    """Score a fitted classifier on one data split and print a one-line summary.

    Args:
        name: Label printed at the start of the summary line.
        model: Fitted estimator exposing ``predict(features)``.
        features: Feature table passed to ``model.predict``.
        labels: True labels, compared against the predictions.

    Prints accuracy, precision and recall (each rounded to 2 decimals) and the
    duration of the ``predict`` call in milliseconds (rounded to 1 decimal).
    Returns None.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for timing
    # short intervals; time() follows the system clock, which can be adjusted
    # mid-run and may be too coarse for a call lasting only a few milliseconds.
    start = perf_counter()
    pred = model.predict(features)
    elapsed_ms = (perf_counter() - start) * 1000

    accuracy = round(accuracy_score(labels, pred), 2)
    precision = round(precision_score(labels, pred), 2)
    recall = round(recall_score(labels, pred), 2)

    print(f'{name} : accuracy: {accuracy} / precision: {precision} / recall: {recall} / time: {round(elapsed_ms, 1)}')

In [17]:
# Print one summary line per loaded model, scored on the validation split
for model_name, estimator in models.items():
    evaluate_models(model_name, estimator, val_features, val_labels)

lr : accuracy: 0.86 / precision: 0.73 / recall: 0.49 / time: 2.0
rf : accuracy: 0.93 / precision: 0.76 / recall: 0.93 / time: 89.1
gb : accuracy: 0.93 / precision: 0.76 / recall: 0.93 / time: 55.9


## Gradient boosting and random forest have the same accuracy, precision, and recall scores on the validation set (to two decimal places), so either could be chosen. Since *gradient boosting* takes less time to predict (55.9 ms vs. 89.1 ms), we choose this model.

In [23]:
# Test split for checking the chosen gradient boosting model:
# features first, then labels, both indexed by AppointmentID
test_features, test_labels = (
    pd.read_csv(csv_path, index_col='AppointmentID')
    for csv_path in (
        '../../data/split_data/test_features.csv',
        '../../data/split_data/test_labels.csv',
    )
)

test_features.head()

Unnamed: 0_level_0,Gender,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,AppDay_weekend,DayDiff,MissedApp
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
26368,1,12,1,0,0,0,0,0,0,0,0
91989,0,66,0,0,0,0,0,0,0,1,0
58142,1,67,0,0,0,0,0,1,0,13,1
82240,1,4,0,0,0,0,0,0,0,37,2
23160,0,39,0,0,0,0,0,0,0,0,2


In [24]:
# Peek at the first rows of the test labels (single 'No-show' column)
test_labels.head()

Unnamed: 0_level_0,No-show
AppointmentID,Unnamed: 1_level_1
26368,0
91989,0
58142,1
82240,1
23160,0


In [25]:
# Load the saved gradient boosting model from disk for the test-set evaluation
gb_model = joblib.load('../../model/gb_model.pkl')

In [26]:
# Show the loaded estimator together with its hyperparameters
gb_model

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=250,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [28]:
# Score the gradient boosting model on the test split
evaluate_models('gb', gb_model, test_features, test_labels)

gb : accuracy: 0.92 / precision: 0.75 / recall: 0.92 / time: 62.9


## Final model: gradient boosting — test-set accuracy: 0.92 / precision: 0.75 / recall: 0.92 / prediction time: 62.9 ms