In [1]:
import numpy as np
import pandas as pd

data = pd.read_csv(r'../../data/processed/processed-data.csv')
data.head()

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,WaitingTime,AppointmentDayOfWeek,WaitGroup,ChronicGroup,No-show
0,0,62,39,0,1,0,0,1,0,-1,4,5,2,0
1,1,56,39,0,0,0,0,1,0,-1,4,5,1,0
2,0,62,45,0,0,0,0,1,0,-1,4,5,1,0
3,0,8,54,0,0,0,0,1,0,-1,4,5,1,0
4,0,56,39,0,1,1,0,1,0,-1,4,5,0,0


### Fetch X and Y's for our training from dataset.

In [2]:
x = data.drop('No-show', axis=1)
y = data['No-show']

In [3]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler


estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('svr', make_pipeline(StandardScaler(),
                          LinearSVC(random_state=42)))
]

model = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

ypred = model.fit(xtrain, ytrain).predict(xtest)

In [5]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(ytest, ypred)

print("Test Accuracy of the Best Model:", score)
print("\nClassification Report of the Best Model:")
print(classification_report(ytest, ypred))

Test Accuracy of the Best Model: 0.7803615014182994

Classification Report of the Best Model:
              precision    recall  f1-score   support

           0       0.80      0.81      0.80     16900
           1       0.75      0.75      0.75     13418

    accuracy                           0.78     30318
   macro avg       0.78      0.78      0.78     30318
weighted avg       0.78      0.78      0.78     30318



## Let's scale our data for best accuracy.

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(x)
xscaled = scaler.fit_transform(x)

In [7]:
xtrain, xtest, ytrain, ytest = train_test_split(xscaled, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB

estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('svr', make_pipeline(StandardScaler(),
                          LinearSVC(random_state=42))),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('bnb', BernoulliNB())
]

model = StackingClassifier(
    estimators=estimators, 
    final_estimator=LogisticRegression(),
    verbose=1,
    n_jobs=-1
)

ypred = model.fit(xtrain, ytrain).predict(xtest)

In [11]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(ytest, ypred)

print("Test Accuracy of the Best Model:", score)
print("\nClassification Report of the Best Model:")
print(classification_report(ytest, ypred))

Test Accuracy of the Best Model: 0.7839897090837127

Classification Report of the Best Model:
              precision    recall  f1-score   support

           0       0.80      0.81      0.81     16900
           1       0.76      0.75      0.75     13418

    accuracy                           0.78     30318
   macro avg       0.78      0.78      0.78     30318
weighted avg       0.78      0.78      0.78     30318

