In [1]:
import pandas as pd

data = pd.read_csv(r'../../data/processed/processed-data.csv')
data.head()

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,WaitingTime,AppointmentDayOfWeek,WaitGroup,ChronicGroup,No-show
0,0,62,39,0,1,0,0,1,0,-1,4,5,2,0
1,1,56,39,0,0,0,0,1,0,-1,4,5,1,0
2,0,62,45,0,0,0,0,1,0,-1,4,5,1,0
3,0,8,54,0,0,0,0,1,0,-1,4,5,1,0
4,0,56,39,0,1,1,0,1,0,-1,4,5,0,0


## First'o fall extract X and Y from our dataset.

In [2]:
x = data.drop('No-show', axis=1)
y = data['No-show']

In [3]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(xtrain, ytrain)

In [5]:
ypred = model.predict(xtest)

In [6]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(ytest, ypred)

print("Test Accuracy of the Best Model:", score)
print("\nClassification Report of the Best Model:")
print(classification_report(ytest, ypred))

Test Accuracy of the Best Model: 0.7646282736328254

Classification Report of the Best Model:
              precision    recall  f1-score   support

           0       0.85      0.71      0.77     16900
           1       0.69      0.84      0.76     13418

    accuracy                           0.76     30318
   macro avg       0.77      0.77      0.76     30318
weighted avg       0.78      0.76      0.77     30318



### Let's scale our data to get more accurate result.

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)

xscaled = scaler.fit_transform(x)

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(xscaled, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(xtrain, ytrain)

In [10]:
ypred = model.predict(xtest)

In [11]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(ytest, ypred)

print("Test Accuracy of the Best Model:", score)
print("\nClassification Report of the Best Model:")
print(classification_report(ytest, ypred))

Test Accuracy of the Best Model: 0.7420674186951646

Classification Report of the Best Model:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76     16900
           1       0.70      0.74      0.72     13418

    accuracy                           0.74     30318
   macro avg       0.74      0.74      0.74     30318
weighted avg       0.74      0.74      0.74     30318



## Hyperperameter Tunning.

In [14]:
model = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='ball_tree',
    n_jobs=-1,
)

model.fit(xtrain, ytrain)

In [15]:
ypred = model.predict(xtest)

In [16]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(ytest, ypred)

print("Test Accuracy of the Best Model:", score)
print("\nClassification Report of the Best Model:")
print(classification_report(ytest, ypred))

Test Accuracy of the Best Model: 0.7670360841744178

Classification Report of the Best Model:
              precision    recall  f1-score   support

           0       0.81      0.75      0.78     16900
           1       0.72      0.78      0.75     13418

    accuracy                           0.77     30318
   macro avg       0.77      0.77      0.77     30318
weighted avg       0.77      0.77      0.77     30318



## GridSearchCV

In [17]:
param_grids = {
    'n_neighbors': [3, 5],
    'algorithm': ['ball_tree', 'kd_tree'],
    'weights': ['uniform', 'distance']
}

In [18]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grids,
    verbose=1, 
    n_jobs=-1
)

In [None]:
grid_search.fit(xtrain, ytrain)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

best_knn_model = grid_search.best_estimator_

In [None]:
from sklearn.metrics import classification_report

y_pred = best_knn_model.predict(xtest)
accuracy = accuracy_score(ytest, y_pred)

print("Test Accuracy of the Best Model:", accuracy)
print("\nClassification Report of the Best Model:")
print(classification_report(ytest, y_pred))