#Medical Appointment No Show

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Ask the user for the location of the CSV file to analyse.
filepath = input("Enter the file path: ")



In [None]:
# Load the CSV at the user-supplied path into a DataFrame.
df = pd.read_csv(filepath)

In [None]:
# Preview the first seven rows of the raw data.
df.head(n=7)

In [None]:
# Summary statistics of the loaded data.
df.describe()

In [None]:
# Print a concise summary of the DataFrame (columns and their dtypes).
df.info()

In [None]:
# Count the missing entries in each column of the dataset.
missing_per_column = df.isna().sum()
missing_per_column


In [None]:
# Count the rows that are exact repeats of an earlier row.
duplicate_row_count = df.duplicated().sum()
duplicate_row_count

In [None]:
#select specific columns
#df_features = df[["Gender", "Age", "Scholarship", "Hipertension", "Diabetes", "Alcoholism", "Handcap", "SMS_received"]]

In [None]:
# Drop the columns that are not used as features; the result is kept in df_drop
# and the original df is left unchanged.
columns_to_remove = [
    "PatientId",
    "AppointmentID",
    "ScheduledDay",
    "AppointmentDay",
    "Neighbourhood",
]
df_drop = df.drop(columns=columns_to_remove)

In [None]:

# Preview the first rows of the reduced DataFrame.
df_drop.head()

In [None]:
# Summary statistics of the reduced DataFrame.
df_drop.describe()

In [None]:
# Exploratory data analysis: number of rows for each "No-show" value.
sns.countplot(data=df_drop, x="No-show")


In [None]:
# Bar plot of Age for each Gender, split by the "No-show" value.
sns.barplot(data=df_drop, x="Gender", y="Age", hue="No-show")

In [None]:
# Violin plot of Age for each Hipertension value, split by the "No-show" value.
sns.violinplot(data=df_drop, x="Hipertension", y="Age", hue="No-show")


In [None]:
# Encode the class column ("No-show") as integer labels.
# The encoder instance gets its own name so that the imported LabelEncoder
# class is not overwritten; rebinding the class name to an instance makes the
# cell fail when it is run a second time.
label_encoder = LabelEncoder()
df_drop["No-show"] = label_encoder.fit_transform(df_drop["No-show"])

In [None]:
# Show how many rows fall into each encoded "No-show" class.
df_drop["No-show"].value_counts()

In [None]:
# Convert the Gender column to numbers: "F" becomes 0 and "M" becomes 1.
gender_mapping = dict(F=0, M=1)
encoded_gender = df_drop["Gender"].map(gender_mapping)
df_drop["Gender"] = encoded_gender
df_drop.head()

In [None]:
# Scale the Age column to the [0, 1] range with min-max scaling.
# The values are read from df_drop (the frame being modified) rather than the
# raw df, so the two stay aligned even if rows are ever filtered from df_drop.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split, so the held-out rows influence the scaling.
scaler = MinMaxScaler()
age_column = df_drop["Age"].values.reshape(-1, 1)  # the scaler expects a 2-D input
# ravel() flattens the (n, 1) result back to 1-D before storing it as a column.
df_drop["Age"] = scaler.fit_transform(age_column).ravel()
df_drop.head()

In [None]:
# Preview the first rows of the preprocessed DataFrame.
df_drop.head()

In [None]:
# Feature matrix X: every column except the target.
# Target vector y: the "No-show" column.
target_column = "No-show"
X = df_drop.drop(columns=[target_column]).values
y = df_drop[target_column].values

In [None]:

# Split the dataset into training (80%), validation (10%) and test (10%) sets.
# A fixed random_state gives the same split on every run.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Halve the held-out 20% into the validation and test parts.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Report the size of each feature split.
# The labels name the X arrays, so the X shapes (not the y shapes) are printed.
print("X_train.shape:", X_train.shape)
print("X_val.shape:", X_val.shape)
print("X_test.shape:", X_test.shape)

In [None]:
# Train a baseline decision tree on the training split.
# random_state is fixed (as it is for the data splits) so that repeated runs
# produce the same tree and therefore comparable metrics.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Evaluate the decision tree on the validation split.
y_val_pred = model.predict(X_val)
val_report = classification_report(y_val, y_val_pred)
val_matrix = confusion_matrix(y_val, y_val_pred)
print("classification_report:\n", val_report)
print("confusion_matrix:\n", val_matrix)

In [None]:
# Evaluate the decision tree on the test split.
y_test_pred = model.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
test_matrix = confusion_matrix(y_test, y_test_pred)
print("classification_report:\n", test_report)
print("confusion_matrix:\n", test_matrix)

In [None]:
# Optimize the decision tree with an exhaustive grid search, scored by
# 5-fold cross-validated accuracy on the training split.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is not accepted by current scikit-learn releases and makes every
    # candidate that uses it fail to fit. None (consider all features) is used
    # instead, so the grid still covers the estimator's default setting.
    'max_features': [None, 'sqrt', 'log2']
}
model_optimized = DecisionTreeClassifier(random_state=42)
# Perform the grid search; n_jobs=-1 uses all available CPU cores.
grid_search = GridSearchCV(model_optimized, param_grid=param_grid, cv=5, n_jobs=-1, scoring="accuracy")
grid_search.fit(X_train, y_train)


In [None]:
# Report the winning hyper-parameter combination.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Take the estimator that the search fitted with those parameters.
best_model = grid_search.best_estimator_


In [None]:
# Evaluate the tuned decision tree on the test split.
y_test_pred = best_model.predict(X_test)
tuned_report = classification_report(y_test, y_test_pred)
tuned_matrix = confusion_matrix(y_test, y_test_pred)
print("classification_report:\n", tuned_report)
print("confusion_matrix:\n", tuned_matrix)

In [None]:
# Plot the importance that the tuned tree assigns to each feature, largest first.
feature_names = df_drop.drop(columns=["No-show"]).columns
feature_importances = pd.Series(best_model.feature_importances_, index=feature_names)
ranked_importances = feature_importances.sort_values(ascending=False)
ranked_importances.plot(
    kind='bar',
    title='Feature Importance',
    figsize=(10, 6),
    color='cornflowerblue',
)

In [None]:
# Random Forest
# Train a random forest with default hyper-parameters on the training split.
# random_state is fixed (as it is for the data splits) so that repeated runs
# build the same forest and give comparable metrics.
RFmodel = RandomForestClassifier(random_state=42)
RFmodel.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on the validation split.
y_valRF_pred = RFmodel.predict(X_val)
rf_val_report = classification_report(y_val, y_valRF_pred)
rf_val_matrix = confusion_matrix(y_val, y_valRF_pred)
print("classification_report:\n", rf_val_report)
print("confusion_matrix:\n", rf_val_matrix)

In [None]:
# Evaluate the random forest on the test split.
y_testRF_pred = RFmodel.predict(X_test)
rf_test_report = classification_report(y_test, y_testRF_pred)
rf_test_matrix = confusion_matrix(y_test, y_testRF_pred)
print("classification_report:\n", rf_test_report)
print("confusion_matrix:\n", rf_test_matrix)

In [None]:
# Compare random forests built with different numbers of trees.
# Each forest is scored on the test split and its accuracy is appended to
# `accuracy`, in the same order as the n_estimators values below.
accuracy = []
for n_estimators in [10, 50, 100, 200, 500]:
    RFmodel_estimators = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    RFmodel_estimators.fit(X_train, y_train)

    # Score the forest that was just trained with this n_estimators value.
    # Scoring a separately trained default forest would make every accuracy
    # independent of n_estimators.
    y_testRF_pred = RFmodel_estimators.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_testRF_pred))

In [None]:
# Bar chart of the collected accuracy values, one bar per n_estimators setting.
# The Series gets its own name so that the imported accuracy_score function is
# not overwritten; rebinding it would break any later call to accuracy_score.
accuracy_by_n_estimators = pd.Series(accuracy, index=[10, 50, 100, 200, 500])
accuracy_by_n_estimators.plot(kind='bar', title='Accuracy Score', figsize=(10, 6), color='cornflowerblue')


In [None]:

# Report the metrics for the predictions currently stored in y_testRF_pred.
final_report = classification_report(y_test, y_testRF_pred)
final_matrix = confusion_matrix(y_test, y_testRF_pred)
print("classification_report:\n", final_report)
print("confusion_matrix:\n", final_matrix)