In [1]:
##########################


In [2]:
#  Importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [3]:
#  Load the dataset
file_path = "path/to/your/dataset.csv"  # Replace with your dataset path
df = pd.read_csv("KaggleV2-May-2016.csv" )


print("\n🔎 Dataset shape before cleaning:", df.shape)
print("\n🔎 First 5 rows of the dataset:")
print(df.head())

#  Check for missing values
print("\n🔎 Missing values in each column:")
print(df.isnull().sum())


print("\n🔎 Unique values in 'No-show':", df['No-show'].unique())
print("\n🔎 Value counts in 'No-show':")
print(df['No-show'].value_counts(dropna=False))



🔎 Dataset shape before cleaning: (110527, 14)

🔎 First 5 rows of the dataset:
      PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             0   
2  2016-04-29T00:00:00Z   62      MATA DA PRAIA            0             0   
3  2016-04-29T00:00:00Z    8  PONTAL DE CAMBURI            0             0   
4  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             1   

   Diabetes  Alcoholism  Handcap  SMS_received No-show  
0         0     

In [4]:

# Replace missing values in categorical columns with mode
df.fillna({col: df[col].mode()[0] for col in df.select_dtypes(include='object').columns}, inplace=True)

# Replace missing values in numerical columns with median
df.fillna({col: df[col].median() for col in df.select_dtypes(include='number').columns}, inplace=True)



In [5]:

df['No-show'] = df['No-show'].str.strip().str.capitalize()


df['No-show'] = df['No-show'].map({'No': 0, 'Yes': 1})

#  Remove invalid rows
df = df.dropna(subset=['No-show'])


In [6]:

df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay']).astype('int64') // 10**9
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay']).astype('int64') // 10**9


In [7]:
# One-hot encode categorical features
df = pd.get_dummies(df, drop_first=True)


In [8]:

X = df.drop('No-show', axis=1)
y = df['No-show']

#  Check the shape
print("\n X shape:", X.shape)
print("\n y shape:", y.shape)



 X shape: (110527, 92)

 y shape: (110527,)


In [9]:
# Split the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n Training set shape:", X_train.shape, y_train.shape)
print("\n Testing set shape:", X_test.shape, y_test.shape)



 Training set shape: (88421, 92) (88421,)

 Testing set shape: (22106, 92) (22106,)


In [10]:
#  Train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

#  Train Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)


In [11]:
#  Make predictions
y_pred_log = log_reg.predict(X_test)
y_pred_rf = rf.predict(X_test)


In [12]:
#  Function to evaluate the models
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"\n {model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


In [13]:
#  Logistic Regression Results
evaluate_model(y_test, y_pred_log, "Logistic Regression")

#  Random Forest Results
evaluate_model(y_test, y_pred_rf, "Random Forest")



 Logistic Regression Results:
Accuracy: 0.7993
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000

Confusion Matrix:
[[17669     0]
 [ 4437     0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     17669
           1       0.00      0.00      0.00      4437

    accuracy                           0.80     22106
   macro avg       0.40      0.50      0.44     22106
weighted avg       0.64      0.80      0.71     22106


 Random Forest Results:
Accuracy: 0.8067
Precision: 0.5777
Recall: 0.1375
F1-Score: 0.2221

Confusion Matrix:
[[17223   446]
 [ 3827   610]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89     17669
           1       0.58      0.14      0.22      4437

    accuracy                           0.81     22106
   macro avg       0.70      0.56      0.56     22106
weighted avg       0.77      0.81      0.76     22106



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#  Logistic Regression
evaluate_model(y_test, y_pred_log, "Logistic Regression")

#  Random Forest Results
evaluate_model(y_test, y_pred_rf, "Random Forest")
###

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



 Logistic Regression Results:
Accuracy: 0.7993
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000

Confusion Matrix:
[[17669     0]
 [ 4437     0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     17669
           1       0.00      0.00      0.00      4437

    accuracy                           0.80     22106
   macro avg       0.40      0.50      0.44     22106
weighted avg       0.64      0.80      0.71     22106


 Random Forest Results:
Accuracy: 0.8067
Precision: 0.5777
Recall: 0.1375
F1-Score: 0.2221

Confusion Matrix:
[[17223   446]
 [ 3827   610]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89     17669
           1       0.58      0.14      0.22      4437

    accuracy                           0.81     22106
   macro avg       0.70      0.56      0.56     22106
weighted avg       0.77      0.81      0.76     22106



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:

log_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg.fit(X_train, y_train)

# Random Forest with Class Weights
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)


y_pred_log_weighted = log_reg.predict(X_test)
y_pred_rf_weighted = rf.predict(X_test)

#  Evaluate models
print("\n Logistic Regression with Class Weights:")
evaluate_model(y_test, y_pred_log_weighted, "Logistic Regression with Class Weights")

print("\n Random Forest with Class Weights:")
evaluate_model(y_test, y_pred_rf_weighted, "Random Forest with Class Weights")




 Logistic Regression with Class Weights:

 Logistic Regression with Class Weights Results:
Accuracy: 0.7993
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000

Confusion Matrix:
[[17669     0]
 [ 4437     0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     17669
           1       0.00      0.00      0.00      4437

    accuracy                           0.80     22106
   macro avg       0.40      0.50      0.44     22106
weighted avg       0.64      0.80      0.71     22106


 Random Forest with Class Weights:

 Random Forest with Class Weights Results:
Accuracy: 0.8059
Precision: 0.5731
Recall: 0.1298
F1-Score: 0.2117

Confusion Matrix:
[[17240   429]
 [ 3861   576]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89     17669
           1       0.57      0.13      0.21      4437

    accuracy                           0.81     22106
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
