In [5]:
# STEP 1: Upload file from local (for Google Colab)
# Prompts for a local file and stores the result of files.upload() in `uploaded`.
# NOTE(review): when a file with the same name already exists, Colab saves the
# new upload under a suffixed name (e.g. "name (2).csv") - check which file
# is actually read afterwards.
from google.colab import files
uploaded = files.upload()  # Select your CSV file when prompted

# STEP 2: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# STEP 3: Load and clean your uploaded CSV file
# Colab stores a re-uploaded file under a suffixed name (e.g. "name (2).csv"),
# so a hard-coded path can silently read a stale copy from an earlier upload.
# Parse the bytes returned by files.upload() so the data always comes from the
# file that was just selected.
import io

if uploaded:
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a
    # file that may already be present in the working directory.
    df = pd.read_csv('healthcare_noshows_appointments.csv')

# DEBUG: Show raw column names with hidden characters
# repr() makes otherwise invisible characters (BOM, stray spaces) visible.
print("Column names before cleaning:")
for col in df.columns:
    print(repr(col))

# Clean column names
# Remove the BOM first and strip afterwards: '\ufeff' is not whitespace, so
# stripping first would leave any spaces sitting between a BOM and the name.
# regex=False because this is a literal character, not a pattern.
df.columns = df.columns.str.replace('\ufeff', '', regex=False).str.strip()

# Confirm cleaned names
print("\nColumn names after cleaning:", df.columns.tolist())

# STEP 4: Initial exploration
# Preview the first rows and count the missing values in every column.
head_preview = df.head()
missing_per_column = df.isnull().sum()
print("\nFirst 5 rows:\n", head_preview)
print("\nMissing values:\n", missing_per_column)

# STEP 5: Convert date columns to datetime
# Both date columns are parsed in place so that date arithmetic and the
# .dt accessor can be used on them.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    df[date_col] = pd.to_datetime(df[date_col])

# STEP 6: Feature Engineering
# Whole days between scheduling and the appointment. Negative gaps (appointment
# dated before it was scheduled) are floored at 0; clip() does this in one
# vectorized pass instead of calling a Python lambda per row.
df['DaysBetween'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days.clip(lower=0)
df['AppointmentWeekday'] = df['AppointmentDay'].dt.dayofweek  # 0=Monday

# STEP 7: Encode categorical variables
# Each text column gets its own fitted encoder, kept under its own name so the
# integer codes can be mapped back to the original labels later.
le_gender = LabelEncoder()
gender_codes = le_gender.fit_transform(df['Gender'])
df['Gender'] = gender_codes

le_neighbourhood = LabelEncoder()
neighbourhood_codes = le_neighbourhood.fit_transform(df['Neighbourhood'])
df['Neighbourhood'] = neighbourhood_codes

# STEP 8: Define features and target variable
# Columns fed to the model; 'DaysBetween' and 'AppointmentWeekday' are the
# engineered columns, the rest come straight from the CSV.
features = [
    'Gender', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
    'Diabetes', 'Alcoholism', 'SMS_received', 'DaysBetween', 'AppointmentWeekday',
]
# This CSV has no 'No-show' column; the outcome column is named 'Showed_up'.
# NOTE(review): presumably 'Showed_up' has the opposite polarity of a
# "no-show" flag - confirm before interpreting the positive class.
target = 'Showed_up'

# ... (Rest of your code for model training, prediction, and evaluation)

Saving healthcare_noshows_appointments.csv to healthcare_noshows_appointments (2).csv
Column names before cleaning:
'PatientId'
'AppointmentID'
'Gender'
'ScheduledDay'
'AppointmentDay'
'Age'
'Neighbourhood'
'Scholarship'
'Hipertension'
'Diabetes'
'Alcoholism'
'Handcap'
'SMS_received'
'Showed_up'
'Date.diff'

Column names after cleaning: ['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'Showed_up', 'Date.diff']

First 5 rows:
       PatientId  AppointmentID Gender ScheduledDay AppointmentDay  Age  \
0  2.987250e+13        5642903      F   2016-04-29     2016-04-29   62   
1  5.589978e+14        5642503      M   2016-04-29     2016-04-29   56   
2  4.262962e+12        5642549      F   2016-04-29     2016-04-29   62   
3  8.679512e+11        5642828      F   2016-04-29     2016-04-29    8   
4  8.841186e+12        5642494      F   2016-04-29     2016-04-29 