# Python Script for Hospital appointment no-show analysis

In [62]:
import pandas as pd
import numpy as np


In [64]:
no_show_df = pd.read_csv("C:/Users/Dell/OneDrive/Desktop/Noshowdataset/no_show_dataset.csv")

In [65]:
no_show_df.head()


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [68]:
#check the dtypes 
no_show_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [71]:
#Looking for Null Values 
no_show_df.isnull().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

## Cleaning dataset 


In [74]:
#Rename columns 
no_show_df.rename(columns={"Handcap": "Handicap"},inplace=True)

no_show_df.rename(columns={"PatientId": "PatientID"},inplace=True)

no_show_df.rename(columns={"Hipertension": "Hypertension"},inplace=True)

In [76]:
# Convert datatypes to appropriate dtypes 
no_show_df["PatientID"]= no_show_df["PatientID"].astype(str)
no_show_df["Gender"]= no_show_df["Gender"].astype(str)

In [78]:
no_show_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   PatientID       110527 non-null  object
 1   AppointmentID   110527 non-null  int64 
 2   Gender          110527 non-null  object
 3   ScheduledDay    110527 non-null  object
 4   AppointmentDay  110527 non-null  object
 5   Age             110527 non-null  int64 
 6   Neighbourhood   110527 non-null  object
 7   Scholarship     110527 non-null  int64 
 8   Hypertension    110527 non-null  int64 
 9   Diabetes        110527 non-null  int64 
 10  Alcoholism      110527 non-null  int64 
 11  Handicap        110527 non-null  int64 
 12  SMS_received    110527 non-null  int64 
 13  No-show         110527 non-null  object
dtypes: int64(8), object(6)
memory usage: 11.8+ MB


In [80]:


# Convert to datetime (with UTC timezone)
no_show_df['ScheduledDay'] = pd.to_datetime(
    no_show_df['ScheduledDay'], errors='coerce', utc=True
)

no_show_df['AppointmentDay'] = pd.to_datetime(
    no_show_df['AppointmentDay'], errors='coerce', utc=True
)

# Confirm conversion
print(no_show_df[['ScheduledDay', 'AppointmentDay']].dtypes)


ScheduledDay      datetime64[ns, UTC]
AppointmentDay    datetime64[ns, UTC]
dtype: object


In [81]:
# Keep only the date (YYYY-MM-DD)
no_show_df['ScheduledDay'] = no_show_df['ScheduledDay'].dt.date
no_show_df['AppointmentDay'] = no_show_df['AppointmentDay'].dt.date

# Optional: rename for clarity
no_show_df.rename(
    columns={
        'ScheduledDay': 'ScheduledDate',
        'AppointmentDay': 'AppointmentDate'
    },
    inplace=True
)

# Check result
print(no_show_df[['ScheduledDate', 'AppointmentDate']].head())


  ScheduledDate AppointmentDate
0    2016-04-29      2016-04-29
1    2016-04-29      2016-04-29
2    2016-04-29      2016-04-29
3    2016-04-29      2016-04-29
4    2016-04-29      2016-04-29


In [84]:
#creating Waiting time
no_show_df['WaitingDays'] = (
    pd.to_datetime(no_show_df['AppointmentDate']) - pd.to_datetime(no_show_df['ScheduledDate'])
).dt.days.astype('Int64')
no_show_df = no_show_df[no_show_df['WaitingDays'] >= 0]



In [86]:
# Fixing Age column 
no_show_df["Age"]= no_show_df["Age"][(no_show_df["Age"]>=0) & (no_show_df["Age"]<=100)]
no_show_df= no_show_df.dropna(subset=['Age'])
no_show_df['Age'] = no_show_df['Age'].astype('Int64')


In [88]:
no_show_df

Unnamed: 0,PatientID,AppointmentID,Gender,ScheduledDate,AppointmentDate,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,WaitingDays
0,29872499824296.0,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No,0
1,558997776694438.0,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No,0
2,4262962299951.0,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No,0
3,867951213174.0,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,0
4,8841186448183.0,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,2572134369293.0,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,0,0,0,0,0,1,No,35
110523,3596266328735.0,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,0,0,0,0,0,1,No,35
110524,15576631729893.0,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,0,0,0,0,0,1,No,41
110525,92134931435557.0,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,0,0,0,0,0,1,No,41


In [90]:
# Creating a binary column for no-show 
no_show_df["No-show"]= no_show_df["No-show"].map({'No':0,'Yes':1})

In [92]:
# make gender binary value
no_show_df['Gender'] = (
    no_show_df['Gender']
    .astype(str)       # ensure string
    .str.upper()       # clean
    .str.strip()       # remove extra spaces
    .map({'F': 0, 'M': 1})  # map to binary
    .astype('Int64')   # nullable integer
)

In [94]:
#checking for null values 


In [96]:
no_show_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110514 entries, 0 to 110526
Data columns (total 15 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   PatientID        110514 non-null  object
 1   AppointmentID    110514 non-null  int64 
 2   Gender           110514 non-null  Int64 
 3   ScheduledDate    110514 non-null  object
 4   AppointmentDate  110514 non-null  object
 5   Age              110514 non-null  Int64 
 6   Neighbourhood    110514 non-null  object
 7   Scholarship      110514 non-null  int64 
 8   Hypertension     110514 non-null  int64 
 9   Diabetes         110514 non-null  int64 
 10  Alcoholism       110514 non-null  int64 
 11  Handicap         110514 non-null  int64 
 12  SMS_received     110514 non-null  int64 
 13  No-show          110514 non-null  int64 
 14  WaitingDays      110514 non-null  Int64 
dtypes: Int64(3), int64(8), object(4)
memory usage: 13.8+ MB


In [99]:
no_show_df

Unnamed: 0,PatientID,AppointmentID,Gender,ScheduledDate,AppointmentDate,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,WaitingDays
0,29872499824296.0,5642903,0,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,0,0
1,558997776694438.0,5642503,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,0,0
2,4262962299951.0,5642549,0,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,0,0
3,867951213174.0,5642828,0,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,0
4,8841186448183.0,5642494,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,2572134369293.0,5651768,0,2016-05-03,2016-06-07,56,MARIA ORTIZ,0,0,0,0,0,1,0,35
110523,3596266328735.0,5650093,0,2016-05-03,2016-06-07,51,MARIA ORTIZ,0,0,0,0,0,1,0,35
110524,15576631729893.0,5630692,0,2016-04-27,2016-06-07,21,MARIA ORTIZ,0,0,0,0,0,1,0,41
110525,92134931435557.0,5630323,0,2016-04-27,2016-06-07,38,MARIA ORTIZ,0,0,0,0,0,1,0,41


In [101]:
no_show_df.isna().sum()

PatientID          0
AppointmentID      0
Gender             0
ScheduledDate      0
AppointmentDate    0
Age                0
Neighbourhood      0
Scholarship        0
Hypertension       0
Diabetes           0
Alcoholism         0
Handicap           0
SMS_received       0
No-show            0
WaitingDays        0
dtype: int64

In [103]:
no_show_df.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)


In [105]:
no_show_df.rename(columns={"scheduleddate": "scheduled_date"},inplace=True)
no_show_df.rename(columns={"appointmentdate": "appointment_date"},inplace=True)
no_show_df.rename(columns={"patientid": "patient_id"},inplace=True)
no_show_df.rename(columns={"no-show": "no_show"},inplace=True)
no_show_df.rename(columns={"waitingdays": "waiting_days"},inplace=True)

In [107]:
no_show_df.columns

Index(['patient_id', 'appointmentid', 'gender', 'scheduled_date',
       'appointment_date', 'age', 'neighbourhood', 'scholarship',
       'hypertension', 'diabetes', 'alcoholism', 'handicap', 'sms_received',
       'no_show', 'waiting_days'],
      dtype='object')

In [109]:
no_show_df.rename(columns={"appointmentid": "appointment_id"},inplace=True)

In [111]:
no_show_df

Unnamed: 0,patient_id,appointment_id,gender,scheduled_date,appointment_date,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,waiting_days
0,29872499824296.0,5642903,0,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,0,0
1,558997776694438.0,5642503,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,0,0
2,4262962299951.0,5642549,0,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,0,0
3,867951213174.0,5642828,0,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,0
4,8841186448183.0,5642494,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,2572134369293.0,5651768,0,2016-05-03,2016-06-07,56,MARIA ORTIZ,0,0,0,0,0,1,0,35
110523,3596266328735.0,5650093,0,2016-05-03,2016-06-07,51,MARIA ORTIZ,0,0,0,0,0,1,0,35
110524,15576631729893.0,5630692,0,2016-04-27,2016-06-07,21,MARIA ORTIZ,0,0,0,0,0,1,0,41
110525,92134931435557.0,5630323,0,2016-04-27,2016-06-07,38,MARIA ORTIZ,0,0,0,0,0,1,0,41


In [113]:
assert no_show_df['appointment_id'].is_unique  # or better: rename to 'appointment_id'


In [115]:
#make appointment ID as index 
no_show_df=no_show_df.set_index('appointment_id')

In [117]:
no_show_df

Unnamed: 0_level_0,patient_id,gender,scheduled_date,appointment_date,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,waiting_days
appointment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5642903,29872499824296.0,0,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,0,0
5642503,558997776694438.0,1,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,0,0
5642549,4262962299951.0,0,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,0,0
5642828,867951213174.0,0,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,0
5642494,8841186448183.0,0,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5651768,2572134369293.0,0,2016-05-03,2016-06-07,56,MARIA ORTIZ,0,0,0,0,0,1,0,35
5650093,3596266328735.0,0,2016-05-03,2016-06-07,51,MARIA ORTIZ,0,0,0,0,0,1,0,35
5630692,15576631729893.0,0,2016-04-27,2016-06-07,21,MARIA ORTIZ,0,0,0,0,0,1,0,41
5630323,92134931435557.0,0,2016-04-27,2016-06-07,38,MARIA ORTIZ,0,0,0,0,0,1,0,41


In [119]:
# Strip leading/trailing spaces and standardize text
no_show_df.loc[:, 'neighbourhood'] = (
    no_show_df['neighbourhood']
    .astype(str)       # ensure string
    .str.strip()       # remove extra spaces
    .str.upper()       # standardize all to uppercase
)


# Requires unidecode library: pip install unidecode
from unidecode import unidecode

no_show_df.loc[:, 'neighbourhood'] = no_show_df['neighbourhood'].apply(unidecode)

#  Check for unique values
print(no_show_df['neighbourhood'].nunique())
print(no_show_df['neighbourhood'].value_counts().head(20))


81
neighbourhood
JARDIM CAMBURI       7717
MARIA ORTIZ          5804
RESISTENCIA          4430
JARDIM DA PENHA      3877
ITARARE              3514
CENTRO               3334
TABUAZEIRO           3131
SANTA MARTHA         3131
JESUS DE NAZARETH    2853
BONFIM               2773
SANTO ANTONIO        2744
SANTO ANDRE          2571
CARATOIRA            2565
JABOUR               2509
SAO PEDRO            2448
ILHA DO PRINCIPE     2266
NOVA PALESTINA       2264
ANDORINHAS           2258
DA PENHA             2217
ROMAO                2214
Name: count, dtype: int64


In [121]:
no_show_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110514 entries, 5642903 to 5629448
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   patient_id        110514 non-null  object
 1   gender            110514 non-null  Int64 
 2   scheduled_date    110514 non-null  object
 3   appointment_date  110514 non-null  object
 4   age               110514 non-null  Int64 
 5   neighbourhood     110514 non-null  object
 6   scholarship       110514 non-null  int64 
 7   hypertension      110514 non-null  int64 
 8   diabetes          110514 non-null  int64 
 9   alcoholism        110514 non-null  int64 
 10  handicap          110514 non-null  int64 
 11  sms_received      110514 non-null  int64 
 12  no_show           110514 non-null  int64 
 13  waiting_days      110514 non-null  Int64 
dtypes: Int64(3), int64(7), object(4)
memory usage: 13.0+ MB


In [123]:
# Convert any value > 0 to 1, keep 0 as 0
no_show_df['handicap'] = (no_show_df['handicap'] > 0).astype('Int64')


In [125]:
null_counts = no_show_df.isna().sum()
print(null_counts[null_counts > 0]) 


Series([], dtype: int64)


In [127]:
no_show_df.shape

(110514, 14)

In [129]:
# Save the cleaned dataset to CSV for MySQL / Power BI ODBC
no_show_df.to_csv(
    'C:/Users/Dell/OneDrive/Desktop/Noshowdataset/noshow_cleaned_for_sql.csv',  # file name
    index=True,                    # include index (appointment_id) as primary key
    date_format='%Y-%m-%d'         # format dates as YYYY-MM-DD
)

print("✅ Dataset saved as 'noshow_cleaned_for_sql.csv', ready for MySQL and Power BI.")


✅ Dataset saved as 'noshow_cleaned_for_sql.csv', ready for MySQL and Power BI.


In [133]:
import pandas as pd
import sqlite3

# 1️⃣ Load CSV
file_path = r"C:\Users\Dell\OneDrive\Desktop\Noshowdataset\noshow_cleaned_for_sql.csv"
df = pd.read_csv(file_path)

# 2️⃣ Clean column names: strip spaces, lowercase
df.columns = df.columns.str.strip().str.lower()
print("Columns after cleaning:", df.columns.tolist())

# 3️⃣ Convert binary columns to int
binary_cols = ['gender', 'scholarship', 'hypertension', 'diabetes',
               'alcoholism', 'handicap', 'sms_received', 'no_show']
for col in binary_cols:
    df[col] = df[col].astype(int)

# 4️⃣ Convert numeric columns to int
numeric_cols = ['age', 'appointment_id', 'waiting_days']
for col in numeric_cols:
    df[col] = df[col].astype(int)

# 5️⃣ Convert dates to datetime
date_cols = ['scheduled_date', 'appointment_date']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce').dt.date

# ✅ Check for nulls
print(df.isnull().sum())

# 6️⃣ Create SQLite DB and upload
db_path = r"C:\Users\Dell\OneDrive\Desktop\Noshowdataset\noshow_appointments.db"
conn = sqlite3.connect(db_path)

# Upload to SQLite
df.to_sql(
    name='noshow_appointments',
    con=conn,
    if_exists='replace',  # overwrite if table exists
    index=False
)

conn.close()
print(f"✅ Data uploaded to SQLite database at {db_path}")


Columns after cleaning: ['appointment_id', 'patient_id', 'gender', 'scheduled_date', 'appointment_date', 'age', 'neighbourhood', 'scholarship', 'hypertension', 'diabetes', 'alcoholism', 'handicap', 'sms_received', 'no_show', 'waiting_days']
appointment_id      0
patient_id          0
gender              0
scheduled_date      0
appointment_date    0
age                 0
neighbourhood       0
scholarship         0
hypertension        0
diabetes            0
alcoholism          0
handicap            0
sms_received        0
no_show             0
waiting_days        0
dtype: int64
✅ Data uploaded to SQLite database at C:\Users\Dell\OneDrive\Desktop\Noshowdataset\noshow_appointments.db
