In [1]:
import pandas as pd

# Load the CSV file (change path if needed)

In [2]:
# Read the source CSV into a DataFrame. The file name (including its
# "Appoitment" spelling) is the one this notebook was run against.
raw_csv_path = "Medical-Appoitment-No-Shows.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Summarize column names, dtypes and non-null counts to check for missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


# 1. Rename columns: lowercase, no spaces or hyphens, and fix misspelled names

In [5]:
# Normalize every header: trim surrounding whitespace, lowercase it, remove
# hyphens and turn spaces into underscores.
normalized_names = []
for raw_name in df.columns:
    name = raw_name.strip().lower()
    name = name.replace('-', '').replace(' ', '_')
    normalized_names.append(name)
df.columns = normalized_names

# Correct the misspelled headers and restore the word break that was lost
# when the hyphen in "No-show" was removed.
corrected_names = {
    'hipertension': 'hypertension',
    'handcap': 'handicap',
    'noshow': 'no_show',
}
df = df.rename(columns=corrected_names)

In [7]:
# Check the renamed headers on the first two rows.
df.head(n=2)

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No


# 2. Handle missing values (drop rows with any missing values)

In [8]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

# 3. Remove duplicate rows

In [9]:
# Remove rows that are exact duplicates of an earlier row (the first one is kept).
df = df.drop_duplicates(keep='first')

# 4. Standardize text values

In [10]:
# Normalize the gender codes: trim surrounding whitespace, then upper-case.
gender_trimmed = df['gender'].str.strip()
df['gender'] = gender_trimmed.str.upper()

# 5. Parse date columns and reformat them as dd-mm-yyyy strings (time of day is dropped)

In [11]:
# Parse both date columns and store them as dd-mm-yyyy text.
# NOTE: strftime returns strings, so these columns end up as text (not
# datetime64) and the time of day in `scheduledday` is discarded.
date_columns = ['scheduledday', 'appointmentday']
for date_col in date_columns:
    parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = parsed_dates.dt.strftime('%d-%m-%Y')

# `errors='coerce'` turns unparseable values into missing values, and this
# happens after the earlier dropna step. Drop those rows here so the cleaned
# data cannot end up with missing dates.
df = df.dropna(subset=date_columns)

In [12]:
# Check the reformatted date columns on the first three rows.
df.head(n=3)

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
0,29872500000000.0,5642903,F,29-04-2016,29-04-2016,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,29-04-2016,29-04-2016,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,29-04-2016,29-04-2016,62,MATA DA PRAIA,0,0,0,0,0,0,No


# 6. Fix data types

In [13]:
# Age: coerce non-numeric entries to <NA> and use the nullable integer dtype
# so that missing values can coexist with whole-number ages.
df['age'] = pd.to_numeric(df['age'], errors='coerce').astype('Int64')

# No-show flag: map the Yes/No labels to booleans (unrecognized labels become
# missing). Going through `astype(str)` and also accepting 'true'/'false'
# keeps this cell re-runnable: after a first run the column already holds
# booleans, on which the `.str` accessor would raise AttributeError.
no_show_labels = {'no': False, 'yes': True, 'false': False, 'true': True}
no_show_text = df['no_show'].astype(str).str.strip().str.lower()
df['no_show'] = no_show_text.map(no_show_labels)

# 7. Remove rows with invalid or missing age

In [14]:
# Keep only the rows whose age is known and not negative.
age_is_present = df['age'].notna()
age_is_non_negative = df['age'].ge(0)
df = df.loc[age_is_present & age_is_non_negative]

# Reset index

In [15]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
df = df.reset_index(drop=True)

# Save cleaned file

In [16]:
# Write the cleaned data to a new CSV, leaving out the row index.
cleaned_csv_path = "Cleaned_Medical-Appoitment-No-Shows.csv"
df.to_csv(cleaned_csv_path, index=False)