In [3]:
# SECTION 1: Import Required Libraries
import pandas as pd


In [4]:
# SECTION 2: Load Dataset
# The path is relative, so the CSV has to be present in the working directory
# (i.e. uploaded to the Colab session) before this cell runs.
df = pd.read_csv(filepath_or_buffer="KaggleV2-May-2016.csv")
initial_shape = df.shape  # (rows, columns) of the raw frame
print("Initial Data Shape:", initial_shape)


Initial Data Shape: (110527, 14)


In [5]:
# SECTION 3: Check for Missing Values
# Count the null entries in each column and print the per-column totals.
null_counts = df.isna().sum()
print("\nMissing Values:")
print(null_counts)



Missing Values:
PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


In [6]:
# SECTION 4: Remove Duplicate Rows
# Flag every row that repeats an earlier row across all columns, report how
# many were flagged, then keep only the unflagged rows (first occurrences).
is_repeat = df.duplicated()
duplicates = is_repeat.sum()
print(f"\nDuplicate Rows Found: {duplicates}")
df = df[~is_repeat]



Duplicate Rows Found: 0


In [7]:
# SECTION 5: Clean and Standardize Column Names
# Applied in order: trim surrounding whitespace, lowercase, spaces -> underscores.
# Only spaces are replaced, so other punctuation (e.g. the hyphen in 'No-show')
# is carried through unchanged.
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(' ', '_')
print("\nUpdated Column Names:")
print(df.columns)



Updated Column Names:
Index(['patientid', 'appointmentid', 'gender', 'scheduledday',
       'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension',
       'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no-show'],
      dtype='object')


In [8]:
# SECTION 6: Standardize Text Fields (Gender)
# Upper-case every gender code so that case variants collapse to one value,
# then list the distinct codes that remain.
gender_upper = df['gender'].str.upper()
df['gender'] = gender_upper
print("\nUnique Gender Values:", gender_upper.unique())



Unique Gender Values: ['F' 'M']


In [9]:
# SECTION 7: Convert Date Columns to Uniform Format (dd-mm-yyyy)
# Both timestamp columns get the same treatment, so loop instead of repeating
# the conversion line. The result is stored as dd-mm-yyyy text, so these
# columns end up holding strings rather than datetime values.
for date_col in ('scheduledday', 'appointmentday'):
    parsed = pd.to_datetime(df[date_col], errors='coerce')

    # errors='coerce' replaces unparseable values with NaT. The missing-value
    # check in Section 3 ran before this step, so report any gaps introduced
    # here instead of letting them slip silently into the saved file.
    newly_missing = int((parsed.isna() & df[date_col].notna()).sum())
    if newly_missing:
        print(f"Warning: {newly_missing} unparseable value(s) in '{date_col}' were set to missing")

    df[date_col] = parsed.dt.strftime('%d-%m-%Y')


In [10]:
# SECTION 8: Handle Invalid Age Values
# Keep only rows whose age is zero or greater. The explicit copy makes the
# filtered result an independent frame, so the column assignment below does
# not trigger a SettingWithCopyWarning.
age_is_valid = df['age'].ge(0)
df = df.loc[age_is_valid].copy()
df['age'] = df['age'].astype(int)


In [11]:
# SECTION 9: Drop Rows with Missing Neighborhood
# Build a boolean mask of rows that have a neighbourhood value and keep those.
has_neighbourhood = df['neighbourhood'].notna()
df = df.loc[has_neighbourhood]


In [12]:
# SECTION 10: Final Data Overview
# Summarise the cleaned frame: the dtype of every column, then its dimensions.
final_dtypes = df.dtypes
final_shape = df.shape

print("\nFinal Data Types:")
print(final_dtypes)
print("\nFinal Data Shape:", final_shape)



Final Data Types:
patientid         float64
appointmentid       int64
gender             object
scheduledday       object
appointmentday     object
age                 int64
neighbourhood      object
scholarship         int64
hipertension        int64
diabetes            int64
alcoholism          int64
handcap             int64
sms_received        int64
no-show            object
dtype: object

Final Data Shape: (110526, 14)


In [13]:
# SECTION 11: Save the Cleaned Dataset
# Write the cleaned frame to the working directory; index=False keeps the
# DataFrame's row index out of the CSV.
df.to_csv(path_or_buf="cleaned_medical_appointments.csv", index=False)
print("✅ Cleaned dataset saved as 'cleaned_medical_appointments.csv'")


✅ Cleaned dataset saved as 'cleaned_medical_appointments.csv'


In [14]:
# SECTION 12: Download CSV in Google Colab
# google.colab can only be imported inside a Colab runtime. Guard the import
# so that running the notebook elsewhere (local Jupyter, CI) skips the browser
# download instead of failing the final cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; skipping browser download.")
else:
    files.download("cleaned_medical_appointments.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>