In [None]:
# STEP 1: Import libraries
import pandas as pd
import numpy as np

In [None]:
# STEP 2: Upload dataset to Colab
# Interactive step: `files.upload()` comes from the `google.colab` package, so
# this cell is expected to run inside a Colab runtime. The return value is kept
# in `uploaded`; the file itself is saved into the working directory (the cell
# output reports the saved name).
from google.colab import files
uploaded = files.upload()

Saving Medical.csv to Medical.csv


In [None]:
# STEP 3: Load dataset
# Read 'Medical.csv' (the name the upload step saved the file under) from the
# working directory into the notebook-wide DataFrame `df`.
df = pd.read_csv('Medical.csv')
print("✅ Dataset loaded successfully!")
# (rows, columns) — quick sanity check that the whole file was read.
print("Shape:", df.shape)

✅ Dataset loaded successfully!
Shape: (110527, 14)


In [None]:
# STEP 4: Inspect dataset
# Read-only overview; nothing in `df` is modified here.
print("\n--- Dataset Info ---")
# `info()` prints its report itself (column names, non-null counts, dtypes),
# so it is called bare rather than wrapped in print().
df.info()
print("\n--- First 5 Rows ---")
# Plain-text preview of the first rows.
print(df.head())


--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB

--- First 5 Rows ---
      PatientId  AppointmentID Gender         

In [None]:
# ✅ STEP 5: Identify and handle missing values (.isnull())
# ------------------------------------------------------
# Report the per-column null counts before filling, so the effect of this step
# can be audited from the cell output.
print("\nMissing Values Per Column:")
print(df.isnull().sum())

# Fill missing values
# Numeric columns → column median.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())  # numeric → median

# Categorical (object) columns → most frequent value (mode).
cat_cols = df.select_dtypes(include=['object']).columns
for c in cat_cols:
    modes = df[c].mode()
    if modes.empty:
        # An entirely-null column has no mode; indexing `[0]` would raise.
        # Leave the column as it is instead of crashing the notebook.
        continue
    # Assign the filled Series back to the frame. The previous form,
    # `df[c].fillna(..., inplace=True)`, mutates an intermediate object: pandas
    # emits a FutureWarning for it and it stops updating `df` in pandas 3.0.
    df[c] = df[c].fillna(modes.iloc[0])                    # categorical → mode

print("\n✅ Missing values handled.")


Missing Values Per Column:
PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mode()[0], inplace=True)            # categorical → mode



✅ Missing values handled.


In [None]:
# ✅ STEP 6: Remove duplicate rows (.drop_duplicates())
# ---------------------------------------------------
# Compare the row count on either side of the de-duplication so the number of
# dropped rows can be reported.
before = len(df)
df = df.drop_duplicates()
after = len(df)

removed_count = before - after
print(f"✅ Removed {removed_count} duplicate rows.")


✅ Removed 0 duplicate rows.


In [None]:
# STEP 7: Standardize text values (gender, country, etc.)
# ---------------------------------------------------
# Columns are selected by name, so the step adapts to whichever headers exist
# (rename the keywords below if the dataset uses different column names).

# Gender-like columns: cast to str, lower-case, trim, then expand abbreviations
# and strip the trailing-dot variants.
gender_aliases = {
    'm': 'male', 'f': 'female', 'male.': 'male', 'female.': 'female'
}
for col in df.columns:
    if 'gender' not in col.lower():
        continue
    lowered = df[col].astype(str).str.lower().str.strip()
    df[col] = lowered.replace(gender_aliases)
    print(f"✅ Standardized text in '{col}' column.")

# Country or similar fields: cast to str, Title Case, then trim whitespace.
place_keywords = ('country', 'city')
for col in df.columns:
    if not any(keyword in col.lower() for keyword in place_keywords):
        continue
    df[col] = df[col].astype(str).str.title().str.strip()
    print(f"✅ Standardized text in '{col}' column.")

✅ Standardized text in 'Gender' column.


In [None]:
# STEP 8: Convert date formats to consistent type
# ---------------------------------------------------
# Date-like columns are located by name. Looking only for 'date' matched
# nothing in this dataset — its date columns are 'ScheduledDay' and
# 'AppointmentDay' — so the step silently did no work. 'day' is therefore
# accepted as a name hint as well.
date_name_hints = ('date', 'day')

for col in df.columns:
    col_lower = col.lower()
    if not any(hint in col_lower for hint in date_name_hints):
        continue

    # A numeric column whose name merely contains a hint (e.g. a day count) is
    # not a date; to_datetime would read the numbers as epoch offsets.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    # Unparseable entries become NaT rather than raising.
    # NOTE(review): dayfirst=True only matters for ambiguous day/month strings;
    # confirm it matches how the source data is written.
    parsed = pd.to_datetime(df[col], errors='coerce', dayfirst=True)

    # If nothing parsed although the column has values, it is not a date
    # column: keep the original data instead of replacing it with all-NaN.
    if parsed.isna().all() and df[col].notna().any():
        print(f"⚠️ Skipped '{col}': values could not be parsed as dates.")
        continue

    # NOTE(review): '%d-%m-%Y' keeps the calendar date only, so any time-of-day
    # component is dropped and the column is stored as text — confirm that is
    # acceptable for timestamp columns.
    df[col] = parsed.dt.strftime('%d-%m-%Y')  # dd-mm-yyyy format
    print(f"✅ Converted '{col}' to dd-mm-yyyy format.")


In [None]:
# STEP 9: Rename column headers to be clean and uniform
# ---------------------------------------------------
def _clean_header(name):
    """Return `name` trimmed, lower-cased, with spaces and hyphens as underscores."""
    trimmed_lower = name.strip().lower()
    return trimmed_lower.replace(' ', '_').replace('-', '_')


# Later steps look columns up by these normalized names (e.g. 'age').
df.columns = list(map(_clean_header, df.columns))
print("\n✅ Column headers renamed:")
print(df.columns.tolist())



✅ Column headers renamed:
['patientid', 'appointmentid', 'gender', 'scheduledday', 'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension', 'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no_show']


In [None]:
# STEP 10: Check and fix data types
# ---------------------------------------------------
# 'age' is looked up by its lower-cased header. Values that are not numeric are
# coerced to missing, and the nullable 'Int64' dtype lets the column stay
# integer-typed even when some entries are missing.
if 'age' in df.columns:
    age_as_number = pd.to_numeric(df['age'], errors='coerce')
    df['age'] = age_as_number.astype('Int64')

print("\n✅ Final datatypes:")
print(df.dtypes)


✅ Final datatypes:
patientid         float64
appointmentid       int64
gender             object
scheduledday       object
appointmentday     object
age                 Int64
neighbourhood      object
scholarship         int64
hipertension        int64
diabetes            int64
alcoholism          int64
handcap             int64
sms_received        int64
no_show            object
dtype: object


In [None]:
# STEP 11: Save cleaned dataset
# ---------------------------------------------------
# Write the cleaned frame to the working directory. index=False leaves the
# DataFrame's row index out of the file, so the CSV holds only the data columns.
df.to_csv('Medical_Cleaned.csv', index=False)
print("\n💾 Cleaned dataset saved as 'Medical_Cleaned.csv'")


💾 Cleaned dataset saved as 'Medical_Cleaned.csv'


In [None]:
# ✅ STEP 12: Download cleaned file
# ---------------------------------------------------
# Hands the CSV written by the save step to `files.download()` from the
# `google.colab` package, so this cell is expected to run inside Colab.
# `files` is already imported by the upload step; the repeat import here lets
# the cell run on its own.
from google.colab import files
files.download('Medical_Cleaned.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>