In [5]:
import os
import pandas as pd

# 1. Set the data directory (adjust to your local environment)
data_dir = '../data'  # If running locally, use os.path.join(os.getcwd(), 'data')

# 2. List all CSV files to load
files = ['doctors.csv', 'patients.csv', 'appointments.csv', 'treatments.csv', 'billing.csv']

# 3. Loop through files and preview each
for fname in files:
    path = os.path.join(data_dir, fname)
    print(f"\n=== Preview of {fname} ===")
    df = pd.read_csv(path)
    
    # Display first few rows
    display(df.head())
    
    # Show column names
    print("Columns:", df.columns.tolist())
    
    # Count missing values
    print("Missing values:\n", df.isnull().sum())
    
    # Descriptive statistics
    print("Descriptive statistics:")
    display(df.describe(include='all'))



=== Preview of doctors.csv ===


Unnamed: 0,doctor_id,first_name,last_name,specialization,phone_number,years_experience,hospital_branch,email
0,D001,David,Taylor,Dermatology,8322010158,17,Westside Clinic,dr.david.taylor@hospital.com
1,D002,Jane,Davis,Pediatrics,9004382050,24,Eastside Clinic,dr.jane.davis@hospital.com
2,D003,Jane,Smith,Pediatrics,8737740598,19,Eastside Clinic,dr.jane.smith@hospital.com
3,D004,David,Jones,Pediatrics,6594221991,28,Central Hospital,dr.david.jones@hospital.com
4,D005,Sarah,Taylor,Dermatology,9118538547,26,Central Hospital,dr.sarah.taylor@hospital.com


Columns: ['doctor_id', 'first_name', 'last_name', 'specialization', 'phone_number', 'years_experience', 'hospital_branch', 'email']
Missing values:
 doctor_id           0
first_name          0
last_name           0
specialization      0
phone_number        0
years_experience    0
hospital_branch     0
email               0
dtype: int64
Descriptive statistics:


Unnamed: 0,doctor_id,first_name,last_name,specialization,phone_number,years_experience,hospital_branch,email
count,10,10,10,10,10.0,10.0,10,10
unique,10,6,6,3,,,3,10
top,D001,David,Davis,Pediatrics,,,Central Hospital,dr.david.taylor@hospital.com
freq,1,2,3,5,,,4,1
mean,,,,,7919716000.0,21.5,,
std,,,,,1142890000.0,6.7536,,
min,,,,,6176384000.0,5.0,,
25%,,,,,6792438000.0,19.5,,
50%,,,,,8269752000.0,23.5,,
75%,,,,,8937722000.0,26.0,,



=== Preview of patients.csv ===


Unnamed: 0,patient_id,first_name,last_name,gender,date_of_birth,contact_number,address,registration_date,insurance_provider,insurance_number,email
0,P001,David,Williams,F,1955-06-04,6939585183,789 Pine Rd,2022-06-23,WellnessCorp,INS840674,david.williams@mail.com
1,P002,Emily,Smith,F,1984-10-12,8228188767,321 Maple Dr,2022-01-15,PulseSecure,INS354079,emily.smith@mail.com
2,P003,Laura,Jones,M,1977-08-21,8397029847,321 Maple Dr,2022-02-07,PulseSecure,INS650929,laura.jones@mail.com
3,P004,Michael,Johnson,F,1981-02-20,9019443432,123 Elm St,2021-03-02,HealthIndia,INS789944,michael.johnson@mail.com
4,P005,David,Wilson,M,1960-06-23,7734463155,123 Elm St,2021-09-29,MedCare Plus,INS788105,david.wilson@mail.com


Columns: ['patient_id', 'first_name', 'last_name', 'gender', 'date_of_birth', 'contact_number', 'address', 'registration_date', 'insurance_provider', 'insurance_number', 'email']
Missing values:
 patient_id            0
first_name            0
last_name             0
gender                0
date_of_birth         0
contact_number        0
address               0
registration_date     0
insurance_provider    0
insurance_number      0
email                 0
dtype: int64
Descriptive statistics:


Unnamed: 0,patient_id,first_name,last_name,gender,date_of_birth,contact_number,address,registration_date,insurance_provider,insurance_number,email
count,50,50,50,50,50,50.0,50,50,50,50,50
unique,50,10,10,2,49,,4,50,4,50,39
top,P001,David,Wilson,M,1993-04-13,,321 Maple Dr,2022-06-23,MedCare Plus,INS840674,michael.taylor@mail.com
freq,1,7,9,31,2,,19,1,18,1,3
mean,,,,,,7817283000.0,,,,,
std,,,,,,957359400.0,,,,,
min,,,,,,6141952000.0,,,,,
25%,,,,,,7065593000.0,,,,,
50%,,,,,,7749927000.0,,,,,
75%,,,,,,8599947000.0,,,,,



=== Preview of appointments.csv ===


Unnamed: 0,appointment_id,patient_id,doctor_id,appointment_date,appointment_time,reason_for_visit,status
0,A001,P034,D009,2023-08-09,15:15:00,Therapy,Scheduled
1,A002,P032,D004,2023-06-09,14:30:00,Therapy,No-show
2,A003,P048,D004,2023-06-28,8:00:00,Consultation,Cancelled
3,A004,P025,D006,2023-09-01,9:15:00,Consultation,Cancelled
4,A005,P040,D003,2023-07-06,12:45:00,Emergency,No-show


Columns: ['appointment_id', 'patient_id', 'doctor_id', 'appointment_date', 'appointment_time', 'reason_for_visit', 'status']
Missing values:
 appointment_id      0
patient_id          0
doctor_id           0
appointment_date    0
appointment_time    0
reason_for_visit    0
status              0
dtype: int64
Descriptive statistics:


Unnamed: 0,appointment_id,patient_id,doctor_id,appointment_date,appointment_time,reason_for_visit,status
count,200,200,200,200,200,200,200
unique,200,48,10,158,40,5,4
top,A001,P012,D005,2023-08-16,11:00:00,Checkup,No-show
freq,1,10,29,5,9,45,52



=== Preview of treatments.csv ===


Unnamed: 0,treatment_id,appointment_id,treatment_type,description,cost,treatment_date
0,T001,A001,Chemotherapy,Basic screening,3941.97,2023-08-09
1,T002,A002,MRI,Advanced protocol,4158.44,2023-06-09
2,T003,A003,MRI,Standard procedure,3731.55,2023-06-28
3,T004,A004,MRI,Basic screening,4799.86,2023-09-01
4,T005,A005,ECG,Standard procedure,582.05,2023-07-06


Columns: ['treatment_id', 'appointment_id', 'treatment_type', 'description', 'cost', 'treatment_date']
Missing values:
 treatment_id      0
appointment_id    0
treatment_type    0
description       0
cost              0
treatment_date    0
dtype: int64
Descriptive statistics:


Unnamed: 0,treatment_id,appointment_id,treatment_type,description,cost,treatment_date
count,200,200,200,200,200.0,200
unique,200,200,5,3,,158
top,T001,A001,Chemotherapy,Standard procedure,,2023-08-16
freq,1,1,49,77,,5
mean,,,,,2756.24925,
std,,,,,1298.125308,
min,,,,,534.03,
25%,,,,,1563.4125,
50%,,,,,2828.165,
75%,,,,,3836.6275,



=== Preview of billing.csv ===


Unnamed: 0,bill_id,patient_id,treatment_id,bill_date,amount,payment_method,payment_status
0,B001,P034,T001,2023-08-09,3941.97,Insurance,Pending
1,B002,P032,T002,2023-06-09,4158.44,Insurance,Paid
2,B003,P048,T003,2023-06-28,3731.55,Insurance,Paid
3,B004,P025,T004,2023-09-01,4799.86,Insurance,Failed
4,B005,P040,T005,2023-07-06,582.05,Credit Card,Pending


Columns: ['bill_id', 'patient_id', 'treatment_id', 'bill_date', 'amount', 'payment_method', 'payment_status']
Missing values:
 bill_id           0
patient_id        0
treatment_id      0
bill_date         0
amount            0
payment_method    0
payment_status    0
dtype: int64
Descriptive statistics:


Unnamed: 0,bill_id,patient_id,treatment_id,bill_date,amount,payment_method,payment_status
count,200,200,200,200,200.0,200,200
unique,200,48,200,158,,3,3
top,B001,P012,T001,2023-08-16,,Credit Card,Pending
freq,1,10,1,5,,75,69
mean,,,,,2756.24925,,
std,,,,,1298.125308,,
min,,,,,534.03,,
25%,,,,,1563.4125,,
50%,,,,,2828.165,,
75%,,,,,3836.6275,,
