In [9]:
import numpy as np
import pandas as pd
from faker import Faker

# Seed every random source so that Restart Kernel -> Run All regenerates
# exactly the same tables. NumPy's legacy global RNG drives the np.random.*
# calls in the cells below; Faker.seed() seeds the generator shared by all
# Faker instances.
SEED = 42
np.random.seed(SEED)
Faker.seed(SEED)

# Faker instance (British locale)
fake = Faker("en_GB")

In [11]:
# Row counts for the generated tables, in order: doctors, patients, visits
n_doctors, n_patients, n_visits = 1000, 1000, 1000

In [27]:
# Generating Doctors Table
# Builds n_doctors synthetic rows and writes them to doctors_table.csv.
doctor_ids = np.arange(1, n_doctors + 1)
doctor_names = [fake.name() for _ in range(n_doctors)]
specialties = np.random.choice(
    ["Cardiology", "Dermatology", "Neurology", "Pediatrics", "Radiology", None],
    n_doctors,
    p=[0.2, 0.2, 0.2, 0.2, 0.1, 0.1],  # 10% of rows will have NULL in Specialty
)
# np.random.randint excludes the upper bound: 1-30 and 30-64 respectively.
experience_levels = np.random.randint(1, 31, n_doctors)
ages = np.random.randint(30, 65, n_doctors)
salaries = np.random.uniform(50000, 200000, n_doctors).round(2)
cities = np.random.choice(["London", "Manchester", "Birmingham", "Leeds", "Glasgow"], n_doctors)

# Phone numbers have the shape "+44 7xxx xxxxxx". Because randint's upper
# bound is exclusive, 8000 / 1000000 are needed for 7999 / 999999 to be
# reachable. Both parts are drawn as arrays instead of 2 * n_doctors scalar calls.
phone_prefixes = np.random.randint(7000, 8000, n_doctors)
phone_suffixes = np.random.randint(100000, 1000000, n_doctors)
phone_numbers = [
    f"+44 {prefix} {suffix}" for prefix, suffix in zip(phone_prefixes, phone_suffixes)
]

doctors_df = pd.DataFrame({
    "Doctor_ID": doctor_ids,
    "Name": doctor_names,
    "Specialty": specialties,
    "Experience_Level": experience_levels,
    "Age": ages,
    "Salary": salaries,
    "City": cities,
    "Phone_Number": phone_numbers,
})

# Saving Doctors Table to CSV (None values are written as empty fields)
doctors_df.to_csv("doctors_table.csv", index=False)
print("Doctors table saved to 'doctors_table.csv'.")

Doctors table saved to 'doctors_table.csv'.


In [28]:
# Generating Patients Table
# Builds n_patients synthetic rows, each linked to a random existing doctor,
# and writes them to patients_table.csv.
patient_ids = np.arange(1, n_patients + 1)
patient_names = [fake.name() for _ in range(n_patients)]
# NOTE: this rebinds `ages`, which the doctors cell also assigns; doctors_df
# is already built by then, so it is unaffected.
ages = np.random.randint(1, 90, n_patients)  # 1-89 (upper bound is exclusive)
genders = np.random.choice(["Male", "Female", "Other"], n_patients, p=[0.48, 0.48, 0.04])
blood_types = np.random.choice(["A", "B", "AB", "O"], n_patients)
heights = np.random.uniform(140, 200, n_patients).round(2)
weights = np.random.uniform(50, 120, n_patients).round(2)
assigned_doctors = np.random.choice(doctor_ids, n_patients)

# Sampling from [fake.date_between(...), None] evaluates date_between only
# once, so every non-null row would share one identical date. Instead, decide
# per row whether a checkup exists and draw a fresh date for each such row.
has_checkup = np.random.random(n_patients) < 0.5  # 50% of rows will have NULL in Last_Checkup_Date
last_checkup_dates = np.array(
    [
        fake.date_between(start_date="-5y", end_date="today") if checked else None
        for checked in has_checkup
    ],
    dtype=object,
)
smoking_status = np.random.choice(
    ["Smoker", "Non-Smoker", None],
    n_patients,
    p=[0.3, 0.4, 0.3],  # 30% of rows will have NULL in Smoking_Status
)

patients_df = pd.DataFrame({
    "Patient_ID": patient_ids,
    "Name": patient_names,
    "Age": ages,
    "Gender": genders,
    "Blood_Type": blood_types,
    "Height": heights,
    "Weight": weights,
    "Doctor_ID": assigned_doctors,
    "Last_Checkup_Date": last_checkup_dates,
    "Smoking_Status": smoking_status,
})

# Saving Patients Table to CSV (None values are written as empty fields)
patients_df.to_csv("patients_table.csv", index=False)
print("Patients table saved to 'patients_table.csv'.")

Patients table saved to 'patients_table.csv'.


In [30]:
# Generating Visits Table
# One row per visit; the patient and the doctor are sampled independently.
visit_type_options = ["Checkup", "Emergency", "Follow-up", "Surgery"]
diagnosis_options = ["Flu", "Fracture", "Diabetes", "Hypertension", "Asthma", None]
diagnosis_weights = [0.2, 0.2, 0.2, 0.2, 0.1, 0.1]  # 10% of rows will have NULL in Diagnosis

visit_ids = np.arange(n_visits) + 1
visit_patient_ids = np.random.choice(patient_ids, size=n_visits)
visit_doctor_ids = np.random.choice(doctor_ids, size=n_visits)
# n_visits evenly spaced timestamps across 2022-2023, rendered as date strings
visit_dates = pd.date_range("2022-01-01", "2023-12-31", periods=n_visits).strftime("%Y-%m-%d")
visit_types = np.random.choice(visit_type_options, size=n_visits)
diagnoses = np.random.choice(diagnosis_options, size=n_visits, p=diagnosis_weights)
severity_levels = np.random.randint(low=1, high=6, size=n_visits)  # 1-5 inclusive
costs = np.round(np.random.uniform(low=50, high=5000, size=n_visits), 2)

visit_columns = {
    "Visit_ID": visit_ids,
    "Patient_ID": visit_patient_ids,
    "Doctor_ID": visit_doctor_ids,
    "Visit_Date": visit_dates,
    "Visit_Type": visit_types,
    "Diagnosis": diagnoses,
    "Severity_Level": severity_levels,
    "Cost": costs,
}
visits_df = pd.DataFrame(visit_columns)

# Write the visits table to disk without the index column
visits_df.to_csv("visits_table.csv", index=False)
print("Visits table saved to 'visits_table.csv'.")


Visits table saved to 'visits_table.csv'.
