<a href="https://colab.research.google.com/github/Foluwa/sql_assessment/blob/main/SQL_Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install Faker



# TOPIC:  Hospital Management Database

In [11]:
import random
import datetime
import sqlite3
import numpy as np
import pandas as pd
from faker import Faker

In [12]:
# Initialize faker library
fake = Faker()

# Declare records
num_records = 1000
procedure_records = 20

# Medical fields for specialization
medical_fields = ['Anesthesiology', 'Cardiology', 'Dermatology', 'Endocrinology', 'Gastroenterology',
                  'Hematology', 'Neurology', 'Oncology', 'Orthopedics', 'Pediatrics', 'Psychiatry',
                  'Radiology', 'Surgery', 'Urology'
]

# PatientFeedbackScore
patient_feedback = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']

# List of dignosis types
diagnosis = ['Cold', 'Flu', 'Fever', 'Headache', 'Stomachache', 'Gastritis']

# Procedure names
procedures = ['X-ray', 'Blood Test', 'Physical Therapy', 'Appendectomy', 'Cesarean Section', 'Colonoscopy',
              'Endoscopy', 'Laparoscopy', 'Angiography', 'Cardiac Catheterization', 'Echocardiogram',
              'Electrocardiogram (ECG or EKG)', 'Biopsy', 'Dialysis', 'Blood Transfusion',
              'Bone Marrow Transplant', 'Colonoscopy',  'Ultrasound', 'Mammogram', 'Pap Smear']
procedure_names = [random.choice(procedures) for _ in range(20)]

# Gender list
gender = ['Male', 'Female']

# Ethnicity List
ethnicity = ['Caucasian', 'African American', 'Asian', 'Hispanic', 'Other']

# Patient severity level list
severity_level = ['Mild', 'Moderate', 'Severe']

# Unique Doctors IDs
doctor_ids = range(1, num_records + 1)

# Unique Patients IDs
unique_patient_ids = set()
while len(unique_patient_ids) < num_records:
    unique_patient_ids.add(random.randint(10000000, 99999999))
patient_ids = list(unique_patient_ids)

# Unique Procedure IDs
unique_procedure_ids = set()
while len(unique_procedure_ids) < 20:
    unique_procedure_ids.add(random.randint(100, 1000))
procedure_ids = list(unique_procedure_ids)

# Procedure cost
procedure_cost = [random.randint(100, 10000) for _ in range(20)]

In [13]:
def random_date(start_date, end_date):
    """ Define function to generate random date within a range """
    return start_date + datetime.timedelta(
        seconds=random.randint(0, int((end_date - start_date).total_seconds())))

def format_phone_number(phone_number):
    """ Function to format phone number with country code """
    # Add country code if missing
    if not phone_number.startswith('+'):
        phone_number = '+1' + phone_number[1:]
    return phone_number

def generate_patient_data(num_records):
    """ Function to generate patient data """
    patient_data = {
        'PatientID': patient_ids,
        'PatientName': [fake.name() for _ in range(num_records)],
        'Gender': [fake.random_element(gender) for _ in range(num_records)],
        'DateOfBirth': [fake.date_of_birth(minimum_age=18, maximum_age=90) for _ in range(num_records)],
        'PhoneNumber': [format_phone_number(fake.phone_number()) for _ in range(num_records)],
        'Ethnicity': [fake.random_element(ethnicity) for _ in range(num_records)],
        'AdmissionDate': [random_date(datetime.datetime(2020, 1, 1), datetime.datetime(2023, 1, 1)) for _ in range(num_records)],
        'InsuranceID': [fake.random_number(digits=10) for _ in range(num_records)],
        'SeverityLevel': [random.choice(severity_level) for _ in range(num_records)],
        'HospitalStayDuration': np.random.randint(1, 30, size=num_records),
        'TotalCost': np.round(np.random.uniform(1000, 10000, size=num_records), 2)
    }
    return patient_data

# Generate patient data
patients_df = pd.DataFrame(generate_patient_data(num_records))

# Set PatientID as primary key
patients_df.set_index('PatientID', inplace=True)

# Display the DataFrame
patients_df.head()

Unnamed: 0_level_0,PatientName,Gender,DateOfBirth,PhoneNumber,Ethnicity,AdmissionDate,InsuranceID,SeverityLevel,HospitalStayDuration,TotalCost
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
30703625,Michael Little,Female,1978-09-10,+101-268-679-1268x657,Hispanic,2022-11-14 23:19:41,3603761430,Mild,28,3099.05
64215050,Andrea Mckinney,Female,2002-12-05,+199.857.0095x22290,African American,2020-05-18 12:03:01,2237200935,Moderate,12,8142.57
15501324,Anne Vega,Female,1941-01-15,+139.261.4438x66501,Other,2021-01-01 12:23:28,8527970236,Mild,3,8345.23
85405712,Ashley Miller,Female,1947-04-16,+133.662.8306x539,Asian,2021-08-13 06:00:55,4084972953,Moderate,15,4579.57
29878289,John Simpson,Male,1993-11-15,+171-633-6714,Asian,2021-05-22 14:29:08,5736107959,Severe,12,2157.3


In [14]:
def generate_doctor_data(num_records):
    """ Function to generate data for Doctors Table """
    data = {
        'DoctorID': doctor_ids,
        'DoctorName': [fake.name() for _ in range(num_records)],
        'Specialization': [random.choice(medical_fields) for _ in range(num_records)],
        'ExperienceYears': np.random.randint(1, 30, size=num_records),
        'AverageRating': np.round(np.random.uniform(1, 5, size=num_records), 2),
        'PatientFeedbackScore': [random.choice(patient_feedback) for _ in range(num_records)],
        'AverageConsultationDuration': np.random.randint(10, 60, size=num_records)
    }
    return data

def create_doctors_dataframe(num_records):
    """ Function to create Doctors DataFrame """
    data = generate_doctor_data(num_records)
    doctors_df = pd.DataFrame(data)
    return doctors_df

# Generate Doctors DataFrame with 100 records
doctors_df = create_doctors_dataframe(num_records)

# Set DoctorID as primary key
doctors_df.set_index('DoctorID', inplace=True)

# Display the DataFrame
doctors_df.head()

Unnamed: 0_level_0,DoctorName,Specialization,ExperienceYears,AverageRating,PatientFeedbackScore,AverageConsultationDuration
DoctorID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Barbara Michael,Oncology,3,4.87,Good,11
2,Eric Walker,Urology,10,3.36,Good,49
3,Timothy Carroll,Endocrinology,16,2.06,Fair,17
4,Raymond Green,Endocrinology,8,3.41,Very Good,19
5,Timothy Thompson,Psychiatry,16,2.95,Poor,39


In [15]:
def gen_appointments_data():
    """ Generate data for Appointments Table """
    appointments_data = {
        'AppointmentID': range(1, num_records+1),
        'PatientID': [random.choice(patient_ids) for _ in range(num_records)],
        'DoctorID': [random.choice(doctor_ids) for _ in range(num_records)],
        'AppointmentDate': [datetime.datetime(2024, np.random.randint(1, 13), np.random.randint(1, 29)) for _ in range(num_records)],
        'ConsultationFee': np.round(np.random.uniform(50, 200, size=num_records), 2),
        'Diagnosis': [random.choice(diagnosis) for _ in range(num_records)],
        'TreatmentDuration': np.random.randint(10, 60, size=num_records)
    }
    return appointments_data

appointments_data = gen_appointments_data()

# Create DataFrame for Appointments Table
appointments_df = pd.DataFrame(appointments_data)

# Create a compound key by combining PatientID and AppointmentDate
appointments_df['AppointmentDate'] = pd.to_datetime(appointments_df['AppointmentDate']).dt.date
appointments_df['CompoundKey'] = appointments_df['PatientID'].astype(str) + '-' + appointments_df['AppointmentDate'].astype(str)

# Set PatientID and DoctorID as foreign keys
appointments_df.set_index(['PatientID', 'AppointmentDate'], inplace=True)
appointments_df['DoctorID'] = appointments_df['DoctorID'].astype('int64')
appointments_df = appointments_df.join(doctors_df, on='DoctorID')

# Display the DataFrame
appointments_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AppointmentID,DoctorID,ConsultationFee,Diagnosis,TreatmentDuration,CompoundKey,DoctorName,Specialization,ExperienceYears,AverageRating,PatientFeedbackScore,AverageConsultationDuration
PatientID,AppointmentDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
73924549,2024-09-14,1,181,132.23,Cold,26,73924549-2024-09-14,Lisa Washington,Gastroenterology,14,2.29,Very Good,55
71086190,2024-12-11,2,669,99.2,Fever,24,71086190-2024-12-11,Christopher Gray,Orthopedics,28,4.39,Fair,49
99982560,2024-06-23,3,950,121.58,Headache,11,99982560-2024-06-23,Elizabeth Alvarez,Orthopedics,17,1.73,Fair,34
20483834,2024-09-05,4,838,108.28,Stomachache,12,20483834-2024-09-05,Edward Harrell PhD,Hematology,12,1.68,Poor,57
19838244,2024-10-14,5,463,179.44,Fever,55,19838244-2024-10-14,Rita Powell,Anesthesiology,29,2.85,Poor,24


In [16]:
def create_procedure_data():
    """ Procedures Table """
    procedures_data = {
        'ProcedureID': procedure_ids,
        'ProcedureName': procedure_names,
        'ProcedureCost': procedure_cost
    }
    return procedures_data

procedures_data = create_procedure_data()
procedures_df = pd.DataFrame(procedures_data)
procedures_df.set_index('ProcedureID', inplace=True)

# Display the DataFrame
procedures_df.head()

Unnamed: 0_level_0,ProcedureName,ProcedureCost
ProcedureID,Unnamed: 1_level_1,Unnamed: 2_level_1
644,Dialysis,6994
394,Colonoscopy,3747
529,Cesarean Section,8873
146,Dialysis,9666
532,Colonoscopy,9813


In [17]:
def generate_future_date():
    """ Function to generate future appointment dates """
    return datetime.datetime.now() + datetime.timedelta(days=random.randint(1, 30))

def generate_appointment_procedures_data():
    """ Generate data for AppointmentProcedures Table (Junction Table) """
    appointment_procedures_data = {
        'PatientID': [random.choice(patient_ids) for _ in range(num_records)],
        'AppointmentDate': [generate_future_date().date() for _ in range(num_records)],
        'ProcedureID': [random.choice(procedure_ids) for _ in range(num_records)],
        'Quantity': [random.randint(1, 5) for _ in range(num_records)]
    }
    return appointment_procedures_data

appointment_procedures_data = generate_appointment_procedures_data()

# Create DataFrame for AppointmentProcedures Table
appointment_procedures_df = pd.DataFrame(appointment_procedures_data)

# Display the DataFrame
appointment_procedures_df.head()

Unnamed: 0,PatientID,AppointmentDate,ProcedureID,Quantity
0,19117494,2024-03-06,989,1
1,63367114,2024-03-13,644,3
2,72919077,2024-03-05,813,2
3,61897911,2024-03-27,644,1
4,63596674,2024-03-18,529,3


In [18]:
def export_to_sqlite(patients_df, doctors_df, appointments_df, procedures_df, appointment_procedures_df, db_file='hospital_data.db'):
    """ Function to export CSV to SQLite DB """
    # Connect to SQLite database
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()

    # Export Patients DataFrame to SQL
    patients_df.to_sql('Patients', conn, if_exists='replace', index=True,  )

    # Export Doctors DataFrame to SQL
    doctors_df.to_sql('Doctors', conn, if_exists='replace', index=True)

    # Export Appointments DataFrame to SQL
    appointments_df.to_sql('Appointments', conn, if_exists='replace', index=True)

    # Export Procedures DataFrame to SQL
    procedures_df.to_sql('Procedures', conn, if_exists='replace', index=True)

    # Export AppointmentProcedures DataFrame to SQL
    appointment_procedures_df.to_sql('AppointmentProcedures', conn, if_exists='replace', index=True)

    # Commit changes and close connection
    conn.commit()
    return conn.close()

# Export to SQL
export_to_sqlite(patients_df, doctors_df, appointments_df, procedures_df, appointment_procedures_df)