<a href="https://colab.research.google.com/github/CrisGiacomazzi/Indigenous-Health-Access/blob/main/Datasets_indigenous.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# DATASET 1

import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of observations
n = 260

# Column 1: Patient ID (sequential, zero-padded labels "001" .. "260")
patient_ids = [f"{i:03}" for i in range(1, n + 1)]

# Column 2: Indigenous Community
# Stated shares: First Nations 5.8%, Métis 3.5%, Inuit 0.4% (9.7% in total).
# Every simulated patient belongs to one of these three groups, so each share
# is divided by the 9.7% total to obtain sampling probabilities that sum to 1.
# This is the only draw for this column: the seeded random stream is consumed
# solely by values that end up in the dataset.
adjusted_probs = [0.058 / 0.097, 0.035 / 0.097, 0.004 / 0.097]
indigenous_communities = np.random.choice(
    ["First Nations", "Métis", "Inuit"],
    size=n,
    p=adjusted_probs
)

# Column 3: HLQ Before (ordinal score, integers 1 to 4 inclusive;
# randint's upper bound of 5 is exclusive)
hlq_before = np.random.randint(1, 5, size=n)

# Column 4: HLQ After
# Each score is raised by one point and capped at the scale maximum of 4,
# so a patient already at 4 stays at 4.
hlq_after = np.minimum(hlq_before + 1, 4)

# Create DataFrame
data = pd.DataFrame({
    "Patient ID": patient_ids,
    "Indigenous Community": indigenous_communities,
    "HLQ Before": hlq_before,
    "HLQ After": hlq_after
})

# Write the generated table to a CSV file, omitting the pandas row index
csv_path = "health_literacy_data.csv"
data.to_csv(csv_path, index=False)

# Final expression of the cell: a FileLink object pointing at the file just
# written (shown as a clickable link in notebook front-ends)
from IPython.display import FileLink
FileLink(csv_path)

In [5]:
# DATASET 2

# Seed NumPy's global generator so the draws below are reproducible
np.random.seed(42)

# Number of observations
n = 260

# Column 1: Patient ID (zero-padded labels "001" .. "260")
patient_ids = [f"{i:03}" for i in range(1, n + 1)]

# Columns 2-7: independent Yes/No answers.
# Each entry maps a column name to its [P("Yes"), P("No")] pair. The dict's
# insertion order fixes both the order of the random draws and the column
# order of the resulting table.
response_probs = {
    "Racism_discrimination_before": [0.15, 0.85],         # 15% yes
    "Racism_discrimination_after": [0.05, 0.95],          # 5% yes
    "Cultural_healthcare_services_before": [0.13, 0.87],  # 13% yes
    "Cultural_healthcare_services_after": [0.23, 0.77],   # 23% yes
    "Traditional_methods_before": [0.10, 0.90],           # 10% yes
    "Traditional_methods_after": [0.20, 0.80],            # 20% yes
}
responses = {
    column: np.random.choice(["Yes", "No"], size=n, p=probs)
    for column, probs in response_probs.items()
}

# Expose each column under its own name as well
(
    racism_before,
    racism_after,
    cultural_before,
    cultural_after,
    traditional_before,
    traditional_after,
) = responses.values()

# Assemble the table: patient IDs first, then the six answer columns
data = pd.DataFrame({"Patient ID": patient_ids, **responses})

# Write the generated table to an Excel workbook, omitting the pandas row index
xlsx_path = "health_literacy_dataset_2.xlsx"
data.to_excel(xlsx_path, index=False)

# Final expression of the cell: a FileLink object pointing at the file just
# written (shown as a clickable link in notebook front-ends)
from IPython.display import FileLink
FileLink(xlsx_path)

In [7]:
# DATASET 3
# Seed NumPy's global generator so every run yields the same table
np.random.seed(42)

# Number of observations
n = 260

# Column 1: Patient ID (zero-padded labels "001" .. "260")
patient_ids = [f"{i:03}" for i in range(1, n + 1)]

# Column 2: Number_Vis_ER_before
# Continuous triangular draws on [1, 6] with the peak (mode) at 4,
# rounded to whole visit counts.
er_visits_raw = np.random.triangular(1, 4, 6, n)
number_vis_er_before = er_visits_raw.round().astype(int)

# Column 3: Number_Vis_ER_after
# The "before" count scaled by 0.85 (a 15% reduction) and rounded back to an
# integer, so small counts can come out unchanged after rounding. The final
# clip keeps every value inside the 1-6 range.
number_vis_er_after = (
    (number_vis_er_before * 0.85)
    .round()
    .astype(int)
    .clip(1, 6)
)

# Column 4: Cost_ER_before (uniform draw between $386 and $1000, to the cent)
cost_er_before = np.random.uniform(386, 1000, size=n).round(2)

# Column 5: Cost_ER_after (80% of the "before" cost, i.e. a 20% decrease, to the cent)
cost_er_after = (cost_er_before * 0.80).round(2)

# Assemble the table, one row per patient
columns = {
    "Patient ID": patient_ids,
    "Number_Vis_ER_before": number_vis_er_before,
    "Number_Vis_ER_after": number_vis_er_after,
    "Cost_ER_before": cost_er_before,
    "Cost_ER_after": cost_er_after,
}
data = pd.DataFrame(columns)

# Serialise the table as a JSON array with one object per row,
# pretty-printed with a 4-space indent
json_data = data.to_json(orient="records", indent=4)

# Write the JSON text to disk
json_path = "health_literacy_dataset_3.json"
with open(json_path, "w") as out_file:
    out_file.write(json_data)

# Final expression of the cell: a FileLink object pointing at the file just
# written (shown as a clickable link in notebook front-ends)
from IPython.display import FileLink
FileLink(json_path)