# Distribution

In [1]:
# import libraries

import json
import random

import pandas as pd

### Health records

In [2]:
# load gender and age data
# (country-level population figures and the life insurance policyholder brackets)

data_folder = '../../data/demographics'

# explicit UTF-8, like the municipality, occupation and caption loads, so the
# decoding does not depend on the platform's default locale encoding
with open(f"{data_folder}/population/country.json", 'r', encoding='utf-8') as file:
    country = json.load(file)

with open(f"{data_folder}/population/life_insurance.json", 'r', encoding='utf-8') as file:
    life_insurance = json.load(file)

In [3]:
# read the electronic health records patients table into a DataFrame

data_folder = '../../data/health_records'

patients_csv = f"{data_folder}/patients.csv"
patients = pd.read_csv(patients_csv)

In [4]:
# share of males and females in the population (fractions rounded to 3 decimals)

total_population = country['total']
male_percentage, female_percentage = (
    round(country[key] / total_population, 3) for key in ('males', 'females')
)

print(f"Percentage of males: {male_percentage}")
print(f"Percentage of females: {female_percentage}")

Percentage of males: 0.478
Percentage of females: 0.522


In [5]:
# age distribution in population, as (from, to, share) tuples
# note: total_population is rebound here to the sum over the age groups

age_groups = country['age_groups']
population_values = [entry['value'] for entry in age_groups]
total_population = sum(population_values)

age_percentages = [
    (entry['from'], entry['to'], round(value / total_population, 3))
    for entry, value in zip(age_groups, population_values)
]

age_percentages

[(15, 19, 0.062),
 (20, 24, 0.067),
 (25, 29, 0.067),
 (30, 34, 0.07),
 (35, 39, 0.073),
 (40, 44, 0.085),
 (45, 49, 0.096),
 (50, 54, 0.092),
 (55, 59, 0.088),
 (60, 64, 0.086),
 (65, 69, 0.08),
 (70, 74, 0.073),
 (75, 79, 0.061)]

In [6]:
# age distribution in life insurance policyholders, as (from, to, share) tuples
# in the fixed order youth -> adults -> seniors

life_percentages = [
    (bracket['from'], bracket['to'], bracket['percentage'])
    for bracket in (life_insurance[name] for name in ('youth', 'adults', 'seniors'))
]

life_percentages

[(15, 34, 0.046), (35, 54, 0.571), (55, 79, 0.383)]

In [7]:
# target age percentages - each life insurance bracket's share is split across the
# population age groups it overlaps, proportionally to those groups' population shares

target_percentages = []

for policy_from, policy_to, policy_percentage in life_percentages:
    # population age groups whose range intersects this policyholder bracket
    overlapping = [
        (start, end, share)
        for start, end, share in age_percentages
        if end >= policy_from and start <= policy_to
    ]

    # combined population share of the overlapping groups (accumulated in order)
    bracket_share = 0
    for _, _, share in overlapping:
        bracket_share += share

    for start, end, share in overlapping:
        weighted = round((share / bracket_share) * policy_percentage, 3)
        target_percentages.append((start, end, weighted))

target_percentages

[(15, 19, 0.011),
 (20, 24, 0.012),
 (25, 29, 0.012),
 (30, 34, 0.012),
 (35, 39, 0.12),
 (40, 44, 0.14),
 (45, 49, 0.158),
 (50, 54, 0.152),
 (55, 59, 0.087),
 (60, 64, 0.085),
 (65, 69, 0.079),
 (70, 74, 0.072),
 (75, 79, 0.06)]

In [8]:
# count available records for each age group, overall and per gender code

counts = [
    {'from': bounds['from'], 'to': bounds['to'], 'count': 0, 'M': 0, 'F': 0}
    for bounds in age_groups
]

for patient in patients.itertuples():
    # age is derived from the birth year only, relative to 2023
    birth_year = int(patient.BIRTHDATE.split('-')[0])
    age = 2023 - birth_year
    # first age group containing this age; patients outside every group are skipped
    bucket = next((item for item in counts if item['from'] <= age <= item['to']), None)
    if bucket is not None:
        bucket['count'] += 1
        bucket[patient.GENDER] += 1

total_count = sum(item['count'] for item in counts)
print(f"Total count: {total_count}")

counts

Total count: 889


[{'from': 15, 'to': 19, 'count': 70, 'M': 39, 'F': 31},
 {'from': 20, 'to': 24, 'count': 71, 'M': 26, 'F': 45},
 {'from': 25, 'to': 29, 'count': 73, 'M': 38, 'F': 35},
 {'from': 30, 'to': 34, 'count': 77, 'M': 38, 'F': 39},
 {'from': 35, 'to': 39, 'count': 71, 'M': 35, 'F': 36},
 {'from': 40, 'to': 44, 'count': 60, 'M': 30, 'F': 30},
 {'from': 45, 'to': 49, 'count': 73, 'M': 38, 'F': 35},
 {'from': 50, 'to': 54, 'count': 77, 'M': 29, 'F': 48},
 {'from': 55, 'to': 59, 'count': 75, 'M': 29, 'F': 46},
 {'from': 60, 'to': 64, 'count': 68, 'M': 26, 'F': 42},
 {'from': 65, 'to': 69, 'count': 67, 'M': 32, 'F': 35},
 {'from': 70, 'to': 74, 'count': 64, 'M': 33, 'F': 31},
 {'from': 75, 'to': 79, 'count': 43, 'M': 22, 'F': 21}]

In [9]:
# counts that would reflect the target percentages
# (truncate each ideal count, then hand the leftover records to the groups with
# the largest fractional remainders so the total matches total_count)

raw_counts = [percentage * total_count for (_, _, percentage) in target_percentages]
targets = [int(value) for value in raw_counts]

current_total = sum(targets)
diff = total_count - current_total

# (group index, fractional part lost by truncation), largest remainder first
remainders = sorted(
    ((position, raw_counts[position] - targets[position]) for position in range(len(targets))),
    key=lambda pair: pair[1],
    reverse=True,
)

for rank in range(diff):
    position = remainders[rank][0]
    targets[position] += 1

targets

[10, 11, 11, 11, 107, 124, 140, 135, 77, 76, 70, 64, 53]

In [10]:
# all possible percentages to trim all counts proportionally
# one candidate per age group: the fraction of its target that exceeds the
# available records (negative when more records are available than targeted)

trim_percentages = [
    round((targets[index] - item['count']) / targets[index], 3)
    for index, item in enumerate(counts)
]

trim_percentages

[-6.0,
 -5.455,
 -5.636,
 -6.0,
 0.336,
 0.516,
 0.479,
 0.43,
 0.026,
 0.105,
 0.043,
 0.0,
 0.189]

In [11]:
def apply_trim(targets : list[int], trim_percentage : float) -> list[int]:
    '''
    Scales all target counts by the same trim fraction

    Each target is reduced by round(trim_percentage * target); a negative
    fraction therefore increases the counts. Returns a new list and leaves
    the input untouched.
    '''
    return [count - round(trim_percentage * count) for count in targets]

In [12]:
def valid_counts(trimmed : list[int], counts : list[dict]) -> bool:
    '''
    Verifies if the trimmed counts are all equal or inferior to the real available counts

    trimmed: candidate count per age group
    counts: per age group dictionaries; only the 'count' entry is read, and the
            list is matched to `trimmed` by position

    Returns True when no trimmed count exceeds the available count at the same
    position (also True for an empty `trimmed`), False otherwise.
    '''
    return all(
        count <= counts[index]['count']
        for index, count in enumerate(trimmed)
    )

In [13]:
# trim the real counts to match the target proportions with minimal total loss
# every candidate trim percentage is applied to all targets; among the candidates
# that fit within the available records, the one keeping the most records wins

highest_total = 0
adjusted_counts = None

for trim_percentage in trim_percentages:
    trimmed = apply_trim(targets, trim_percentage)
    if valid_counts(trimmed, counts):
        total = sum(trimmed)
        if total > highest_total:
            # track the best total so far; without this update the last valid
            # candidate would be kept instead of the largest one
            highest_total = total
            adjusted_counts = trimmed

if adjusted_counts is None:
    # fail with a clear message instead of a TypeError from sum(None) below
    raise ValueError("No trim percentage yields counts within the available records")

print(f"Initial total counts: {total_count}")
total_adjusted_counts = sum(adjusted_counts)
print(f"Adjusted total counts: {total_adjusted_counts}")

adjusted_counts

Initial total counts: 889
Adjusted total counts: 430


[5, 5, 5, 5, 52, 60, 68, 65, 37, 37, 34, 31, 26]

In [14]:
# optionally reduce the test dataset by removing 80% of every adjusted count
# (the list is updated in place, so running this cell again shrinks it further)

adjusted_counts[:] = [round(count - 0.8 * count) for count in adjusted_counts]

total_adjusted_counts = sum(adjusted_counts)
print(f"Initial total counts: {total_count}")
print(f"Adjusted total counts: {total_adjusted_counts}")

adjusted_counts

Initial total counts: 889
Adjusted total counts: 85


[1, 1, 1, 1, 10, 12, 14, 13, 7, 7, 7, 6, 5]

In [15]:
# final counts that will be considered for realistic retrieval of records
# 'M' and 'F' are the number of male / female records to draw per age group
# NOTE(review): the 'count' values equal the adjusted counts before the optional
# reduction, while M + F add up to the reduced counts - confirm which is intended

final_counts = [
    {'from': 15, 'to': 19, 'count': 5, 'M': 1, 'F': 0},
    {'from': 20, 'to': 24, 'count': 5, 'M': 0, 'F': 1},
    {'from': 25, 'to': 29, 'count': 5, 'M': 1, 'F': 0},
    {'from': 30, 'to': 34, 'count': 5, 'M': 0, 'F': 1},
    {'from': 35, 'to': 39, 'count': 52, 'M': 5, 'F': 5},
    {'from': 40, 'to': 44, 'count': 60, 'M': 6, 'F': 6},
    {'from': 45, 'to': 49, 'count': 68, 'M': 7, 'F': 7},
    {'from': 50, 'to': 54, 'count': 65, 'M': 6, 'F': 7},
    {'from': 55, 'to': 59, 'count': 37, 'M': 3, 'F': 4},
    {'from': 60, 'to': 64, 'count': 37, 'M': 3, 'F': 4},
    {'from': 65, 'to': 69, 'count': 34, 'M': 3, 'F': 4},
    {'from': 70, 'to': 74, 'count': 31, 'M': 3, 'F': 3},
    {'from': 75, 'to': 79, 'count': 26, 'M': 2, 'F': 3}
]

# the denominator comes from final_counts itself, so the achieved percentages do
# not depend on whether the optional reduction cell was run beforehand
final_males = sum(item['M'] for item in final_counts)
final_females = sum(item['F'] for item in final_counts)
final_total = final_males + final_females

final_male_percentage = round(final_males / final_total, 3)
final_female_percentage = round(final_females / final_total, 3)

print(f"Real percentage of males: {male_percentage}")
print(f"Achieved: {final_male_percentage}\n")

print(f"Real percentage of females: {female_percentage}")
print(f"Achieved: {final_female_percentage}")

Real percentage of males: 0.478
Achieved: 0.471

Real percentage of females: 0.522
Achieved: 0.529


### Municipalities

In [16]:
# read the population of every municipality (name -> population mapping)

data_folder = '../../data/demographics'

municipalities_path = f"{data_folder}/population/municipalities.json"
with open(municipalities_path, 'r', encoding='utf-8') as handle:
    municipalities = json.load(handle)

In [17]:
# probabilities for each municipality: its share of the summed population,
# rounded to 3 decimals

municipalities_population = sum(municipalities.values())

municipality_probabilities = {
    name: round(population / municipalities_population, 3)
    for name, population in municipalities.items()
}

municipality_probabilities

{'Alfândega da Fé': 0.001,
 'Amares': 0.005,
 'Braga': 0.036,
 'Bragança': 0.008,
 'Gondomar': 0.041,
 'Maia': 0.033,
 'Matosinhos': 0.042,
 'Monção': 0.004,
 'Paredes': 0.021,
 'Penafiel': 0.017,
 'Porto': 0.056,
 'Póvoa de Lanhoso': 0.005,
 'Santo Tirso': 0.016,
 'Valença': 0.003,
 'Valongo': 0.006,
 'Viana do Castelo': 0.021,
 'Vila Nova de Famalicão': 0.01,
 'Vila Pouca de Aguiar': 0.003,
 'Vila Real': 0.004,
 'Alenquer': 0.011,
 'Coimbra': 0.026,
 'Figueira da Foz': 0.015,
 'Guarda': 0.006,
 'Lourinhã': 0.006,
 'Lousã': 0.004,
 'Miranda do Corvo': 0.002,
 'Pombal': 0.012,
 'Soure': 0.004,
 'Tábua': 0.003,
 'Torres Vedras': 0.02,
 'Alcochete': 0.005,
 'Almada': 0.021,
 'Amadora': 0.042,
 'Barreiro': 0.019,
 'Lisboa': 0.133,
 'Loures': 0.049,
 'Montijo': 0.014,
 'Odivelas': 0.014,
 'Oeiras': 0.042,
 'Palmela': 0.005,
 'Seixal': 0.041,
 'Sesimbra': 0.013,
 'Setúbal': 0.03,
 'Vila Franca de Xira': 0.033,
 'Almodôvar': 0.002,
 'Alvito': 0.001,
 'Avis': 0.001,
 'Azambuja': 0.002,
 'Barr

### Occupations

In [18]:
# read the occupation categories and their population figures

data_folder = '../../data/demographics'

occupations_path = f"{data_folder}/population/occupations.json"
with open(occupations_path, 'r', encoding='utf-8') as handle:
    occupations = json.load(handle)

In [19]:
# probabilities for each occupation category: category population over the
# top-level 'population' entry, which is itself not a category and is skipped

occupation_probabilities = {
    name: round(category['population'] / occupations['population'], 3)
    for name, category in occupations.items()
    if name != 'population'
}

occupation_probabilities

{'armed forces occupations': 0.005,
 'managers': 0.061,
 'professionals': 0.182,
 'technicians and associate professionals': 0.106,
 'clerical support workers': 0.096,
 'service and sales workers': 0.184,
 'skilled agricultural, forestry and fishery workers': 0.022,
 'craft and related trades workers': 0.136,
 'plant and machine operators, and assemblers': 0.053}

### Applicants

In [20]:
def find_patient(applicants : dict, age_group : tuple[int, int], gender : str) -> bool:
    '''
    Finds a new patient with a certain age and gender

    Scans the global `patients` table in row order and registers the first
    patient that is inside the age group, has the requested gender code and
    is not an applicant yet.

    applicants: mapping of patient id -> applicant data, updated in place
    age_group: inclusive (from, to) age bounds
    gender: gender code as stored in the GENDER column; 'M' is saved as
            'male', any other code as 'female'

    Returns True when a patient was added, False when no match is left.
    '''
    youngest, oldest = age_group[0], age_group[1]
    for patient in patients.itertuples():
        # age is derived from the birth year only, relative to 2023
        age = 2023 - int(patient.BIRTHDATE.split('-')[0])
        if not (youngest <= age <= oldest and patient.GENDER == gender):
            continue
        if patient.Id in applicants:
            # already assigned by an earlier call
            continue
        applicants[patient.Id] = {
            'age': age,
            'gender': 'male' if gender == 'M' else 'female'
        }
        return True
    return False

In [21]:
# assign patients matching the intended distributions
# for every age group, draw the requested number of males first, then females

applicants = {}

for item in final_counts:
    age_group = (item['from'], item['to'])
    for gender in ('M', 'F'):
        for _ in range(item[gender]):
            find_patient(applicants, age_group, gender)

print(f"{len(applicants)} patients assigned")

85 patients assigned


In [22]:
def assign_municipality() -> str:
    '''
    Draws one municipality name at random, weighted by municipality_probabilities
    '''
    names = list(municipality_probabilities.keys())
    weights = list(municipality_probabilities.values())
    return random.choices(population=names, weights=weights)[0]

In [23]:
def assign_occupation(age : int) -> tuple[str, int]:
    '''
    Picks an occupation for the given age together with an average daily step count

    Ages 15-19 are always 'Student' and ages 65-79 always 'Retired', each with a
    step count drawn from a fixed range. Any other age gets a category drawn by
    occupation_probabilities, a uniformly chosen occupation of that category and
    its listed step count shifted by at most 500 in either direction.
    '''
    if 15 <= age <= 19:
        return 'Student', random.randint(4000, 9000)

    if 65 <= age <= 79:
        return 'Retired', random.randint(2000, 5000)

    category = random.choices(
        population=list(occupation_probabilities.keys()),
        weights=list(occupation_probabilities.values())
    )[0]
    step_table = occupations[category]['steps']
    occupation = random.choice(list(step_table.keys()))
    baseline = step_table[occupation]
    return occupation, random.randint(baseline - 500, baseline + 500)

In [24]:
# read the processed social media captions

data_folder = '../../data/social_media'

captions_path = f"{data_folder}/processed_captions.json"
with open(captions_path, 'r', encoding='utf-8') as handle:
    captions = json.load(handle)

In [25]:
def gender_mentioned(tokens : list[str]) -> str:
    '''
    Checks for gender terms in the caption

    tokens: lowercased, whitespace-split words of the caption; membership is
            tested per token, so 'man.' or 'mankind' do not count as 'man'

    Returns 'male' when only 'men'/'man' appear, 'female' when only
    'women'/'woman' appear, and 'other' when both or neither appear.
    '''
    mentions_male = 'men' in tokens or 'man' in tokens
    mentions_female = 'women' in tokens or 'woman' in tokens
    if mentions_male and not mentions_female:
        return 'male'
    if mentions_female and not mentions_male:
        return 'female'
    return 'other'

In [26]:
def assign_social_media_posts(gender : str) -> list[str]:
    '''
    Draws between 1 and 15 pre-generated social media post descriptions at random

    A drawn caption is kept only when it mentions the given gender or is
    classified as 'other'; the same caption may be drawn more than once.
    '''
    posts = []
    num_posts = random.randint(1, 15)
    accepted = (gender, 'other')
    while len(posts) < num_posts:
        caption = random.choice(captions)
        if gender_mentioned(caption.lower().split()) in accepted:
            posts.append(caption)
    return posts

In [27]:
# assign additional data to each applicant
# access starts with health records only; posts and steps are decided later

for applicant in applicants.values():
    municipality = assign_municipality()
    occupation, steps = assign_occupation(applicant['age'])
    posts = assign_social_media_posts(applicant['gender'])
    applicant.update({
        'municipality': municipality,
        'occupation': occupation,
        'steps': steps,
        'posts': posts,
        'access': {'health': True, 'posts': False, 'steps': False},
    })

In [50]:
# access probabilities
# ig_usage holds per age range percentages; access is the probability that a
# selected applicant also shares posts

data_folder = '../../data/demographics'

# explicit UTF-8, like the municipality, occupation and caption loads, so the
# decoding does not depend on the platform's default locale encoding
with open(f"{data_folder}/population/social_media.json", 'r', encoding='utf-8') as file:
    ig_usage = json.load(file)

access = 0.4

In [51]:
# assign access levels provided by each applicant

# bucket applicant ids by the first usage age range containing their age;
# applicants outside every range keep the access assigned earlier
grouped = {f"{grp['from']}-{grp['to']}": [] for grp in ig_usage}
for applicant_id, applicant in applicants.items():
    for grp in ig_usage:
        if grp['from'] <= applicant['age'] <= grp['to']:
            key = f"{grp['from']}-{grp['to']}"
            grouped[key].append(applicant_id)
            break

total_applicants = len(applicants)

for grp in ig_usage:
    key = f"{grp['from']}-{grp['to']}"
    group_people = grouped[key]
    n = len(group_people)
    # the range's percentage is applied to the total number of applicants and
    # capped at the number of applicants available in the range
    target_count = round(grp['percentage'] * total_applicants)
    assign_count = min(target_count, n)
    # a set gives constant-time membership checks in the loop below
    selected = set(random.sample(group_people, assign_count)) if assign_count > 0 else set()

    for applicant_id in group_people:
        is_selected = applicant_id in selected
        applicants[applicant_id]['access'] = {
            'health': True,
            # selected applicants share posts with probability `access`
            'posts': is_selected and random.random() <= access,
            'steps': is_selected
        }

In [53]:
# write the applicants to disk as indented JSON, keeping non-ASCII characters as-is

data_folder = '../../data/applicants'

applicants_path = f"{data_folder}/applicants.json"
with open(applicants_path, 'w', encoding='utf-8') as handle:
    json.dump(applicants, handle, indent=4, ensure_ascii=False)