In [6]:
import pandas as pd

# Pull the "Prevalence" worksheet out of the workbook; openpyxl is the
# engine pandas uses here to parse the .xlsx file.
df = pd.read_excel('full_data.xlsx', sheet_name='Prevalence', engine='openpyxl')

# Write the sheet out as a flat CSV, leaving the positional index out of the file.
df.to_csv('prevalence.csv', index=False)
print("Saved 'prevalence' sheet to 'prevalence.csv'.")


Saved 'prevalence' sheet to 'prevalence.csv'.


In [7]:
import pandas as pd

# 1. Read back the CSV that was exported from the workbook
df = pd.read_csv('prevalence.csv')

# 2. Propagate the last seen PMID downward into the blank cells below it,
#    so each row carries a PMID value
pmid_filled = df['PMID'].ffill()
df = df.assign(PMID=pmid_filled)

# 3. Write the filled table to its own CSV (index column omitted)
df.to_csv('prevalence_filled.csv', index=False)
print("Done — all PMIDs filled and saved to 'prevalence_filled.csv'.")


Done — all PMIDs filled and saved to 'prevalence_filled.csv'.


In [9]:
import pandas as pd

# 1. Read the PMID-filled table with a fresh 0..n-1 index
df = pd.read_csv('prevalence_filled.csv').reset_index(drop=True)

# 2. Discard every row that lacks one of the fields required downstream
required_fields = [
    'age_start', 'age_end', 'year_start', 'year_end',
    'Sample size', 'proportion',
    'Type (Any, early to intermediate, late-wet, late-dry)',
]
df = df.dropna(subset=required_fields)

# 3. Age and year bounds become integers in a single cast
df = df.astype({
    'age_start': int,
    'age_end': int,
    'year_start': int,
    'year_end': int,
})

# 4-6. Derived columns:
#      area      -> Nation where present, Region as the fallback
#      data_type -> the constant 'p'
#      sex       -> lower-cased copy of Sex
df = df.assign(
    area=df['Nation'].fillna(df['Region']),
    data_type='p',
    sex=df['Sex'].str.lower(),
)

# 7. Expose the surviving row labels as an 'index' column, pick the output
#    columns in their final order, and give them their output names
column_order = [
    'index',
    'sex',
    'area',
    'Sample size',
    'data_type',
    'Type (Any, early to intermediate, late-wet, late-dry)',
    'proportion',
    'standard error',
    'age_start',
    'age_end',
    'year_start',
    'year_end',
]
output_names = {
    'Sample size': 'effective_sample_size',
    'proportion': 'value',
    'standard error': 'standard_error',
    'Type (Any, early to intermediate, late-wet, late-dry)': 'type',
}
out = df.reset_index()[column_order].rename(columns=output_names)

# 8. Write the selected columns to disk
out.to_csv('prevalence_filled_selected.csv', index=False)
print("Saved with columns: index, sex, area, effective_sample_size, data_type, type, value, standard_error, age_start, age_end, year_start, year_end.")


Saved with columns: index, sex, area, effective_sample_size, data_type, type, value, standard_error, age_start, age_end, year_start, year_end.


In [12]:
import pandas as pd

# 1. Load the selected columns written by the previous step
df = pd.read_csv('prevalence_filled_selected.csv')

# 2. Canonical spelling for every recognised type label, keyed by the
#    whitespace-stripped, lower-cased form of the raw label
type_map = {
    'late':                   'Late',
    'early to intermediate':  'Early to intermediate',
    'any':                    'Any',
    'late-dry':               'Late-dry',
    'late-wet':               'Late-wet',
    'intermediate':           'Intermediate',
    'early':                  'Early'
}

# 3. Normalise the raw labels and look them up.  Series.map yields NaN for
#    any key that is missing from type_map.
raw_type = df['type']
canonical_type = raw_type.str.strip().str.lower().map(type_map)

# 4. Report labels that were present but not recognised.  Without this they
#    would silently become NaN, and value_counts() below omits NaN by default.
unrecognised = raw_type[canonical_type.isna() & raw_type.notna()]
if not unrecognised.empty:
    print(
        f"WARNING: {len(unrecognised)} rows have an unrecognised type and were set to NaN:",
        unrecognised.unique().tolist()
    )

df['type'] = canonical_type

# 5. Check
print(df['type'].value_counts())

# 6. Save (overwrites the input file with the canonical labels)
df.to_csv('prevalence_filled_selected.csv', index=False)


Late                     1555
Early to intermediate     978
Any                       939
Late-dry                  712
Late-wet                  707
Intermediate              612
Early                     254
Name: type, dtype: int64


In [14]:
import pandas as pd

# 1. Read the table with canonical type labels
df = pd.read_csv('prevalence_filled_selected.csv')

# 2. Select the rows whose type is one of the three late categories, renumber
#    them from zero, and discard the 'type' column
keep_types = ['Late', 'Late-dry', 'Late-wet']
is_late = df['type'].isin(keep_types)
df_filtered = df.loc[is_late].reset_index(drop=True).drop(columns='type')

# 3. Show the remaining columns as a quick confirmation that 'type' is gone
print("Columns now present:", list(df_filtered.columns))

# 4. Write the late-category rows to their own CSV
df_filtered.to_csv('prevalence_late_categories.csv', index=False)
print("\nSaved filtered data without 'type' column to 'prevalence_late_categories.csv'")


Columns now present: ['index', 'sex', 'area', 'effective_sample_size', 'data_type', 'value', 'standard_error', 'age_start', 'age_end', 'year_start', 'year_end']

Saved filtered data without 'type' column to 'prevalence_late_categories.csv'


In [None]:
import pandas as pd

# Read the late-category rows
df = pd.read_csv('prevalence_late_categories.csv')

# Show each distinct area label once, in order of first appearance
distinct_areas = df['area'].unique()
print(distinct_areas)


['US' 'Austrailia' 'Netherland' 'Finland' 'Barbados' 'Italy' 'Greece'
 'Japan' 'France' 'Iceland' 'India' 'China' '7 countries' 'Norway'
 'Estonia' 'Northern Ireland' 'Spain' 'Greenland' 'Brazil' 'Taiwan'
 'Singapore' 'Thailand' 'UK' 'Germany' 'Kenya' 'Netherlands' 'South Korea'
 'Ireland' 'Algeria' 'Portugal' 'Slovakia' 'Russia' 'Iran']


In [17]:
import pandas as pd

# 1. Read the late-category rows
df = pd.read_csv('prevalence_late_categories.csv')

# 2. Lookup from the free-text area labels to ISO3 codes.  Keys are matched
#    verbatim against the data, so spellings such as 'Austrailia' and
#    'Netherland' are kept exactly as they are.  Greenland is coded as Denmark.
area_to_iso = {
    'US': 'USA',
    'Austrailia': 'AUS',
    'Netherland': 'NLD',
    'Netherlands': 'NLD',
    'Finland': 'FIN',
    'Barbados': 'BRB',
    'Italy': 'ITA',
    'Greece': 'GRC',
    'Japan': 'JPN',
    'France': 'FRA',
    'Iceland': 'ISL',
    'India': 'IND',
    'China': 'CHN',
    'Norway': 'NOR',
    'Estonia': 'EST',
    'Northern Ireland': 'GBR',
    'Spain': 'ESP',
    'Greenland': 'DNK',
    'Brazil': 'BRA',
    'Taiwan': 'TWN',
    'Singapore': 'SGP',
    'Thailand': 'THA',
    'UK': 'GBR',
    'United Kingdom': 'GBR',
    'South Korea': 'KOR',
    'Germany': 'DEU',
    'Kenya': 'KEN',
    'Ireland': 'IRL',
    'Algeria': 'DZA',
    'Portugal': 'PRT',
    'Slovakia': 'SVK',
    'Russia': 'RUS',
    'Iran': 'IRN',
}

# 3. Hold on to the raw labels, then overwrite 'area' with the ISO3 code
#    (labels absent from the lookup come back as NaN)
orig_area = df['area'].copy()
df['area'] = orig_area.map(area_to_iso)

# 4. Work out which rows failed to map and which raw labels they carried
mask_unmapped = df['area'].isna()
num_dropped = mask_unmapped.sum()
unmapped_vals = orig_area[mask_unmapped].unique().tolist()

# 5. Report what is about to be removed
print(f"Dropped {num_dropped} rows with unmapped area.")
print("Unmapped area values were:", unmapped_vals)

# 6. Keep only the mapped rows, renumbered from zero
df = df.loc[~mask_unmapped].reset_index(drop=True)

# 7. Write the country-coded table
df.to_csv('prevalence_late_categories_country_code.csv', index=False)



Dropped 90 rows with unmapped area.
Unmapped area values were: ['7 countries']


In [18]:
import pandas as pd
import numpy as np

# 1. Load the country-coded CSV from disk, so this cell works on a fresh
#    kernel instead of depending on a `df` left behind by an earlier cell
df = pd.read_csv('prevalence_late_categories_country_code.csv')

# 2. Replace exact-zero values with 1 / (2 * effective_sample_size), then
#    recompute the standard error of those rows from the adjusted value as
#    sqrt(value * (1 - value) / effective_sample_size)
mask_zero = df['value'] == 0
df.loc[mask_zero, 'value'] = 1 / (2 * df.loc[mask_zero, 'effective_sample_size'])
df.loc[mask_zero, 'standard_error'] = np.sqrt(
    df.loc[mask_zero, 'value'] * (1 - df.loc[mask_zero, 'value']) / df.loc[mask_zero, 'effective_sample_size']
)

# 3. Cast effective_sample_size to integer
df['effective_sample_size'] = df['effective_sample_size'].astype(int)

# 4. Drop rows where value > 1
mask_gt1 = df['value'] > 1
num_gt1 = mask_gt1.sum()
df = df[~mask_gt1]

# 5. Report counts
print(f"Adjusted {mask_zero.sum()} rows where 'value' was zero.")
print(f"Removed {num_gt1} rows where 'value' exceeded 1.")

# 6. Save the result as the input data file
df.to_csv('input_data.csv', index=False)


Adjusted 588 rows where 'value' was zero.
Removed 0 rows where 'value' exceeded 1.


In [None]:
import pandas as pd

# Covariate columns that are removed from the output template
covariate_columns = [
    'x_cv_ascertainment', 'x_cv_diagnostic_criteria',
    'x_cv_representative', 'x_ihme_fao_stimulants_kcal_26oct11',
    'x_smoking_prev',
]

# Read the output template
template = pd.read_csv('output_template.csv')

# This cell overwrites the file it reads, so on a second run the covariates
# are already gone.  Dropping only the columns that are still present keeps
# the cell re-runnable instead of raising KeyError.
columns_present = [name for name in covariate_columns if name in template.columns]
template = template.drop(columns=columns_present)

# Write the trimmed template back over the original file
template.to_csv('output_template.csv', index=False)




In [27]:
# Collect the distinct values of the template's 'area' column
template_area_column = pd.read_csv('output_template.csv')['area']
areas = template_area_column.unique()

# Report how many there are, then list them in sorted order
print(f"Found {len(areas)} unique areas:")
print(sorted(areas))

Found 186 unique areas:
['AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GTM', 'GUY', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG', 'POL', 'PRK', 'PRT', 'PRY', 'PSE', 'Q

In [22]:
# Collect the distinct values of the input data's 'area' column
input_area_column = pd.read_csv('input_data.csv')['area']
input_areas = input_area_column.unique()

# Report how many there are, then show them in order of first appearance
print(f"\nFound {len(input_areas)} unique areas in input data:")
print(input_areas)



Found 30 unique areas in input data:
['USA' 'AUS' 'NLD' 'FIN' 'BRB' 'ITA' 'GRC' 'JPN' 'FRA' 'ISL' 'IND' 'CHN'
 'NOR' 'EST' 'GBR' 'ESP' 'DNK' 'BRA' 'TWN' 'SGP' 'THA' 'DEU' 'KEN' 'KOR'
 'IRL' 'DZA' 'PRT' 'SVK' 'RUS' 'IRN']


In [32]:
import json
import re

def load_jsonc(path):
    """Parse a JSON-with-comments file and return the decoded value.

    Removes ``// ...`` line comments, ``/* ... */`` block comments and
    trailing commas before ``}`` or ``]``, then hands the text to
    ``json.loads``.  The stripping is string-aware: anything inside a
    double-quoted JSON string (for example ``"http://host"`` or ``"a,]"``)
    is left exactly as written.

    Parameters
    ----------
    path : str or path-like
        Location of the file, read as UTF-8.

    Returns
    -------
    The Python object produced by ``json.loads``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the text is still not valid JSON after stripping.
    """
    # Each pattern tries a complete string literal first.  A match that starts
    # with a quote is a string and is kept; any other match is removed.
    string_or_comment = re.compile(r'"(?:\\.|[^"\\])*"|//[^\n]*|/\*.*?\*/', re.S)
    string_or_trailing_comma = re.compile(r'"(?:\\.|[^"\\])*"|,(?=\s*[}\]])', re.S)

    def keep_strings(match):
        token = match.group(0)
        return token if token.startswith('"') else ''

    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    # 1) strip comments; `//[^\n]*` also covers a comment on the final line
    #    that is not followed by a newline
    text = string_or_comment.sub(keep_strings, text)
    # 2) remove trailing commas before } or ]
    text = string_or_trailing_comma.sub(keep_strings, text)
    return json.loads(text)

# Parse the hierarchy and take the first element of every node entry
hierarchy = load_jsonc('hierarchy.jsonc')
all_nodes = []
for node in hierarchy['nodes']:
    all_nodes.append(node[0])

# Keep the names that are exactly three characters long and upper-case
node_names = list(filter(lambda area: len(area) == 3 and area.isupper(), all_nodes))

print(f"Found {len(node_names)} valid area codes in hierarchy:")
print(sorted(node_names))

# Codes present in the hierarchy that the template does not list
in_nodes_not_areas = set(node_names).difference(areas)
print("\nAreas in hierarchy but not in template:")
print(sorted(in_nodes_not_areas))

# Codes present in the template that the hierarchy does not list
in_areas_not_nodes = set(areas).difference(node_names)
print("\nAreas in template but not in hierarchy:")
print(sorted(in_areas_not_nodes))




Found 189 valid area codes in hierarchy:
['AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GTM', 'GUM', 'GUY', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG', 'POL', 'PRI', 'PRK',

In [37]:
import json
from collections import defaultdict

def validate_graph_tree(graph):
    """Check that ``graph`` describes a single-rooted, level-ordered tree.

    Parameters
    ----------
    graph : dict
        ``graph["nodes"]`` is an iterable of ``(name, info)`` pairs where
        ``info["level"]`` is the node's level, and ``graph["edges"]`` is an
        iterable of ``(src, dst, data)`` triples.

    Returns
    -------
    list of str
        One message per problem found, empty when none are found.  Checks,
        in order: exactly one level-0 root; the root has no parent and every
        other node has exactly one; every edge joins known nodes and goes
        down exactly one level; no cycle is reachable from the root; every
        node is reachable from the root.
    """
    # Build level map and edge list
    level_map = {name: info["level"] for name, info in graph["nodes"]}
    edges = [(src, dst) for src, dst, _ in graph["edges"]]
    errors = []

    # Identify root (level 0); with several candidates the first is used for
    # the remaining checks, with none those checks that need a root are skipped
    roots = [n for n, lvl in level_map.items() if lvl == 0]
    if len(roots) != 1:
        errors.append(f"Expected exactly one root, found: {roots}")
    root = roots[0] if roots else None

    # Index the edges once in both directions so later lookups do not need
    # to rescan the whole edge list
    parents = defaultdict(list)
    children = defaultdict(list)
    for src, dst in edges:
        parents[dst].append(src)
        children[src].append(dst)

    # Check parent counts and root parent
    for node, lvl in level_map.items():
        if node == root:
            if parents.get(node):
                errors.append(f"Root node '{root}' should have no parent, has: {parents[node]}")
        else:
            if node not in parents:
                errors.append(f"Node '{node}' (level {lvl}) has no parent")
            elif len(parents[node]) > 1:
                errors.append(f"Node '{node}' has multiple parents: {parents[node]}")

    # Check level consistency on edges
    for src, dst in edges:
        if src not in level_map or dst not in level_map:
            errors.append(f"Edge {src}->{dst} references unknown node(s)")
            continue
        if level_map[dst] != level_map[src] + 1:
            errors.append(f"Edge {src}->{dst} has invalid level jump: {level_map[src]} -> {level_map[dst]}")

    # Check connectivity and absence of cycles
    visited = set()
    rec_stack = set()  # nodes on the current root-to-node path

    def dfs(node):
        if node in rec_stack:
            # Reached a node that is still being explored: a back edge
            errors.append(f"Cycle detected at node '{node}'")
            return
        if node in visited:
            # Already fully explored through another parent; walking it again
            # would only repeat work and duplicate cycle messages
            return
        rec_stack.add(node)
        visited.add(node)
        # .get avoids inserting empty entries into the defaultdict
        for child in children.get(node, ()):
            dfs(child)
        rec_stack.remove(node)

    # Compare against None so a root whose name is falsy (e.g. '') is still walked
    if root is not None:
        dfs(root)
        missing = set(level_map) - visited
        if missing:
            errors.append(f"Unreachable nodes from root '{root}': {missing}")

    return errors

def load_jsonc(path):
    """Parse a JSON-with-comments file and return the decoded value.

    Removes ``// ...`` line comments, ``/* ... */`` block comments and
    trailing commas before ``}`` or ``]``, then hands the text to
    ``json.loads``.  The stripping is string-aware: anything inside a
    double-quoted JSON string (for example ``"http://host"`` or ``"a,]"``)
    is left exactly as written.

    Parameters
    ----------
    path : str or path-like
        Location of the file, read as UTF-8.

    Returns
    -------
    The Python object produced by ``json.loads``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the text is still not valid JSON after stripping.
    """
    # Imported here because this cell does not import `re` itself
    import re

    # Each pattern tries a complete string literal first.  A match that starts
    # with a quote is a string and is kept; any other match is removed.
    string_or_comment = re.compile(r'"(?:\\.|[^"\\])*"|//[^\n]*|/\*.*?\*/', re.S)
    string_or_trailing_comma = re.compile(r'"(?:\\.|[^"\\])*"|,(?=\s*[}\]])', re.S)

    def keep_strings(match):
        token = match.group(0)
        return token if token.startswith('"') else ''

    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    # 1) strip comments; `//[^\n]*` also covers a comment on the final line
    #    that is not followed by a newline
    text = string_or_comment.sub(keep_strings, text)
    # 2) remove trailing commas before } or ]
    text = string_or_trailing_comma.sub(keep_strings, text)
    return json.loads(text)

# Parse the hierarchy file and run the structural checks on it
graph = load_jsonc('hierarchy.jsonc')
errs = validate_graph_tree(graph)

# List every problem on its own line, or confirm that none were found
if errs:
    print("Validation errors found:")
    for err in errs:
        print(" -", err)
else:
    print("Graph is a valid tree structure from root -> levels 1 -> 2 -> 3.")
    


Validation errors found:
 - Node 'ATG' (level 3) has no parent
 - Node 'BHS' (level 3) has no parent
 - Node 'BRB' (level 3) has no parent
 - Node 'BLZ' (level 3) has no parent
 - Node 'CUB' (level 3) has no parent
 - Node 'DOM' (level 3) has no parent
 - Node 'GRD' (level 3) has no parent
 - Node 'GUY' (level 3) has no parent
 - Node 'HTI' (level 3) has no parent
 - Node 'JAM' (level 3) has no parent
 - Node 'PRI' (level 3) has no parent
 - Node 'LCA' (level 3) has no parent
 - Node 'VCT' (level 3) has no parent
 - Node 'SUR' (level 3) has no parent
 - Node 'TTO' (level 3) has no parent
 - Node 'VIR' (level 3) has no parent
 - Node 'BOL' (level 3) has no parent
 - Node 'ECU' (level 3) has no parent
 - Node 'PER' (level 3) has no parent
 - Node 'COL' (level 3) has no parent
 - Node 'CRI' (level 3) has no parent
 - Node 'SLV' (level 3) has no parent
 - Node 'GTM' (level 3) has no parent
 - Node 'HND' (level 3) has no parent
 - Node 'MEX' (level 3) has no parent
 - Node 'NIC' (level 3) h