In [2]:
import pandas as pd

In [3]:
# Read the sample dataset
data = pd.read_csv("Data.csv")

# Compute proportions of each category in the sample dataset
sex_proportions = data['Sex'].value_counts(normalize=True)
age_group_proportions = data['Age_category'].value_counts(normalize=True)
education_level_proportions = data['Highest_education_level'].value_counts(normalize=True)
print(sex_proportions)
print(age_group_proportions)
print(education_level_proportions)

1    0.515
2    0.485
Name: Sex, dtype: float64
2    0.590
1    0.355
3    0.055
Name: Age_category, dtype: float64
2    0.49
3    0.22
0    0.16
1    0.13
Name: Highest_education_level, dtype: float64


In [4]:
# Population characteristics from Table 2 (given)
population_characteristics = {
    'Sex': {1: 25324, 2: 24676},
    'Age_category': {1: 17955, 2: 29642, 3: 2403},
    'Highest_education_level': {0: 7490, 1: 5655, 2: 24400, 3: 12455}
}

In [5]:
# Initialize an empty dictionary to store the required agents for each category
required_agents = {}

# Iterate over each category and its corresponding proportions in population_characteristics
for category, proportions in population_characteristics.items():
    # Determine the multiplier based on the dataset size
    # The multiplier is capped at 50000 and is proportional to one-tenth of the dataset size
    # we can Adjust the divisor (currently 10) as needed for different scaling
#     multiplier = min(50000, len(data) / 10)

    # Calculate the required agents for each category based on proportions and multiplier
    category_agents = {}
    for k, v in proportions.items():
        # Multiply the proportion by the multiplier and round to the nearest integer
        category_agents[k] = round(v * 250)
    
    # Store the calculated required agents for the current category in the required_agents dictionary
    required_agents[category] = category_agents
print(required_agents)

{'Sex': {1: 6331000, 2: 6169000}, 'Age_category': {1: 4488750, 2: 7410500, 3: 600750}, 'Highest_education_level': {0: 1872500, 1: 1413750, 2: 6100000, 3: 3113750}}


In [6]:
# Initialize empty list to store synthesized population
synthesized_population = []
for category in required_agents:
    for category_value, count in required_agents[category].items():
        # Select individuals randomly from the sample dataset
        selected_individuals = data[data[category] == category_value].sample(n=count, replace=True)
        # Append selected individuals to synthesized population
        synthesized_population.extend(selected_individuals.to_dict('records'))
print(synthesized_population[:10])

[{'Sex': 1, 'Age_category': 3, 'Highest_education_level': 0}, {'Sex': 1, 'Age_category': 2, 'Highest_education_level': 2}, {'Sex': 1, 'Age_category': 2, 'Highest_education_level': 3}, {'Sex': 1, 'Age_category': 1, 'Highest_education_level': 0}, {'Sex': 1, 'Age_category': 2, 'Highest_education_level': 3}, {'Sex': 1, 'Age_category': 3, 'Highest_education_level': 0}, {'Sex': 1, 'Age_category': 2, 'Highest_education_level': 2}, {'Sex': 1, 'Age_category': 2, 'Highest_education_level': 2}, {'Sex': 1, 'Age_category': 2, 'Highest_education_level': 2}, {'Sex': 1, 'Age_category': 1, 'Highest_education_level': 2}]


In [7]:
# Convert the synthesized population into a DataFrame
synthesized_df = pd.DataFrame(synthesized_population)

# Calculate the frequencies of each category in the synthesized population
synthesized_sex_freq = synthesized_df['Sex'].value_counts()
synthesized_age_freq = synthesized_df['Age_category'].value_counts()
synthesized_education_freq = synthesized_df['Highest_education_level'].value_counts()

# Compare the frequencies with the population characteristics from Table 2
print("Validation Results:")
print("Sex Frequencies Match:", synthesized_sex_freq.equals(population_characteristics['Sex']))
print("Age Group Frequencies Match:", synthesized_age_freq.equals(population_characteristics['Age_category']))
print("Education Level Frequencies Match:", synthesized_education_freq.equals(population_characteristics['Highest_education_level']))


Validation Results:
Sex Frequencies Match: False
Age Group Frequencies Match: False
Education Level Frequencies Match: False
