<a href="https://colab.research.google.com/github/Chu-Yichen/INST0001_Database-System/blob/main/Individual_%26_Survey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import random
from datetime import datetime, timedelta

def random_date(start, end):
    """Return a random datetime between ``start`` and ``end``, both inclusive.

    The result is ``start`` plus a whole number of days, so the time-of-day
    of ``start`` is preserved. Used wherever a date has to be drawn from a
    given range.
    """
    span_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_days))
    return start + offset

def generate_birth_date():
    """Return a random birth date formatted as 'YYYY-MM-DD'.

    Birth dates are drawn between January 1, 1944 and January 1, 2010.
    """
    earliest = datetime(1944, 1, 1)
    latest = datetime(2010, 1, 1)
    picked = random_date(earliest, latest)
    return picked.strftime("%Y-%m-%d")

def calculate_age(birth_date_str, current_year):
    """Return the age an individual reaches in ``current_year``.

    Only the year part of ``birth_date_str`` (the text before the first
    "-") is used; month and day are ignored.
    """
    year_text, _, _ = birth_date_str.partition("-")
    return current_year - int(year_text)

def generate_individual(region_id, personal_id, current_year):
    """Generate one INDIVIDUAL record.

    Args:
        region_id: Identifier of the region the individual lives in.
        personal_id: Primary key of the individual; the caller is
            responsible for passing a unique value.
        current_year: Year in which the record is created.

    Returns:
        A dict with the keys ``personal_id``, ``date_of_birth`` (random,
        from 1944-01-01 to 2010-01-01, formatted 'YYYY-MM-DD'), ``sex``
        (49% male, 49% female, 2% 'prefer not to say'),
        ``lives_in_region_id`` and ``created_year``.
    """
    sex = random.choices(["male", "female", "prefer not to say"], weights=[49, 49, 2])[0]
    return {
        "personal_id": personal_id,
        "date_of_birth": generate_birth_date(),
        "sex": sex,
        "lives_in_region_id": region_id,
        # Recorded for survey eligibility (returning respondents); not an
        # INDIVIDUAL table column.
        "created_year": current_year,
    }

def generate_new_survey(individual, survey_year, next_survey_id):
    """Generate a SURVEY record for a first-time respondent.

    Generation rules (data sources as noted by the original author):
      * Average weekly unpaid work hours depend on sex: women 10-40,
        men 4-13, everyone else a fixed 20. [UN report]
      * Numeracy proficiency is 1 with 75% chance, literacy is 1 with
        67% chance. [FRED empirical data]
      * The poverty flag is 1 with a probability that starts at 80% in
        2019 and drops by 5 points per year, never below 50%, to show
        improvement over time. [World Bank empirical data]
      * Employment status is derived from age: under 18 is
        "below_minimum_legal_work_age", over 65 is "unemployed",
        otherwise "unemployed" with 18% chance and "employed" the rest
        of the time. [average of 5 countries, Trading Economics]

    Args:
        individual: Individual record; ``sex``, ``date_of_birth`` and
            ``personal_id`` are read.
        survey_year: Year the survey is taken.
        next_survey_id: Primary key to assign to the survey.

    Returns:
        A ``(survey, history)`` tuple. ``history`` holds the numeracy,
        literacy, poverty and avg_time values so that a later survey of
        the same respondent can build on them.
    """
    sex = individual["sex"]
    if sex == "female":
        avg_time = random.randint(10, 40)
    elif sex == "male":
        avg_time = random.randint(4, 13)
    else:
        avg_time = 20

    numeracy = 1 if random.random() < 0.75 else 0
    literacy = 1 if random.random() < 0.67 else 0

    # Poverty probability decreases by 0.05 each year from 0.8 (floor 0.5).
    prob_poverty = max(0.8 - (survey_year - 2019) * 0.05, 0.5)
    poverty = 1 if random.random() < prob_poverty else 0

    age = calculate_age(individual["date_of_birth"], survey_year)
    if age < 18:
        emp_status = "below_minimum_legal_work_age"
    elif age > 65:
        emp_status = "unemployed"
    else:
        emp_status = "unemployed" if random.random() < 0.18 else "employed"

    survey = {
        "survey_id": next_survey_id,
        "average_time_spent_on_unpaid_domestic_and_care_work": avg_time,
        "proficiency_in_numeracy": numeracy,
        "proficiency_in_literacy": literacy,
        "below_poverty_line": poverty,
        "employment_status": emp_status,
        "survey_year": survey_year,
        "takes_by_personal_id": individual["personal_id"],
    }
    history = {"numeracy": numeracy, "literacy": literacy, "poverty": poverty, "avg_time": avg_time}
    return survey, history

def generate_returning_survey(individual, prev_history, survey_year, next_survey_id):
    """Generate a SURVEY record for a returning respondent.

    Returning respondents are allowed to improve on their previous survey:
      * Average unpaid work hours are drawn afresh from the upper part of
        the range (women 30-40, men 8-13, others 20) and then reduced by
        a random amount (women 0-15, men 0-4, others 0-10), never going
        below 0. ``prev_history["avg_time"]`` is not used.
      * A respondent previously not proficient in numeracy or literacy
        has a 50% chance to become proficient; proficiency is never lost.
      * A respondent previously below the poverty line has a 50% chance
        to leave it; the flag never goes from 0 back to 1.
      * Employment status is recalculated from age with the same rules
        and labels as for new respondents.

    Args:
        individual: Individual record; ``sex``, ``date_of_birth`` and
            ``personal_id`` are read.
        prev_history: History dict from the respondent's latest survey;
            ``numeracy``, ``literacy`` and ``poverty`` are read.
        survey_year: Year the survey is taken.
        next_survey_id: Primary key to assign to the survey.

    Returns:
        A ``(survey, new_history)`` tuple; ``new_history`` replaces
        ``prev_history`` for future surveys.
    """
    sex = individual["sex"]
    if sex == "female":
        base_time = random.randint(30, 40)
        reduction = random.randint(0, 15)
    elif sex == "male":
        base_time = random.randint(8, 13)
        reduction = random.randint(0, 4)
    else:
        base_time = 20
        reduction = random.randint(0, 10)
    avg_time = max(base_time - reduction, 0)

    numeracy = 1 if (prev_history["numeracy"] == 0 and random.random() < 0.5) else prev_history["numeracy"]
    literacy = 1 if (prev_history["literacy"] == 0 and random.random() < 0.5) else prev_history["literacy"]
    poverty = 0 if (prev_history["poverty"] == 1 and random.random() < 0.5) else prev_history["poverty"]

    age = calculate_age(individual["date_of_birth"], survey_year)
    if age < 18:
        # Must match the label used for new respondents so the
        # employment_status column has a single value for this category.
        emp_status = "below_minimum_legal_work_age"
    elif age > 65:
        emp_status = "unemployed"
    else:
        emp_status = "unemployed" if random.random() < 0.18 else "employed"

    survey = {
        "survey_id": next_survey_id,
        "average_time_spent_on_unpaid_domestic_and_care_work": avg_time,
        "proficiency_in_numeracy": numeracy,
        "proficiency_in_literacy": literacy,
        "below_poverty_line": poverty,
        "employment_status": emp_status,
        "survey_year": survey_year,
        "takes_by_personal_id": individual["personal_id"],
    }
    new_history = {"numeracy": numeracy, "literacy": literacy, "poverty": poverty, "avg_time": avg_time}
    return survey, new_history

def write_individuals_to_csv(individuals, filename="individuals.csv"):
    """Write the INDIVIDUAL table to ``filename`` as UTF-8 CSV with a header.

    Only the four table columns are written; extra keys on a record, such
    as 'created_year', are ignored because the final table does not need them.
    """
    columns = ["personal_id", "date_of_birth", "sex", "lives_in_region_id"]
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns, extrasaction='ignore')
        table.writeheader()
        for record in individuals:
            table.writerow(record)

def write_surveys_to_csv(surveys, filename="surveys.csv"):
    """Write the SURVEY table to ``filename`` as UTF-8 CSV with a header.

    Every record must contain only the SURVEY columns; the DictWriter is
    left in its default mode, which rejects unknown keys.
    """
    columns = [
        "survey_id",
        "average_time_spent_on_unpaid_domestic_and_care_work",
        "proficiency_in_numeracy",
        "proficiency_in_literacy",
        "below_poverty_line",
        "employment_status",
        "survey_year",
        "takes_by_personal_id",
    ]
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        table = csv.DictWriter(out_file, fieldnames=columns)
        table.writeheader()
        for record in surveys:
            table.writerow(record)

def main():
    """Generate the INDIVIDUAL and SURVEY data and write both CSV files.

    - 200 surveys are generated per region per year (2019-2024).
    - In 2019 all surveys come from new respondents.
    - In later years 80% come from new respondents and 20% from returning
      respondents, sampled without repetition from individuals of the same
      region who were created in a previous year (for longitudinal
      tracking of progress towards the SDGs).
    - After each year, that year's new respondents join the region's pool
      of potential returnees.

    Writes "individuals.csv" and "surveys.csv" in the working directory
    and prints a one-line summary of the record counts.
    """
    regions = [
        {"region_id": "SSD001", "region_type": "urban", "region_name": "Juba", "country_id": "SOU"},
        {"region_id": "SSD002", "region_type": "urban", "region_name": "Wau", "country_id": "SOU"},
        {"region_id": "SSD003", "region_type": "rural", "region_name": "Bor", "country_id": "SOU"},
        {"region_id": "SSD004", "region_type": "rural", "region_name": "Yambio", "country_id": "SOU"},
        {"region_id": "RCB001", "region_type": "urban", "region_name": "Kinshasa", "country_id": "DEM"},
        {"region_id": "RCB002", "region_type": "urban", "region_name": "Lubumbashi", "country_id": "DEM"},
        {"region_id": "RCB003", "region_type": "rural", "region_name": "Goma", "country_id": "DEM"},
        {"region_id": "RCB004", "region_type": "rural", "region_name": "Bukavu", "country_id": "DEM"},
        {"region_id": "PSE001", "region_type": "urban", "region_name": "Ramallah", "country_id": "PAL"},
        {"region_id": "PSE002", "region_type": "urban", "region_name": "Gaza", "country_id": "PAL"},
        {"region_id": "PSE003", "region_type": "rural", "region_name": "Jenin", "country_id": "PAL"},
        {"region_id": "PSE004", "region_type": "rural", "region_name": "Tubas", "country_id": "PAL"},
        {"region_id": "YEM001", "region_type": "urban", "region_name": "Sanaa", "country_id": "YEM"},
        {"region_id": "YEM002", "region_type": "urban", "region_name": "Aden", "country_id": "YEM"},
        {"region_id": "YEM003", "region_type": "rural", "region_name": "Ibb", "country_id": "YEM"},
        {"region_id": "YEM004", "region_type": "rural", "region_name": "Taiz", "country_id": "YEM"},
        {"region_id": "LEB001", "region_type": "urban", "region_name": "Beirut", "country_id": "LEB"},
        {"region_id": "LEB002", "region_type": "urban", "region_name": "Tripoli", "country_id": "LEB"},
        {"region_id": "LEB003", "region_type": "rural", "region_name": "Baalbek", "country_id": "LEB"},
        {"region_id": "LEB004", "region_type": "rural", "region_name": "Byblos", "country_id": "LEB"}
    ]

    total_surveys = 200      # Surveys per region per year.

    individuals = []         # INDIVIDUAL records, in creation order.
    individuals_by_id = {}   # personal_id -> INDIVIDUAL record, for O(1) lookup of returnees.
    surveys = []             # SURVEY records.
    individual_history = {}  # personal_id -> latest survey history for improvement tracking.
    # region_id -> personal_ids created in previous years (the returnee pool).
    region_returnees = {region["region_id"]: [] for region in regions}

    next_personal_id = 1
    next_survey_id = 1

    for region in regions:
        reg_id = region["region_id"]
        for year in range(2019, 2025):
            if year == 2019:
                # First year: everyone is a new respondent.
                new_count = total_surveys
                returning_count = 0
            else:
                new_count = int(total_surveys * 0.8)         # 80% new respondents
                returning_count = total_surveys - new_count  # 20% from previous years

            # Generate new respondents and their surveys.
            new_ids = []
            for _ in range(new_count):
                individual = generate_individual(reg_id, next_personal_id, current_year=year)
                next_personal_id += 1
                pid = individual["personal_id"]
                individuals.append(individual)
                individuals_by_id[pid] = individual
                new_ids.append(pid)
                survey, history = generate_new_survey(individual, year, next_survey_id)
                next_survey_id += 1
                surveys.append(survey)
                individual_history[pid] = history

            # Returning respondents come only from previous years: this
            # year's new ids are added to the pool after sampling.
            eligible_returnees = region_returnees[reg_id]
            sample_count = min(returning_count, len(eligible_returnees))
            if sample_count > 0:
                for pid in random.sample(eligible_returnees, sample_count):
                    individual = individuals_by_id[pid]
                    # Every pooled id received a history when it was created.
                    prev_history = individual_history[pid]
                    survey, new_history = generate_returning_survey(individual, prev_history, year, next_survey_id)
                    next_survey_id += 1
                    surveys.append(survey)
                    individual_history[pid] = new_history

            # Add this year's new respondents to the pool for future surveys.
            region_returnees[reg_id].extend(new_ids)

    # Write the INDIVIDUAL and SURVEY tables to CSV files.
    write_individuals_to_csv(individuals, filename="individuals.csv")
    write_surveys_to_csv(surveys, filename="surveys.csv")

    print(f"Generated {len(individuals)} individuals and {len(surveys)} survey records.")

# Entry point: generate the data only when this module is run directly,
# not when it is imported.
if __name__ == '__main__':
    main()


Unnamed: 0,Equation
0,"\mu(R \mid s_1 = \text{Friendly}, s_2 = \text{..."
1,"\mu(R \mid s_1 = \text{Aloof}, s_2 = \text{Unv..."
2,"\mu(R \mid s_1 = \text{Friendly}, s_2 = \text{..."
3,"\mu(R \mid s_1 = \text{Friendly}, s_2 = \text{..."
4,"\mu(R \mid s_1 = \text{Aloof}, s_2 = \text{Cre..."
5,"\mu(R \mid s_1 = \text{Aloof}, s_2 = \text{Inc..."
