# Activities by Employers/Companies

This section generates dummy activity data associated with previously generated company and people data.

- Links each activity to a company name loaded from `../data/companies.json`.
- Selects RSVP'd and attended participants (identified by `Youth_ID`) from `../data/people.json`.
- Produces `activities_data.json` and `activities_data.csv`, appending if they already exist.


In [1]:
# Cell 1: Imports & Constants for Activities
import json
import os
import random
import string
import csv
from datetime import datetime, timedelta
from faker import Faker
from datetime import datetime, timezone

# Shared Faker instance; the activity generator uses it for catch phrases,
# sentences, addresses and company names.
fake = Faker()

# Categories an activity can be filed under.
activity_types = [
    "Social Responsibility", "Networking", "Recruitment", "Product Launch",
    "Community Outreach", "Employee Engagement", "Workshop", "Training Session",
]

# Groups an activity can be aimed at.
target_audiences = [
    "Employees",
    "Public",
    "Partners",
    "Students",
    "Volunteers",
    "Customers",
    "Youth",
]

# How often a recurring activity repeats.
recurrence_options = [
    "Daily",
    "Weekly",
    "Monthly",
    "Annually",
    "Ad Hoc",
]

# Ways feedback can be collected; the literal string "None" means no mechanism.
feedback_mechanisms = [
    "Survey",
    "Feedback Form",
    "None",
]

# Helper functions
def pick(lst):
    """Return one randomly selected element of the sequence ``lst``."""
    chosen = random.choice(lst)
    return chosen

def random_boolean():
    """Return either ``True`` or ``False``, chosen at random."""
    outcomes = (True, False)
    return random.choice(outcomes)

def flatten_and_remove_prefix(d, parent_key='', sep='_'):
    """Flatten a nested record into a single-level dict of strings for CSV output.

    Nested dicts are walked recursively. For every leaf that lives inside a
    section (i.e. has a parent key), the first ``sep``-delimited segment of the
    joined key -- the section name -- is dropped, so
    ``{"Logistics": {"budget": 5}}`` yields ``{"budget": "5"}``.

    Top-level keys have no section prefix and are kept unchanged. Previously
    they were split on ``sep`` as well, which turned ``activity_id`` into
    ``id`` and could make unrelated keys collide. CSV files written before
    this fix therefore carry an ``id`` column instead of ``activity_id``.

    Args:
        d: Mapping to flatten.
        parent_key: Key path accumulated so far; empty for the top-level call.
        sep: Separator used to join (and later split) key segments.

    Returns:
        A dict mapping flattened keys to strings. ``None`` becomes ``""``,
        lists of dicts are JSON-encoded, and other lists are joined with
        ``", "``.
    """
    flat = {}
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k

        if isinstance(v, dict):
            flat.update(flatten_and_remove_prefix(v, new_key, sep))
            continue

        if isinstance(v, list):
            # Lists of plain values are comma-joined; lists of structures are
            # kept intact as JSON so no information is lost in the CSV cell.
            if v and isinstance(v[0], dict):
                val_str = json.dumps(v, ensure_ascii=False)
            else:
                val_str = ", ".join(map(str, v))
        else:
            val_str = "" if v is None else str(v)

        if parent_key:
            # Only nested keys carry a section prefix that needs removing.
            new_key = new_key.split(sep, 1)[-1]
        flat[new_key] = val_str
    return flat


In [7]:
# Cell 2: Load existing company and people data

# Maximum number of youth records to sample from the people file.
# When fewer records are available, all of them are used.
num_youth_to_load = 555  # Adjust this number as needed

company_filename = '../data/companies.json'
people_filename = '../data/people.json'


def load_json_records(path):
    """Return the list of records stored in the JSON file at ``path``.

    Loading is best-effort: a missing file, invalid JSON, or a top-level value
    that is not a list all produce an empty list. Unlike a silent fallback, a
    warning is printed so an empty result can be traced to its cause.
    """
    if not os.path.exists(path):
        print(f"Warning: {path} not found; continuing with no records.")
        return []
    with open(path, 'r', encoding='utf-8') as f:
        try:
            records = json.load(f)
        except json.JSONDecodeError as exc:
            print(f"Warning: {path} is not valid JSON ({exc}); continuing with no records.")
            return []
    if not isinstance(records, list):
        print(f"Warning: {path} does not contain a JSON list; continuing with no records.")
        return []
    return records


# Load companies and people
companies = load_json_records(company_filename)
people = load_json_records(people_filename)

# Sample a custom number of people, capped at what is actually available
if people:
    num_youth_to_load = min(num_youth_to_load, len(people))
    people = random.sample(people, k=num_youth_to_load)

# Company names live under the "Basic Information" section; skip blanks
company_names = [
    cname
    for cname in (c.get("Basic Information", {}).get("company_name") for c in companies)
    if cname
]

# Youth IDs of the sampled people; records without an ID are skipped
youth_ids = [yid for yid in (p.get("Youth_ID") for p in people) if yid]

print(f"Loaded {len(companies)} companies.")
print(f"Total people after sampling: {len(people)}")
print(f"Number of Youth IDs available: {len(youth_ids)}")


Loaded 250 companies.
Total people after sampling: 500
Number of Youth IDs available: 500


In [8]:
# Cell 3: Define activity generation function

def generate_activity_data(num_records=5):
    """Generate ``num_records`` dummy activity records.

    Each record links a randomly chosen entry of the module-level
    ``company_names`` to random participants drawn from the module-level
    ``youth_ids``. A record is a dict with an ``activity_id`` plus three
    sections: ``"Activity Details"``, ``"Logistics"`` and
    ``"Impact Measurement"``.

    Args:
        num_records: Number of activities to generate.

    Returns:
        A list of activity dicts, or an empty list (after printing a hint)
        when no company names or no Youth IDs are loaded.
    """
    # Without companies or participants there is nothing to link to.
    if not company_names:
        print("No companies available. Please ensure companies.json has entries.")
        return []
    if not youth_ids:
        print("No people (Youth IDs) available. Please ensure people.json has entries.")
        return []

    id_alphabet = string.ascii_uppercase + string.digits
    used_ids = set()
    activities = []
    for _ in range(num_records):

        # activity_id = UTC epoch seconds + 6 random characters. Records made
        # within the same second share the timestamp, so re-draw the suffix in
        # the (unlikely) event of a repeat within this batch.
        while True:
            unique_part = "".join(random.choices(id_alphabet, k=6))
            activity_id = f"ACT-{int(datetime.now(timezone.utc).timestamp())}-{unique_part}"
            if activity_id not in used_ids:
                used_ids.add(activity_id)
                break

        company_name = pick(company_names)
        activity_name = fake.catch_phrase()  # just a random phrase as activity name
        activity_type = pick(activity_types)
        description = fake.sentence(nb_words=15)
        target_audience = pick(target_audiences)

        # Starts 1-30 days from now and lasts 1-5 days.
        start_date = datetime.now() + timedelta(days=random.randint(1, 30))
        end_date = start_date + timedelta(days=random.randint(1, 5))

        # 50% chance the activity is virtual; otherwise use a fake street
        # address collapsed onto one line.
        if random_boolean():
            location = "Virtual (Online)"
        else:
            location = fake.address().replace("\n", ", ")

        participants_expected = random.randint(10, 500)

        # Half of the activities get 1-3 partner/sponsor companies.
        key_partners = []
        if random_boolean():
            key_partners = [fake.company() for _ in range(random.randint(1, 3))]

        activity_goals = fake.sentence(nb_words=10)

        # RSVP'd: between 1 and 20 distinct youth (fewer if not enough loaded).
        rsvp_count = random.randint(1, min(20, len(youth_ids)))
        rsvpd_list = random.sample(youth_ids, k=rsvp_count)

        # Attended: a (possibly empty) subset of those who RSVP'd.
        attended_count = random.randint(0, rsvp_count)
        attended_list = random.sample(rsvpd_list, k=attended_count)

        # Logistics
        is_recurring = random_boolean()
        frequency = pick(recurrence_options) if is_recurring else None
        budget = None
        if random_boolean():
            budget = f"${random.randint(1000,100000)}"
        resources = []
        if random_boolean():
            resources = ["Training materials", "Equipment", "Refreshments"]

        # Impact Measurement: the "None" option is stored as a real null.
        success_metrics = "Number of participants, Engagement levels"
        feedback_mechanism = pick(feedback_mechanisms)
        if feedback_mechanism == "None":
            feedback_mechanism = None

        activity_record = {
            "activity_id": activity_id,
            "Activity Details": {
                "company_name": company_name,
                "activity_name": activity_name,
                "activity_type": activity_type,
                "description": description,
                "target_audience": target_audience,
                "start_date": start_date.isoformat(),
                "end_date": end_date.isoformat(),
                "location": location,
                "number_of_participants_expected": participants_expected,
                "key_partners_sponsors": key_partners,
                "activity_goals_outcomes": activity_goals,
                "rsvpd_list": rsvpd_list,
                "attended_people": attended_list
            },
            "Logistics": {
                "is_recurring": is_recurring,
                "frequency_of_recurrence": frequency,
                "budget": budget,
                "resources_needed_provided": resources
            },
            "Impact Measurement": {
                "success_metrics": success_metrics,
                "feedback_mechanism": feedback_mechanism
            }
        }

        activities.append(activity_record)
    return activities


In [9]:
# Cell 4: Generate and append activities to files
num_records = 77
new_activities = generate_activity_data(num_records)

if new_activities:
    # --- JSON: read what is already stored, extend it, write everything back ---
    activities_json = '../data/activities_data.json'
    existing_activities = []
    if os.path.exists(activities_json):
        with open(activities_json, 'r', encoding='utf-8') as f:
            try:
                existing_activities = json.load(f)
            except json.JSONDecodeError as exc:
                # Best-effort: carry on, but say that the old content is lost.
                print(f"Warning: could not parse {activities_json} ({exc}); "
                      "it will be rewritten with the new records only.")
                existing_activities = []
    if not isinstance(existing_activities, list):
        print(f"Warning: {activities_json} did not contain a JSON list; "
              "it will be rewritten with the new records only.")
        existing_activities = []
    existing_activities.extend(new_activities)

    with open(activities_json, 'w', encoding='utf-8') as jf:
        json.dump(existing_activities, jf, indent=4, ensure_ascii=False)

    # --- CSV: flatten the new records and add them to the existing file ---
    final_records = [flatten_and_remove_prefix(a) for a in new_activities]
    activities_csv = '../data/activities_data.csv'
    fieldnames = sorted({key for record in final_records for key in record})

    # Inspect the header already on disk (if any) so appended rows line up
    # with the existing columns instead of being written positionally.
    existing_header = None
    existing_rows = []
    if os.path.exists(activities_csv) and os.path.getsize(activities_csv) > 0:
        with open(activities_csv, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            existing_header = reader.fieldnames
            if existing_header and list(existing_header) != fieldnames:
                existing_rows = list(reader)

    if existing_header and list(existing_header) == fieldnames:
        # Same columns: a plain append is safe.
        mode, write_header, rows_to_write = 'a', False, final_records
    elif existing_header:
        # Columns differ: rewrite the file with the union of both column sets,
        # keeping the existing column order first.
        fieldnames = list(existing_header) + [
            name for name in fieldnames if name not in existing_header
        ]
        for row in existing_rows:
            row.pop(None, None)  # drop overflow cells from malformed rows
        mode, write_header, rows_to_write = 'w', True, existing_rows + final_records
    else:
        # No file yet, or an empty one: start fresh with a header.
        mode, write_header, rows_to_write = 'w', True, final_records

    with open(activities_csv, mode=mode, newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        if write_header:
            writer.writeheader()
        writer.writerows(rows_to_write)

    print(f"{len(new_activities)} activity records appended to {activities_json} and {activities_csv}.")
else:
    print("No activities generated.")


77 activity records appended to ../data/activities_data.json and ../data/activities_data.csv.


In [12]:
# Cell: Analysis and Updating People Data

import json
import csv
import os

# Define a function to categorize age
def age_group(age):
    """Return the age-bracket label for ``age`` (in years).

    Brackets are "Under 18", "18-25", "26-40" and "Over 40". Upper bounds are
    exclusive on the next whole year, so a fractional age such as 25.5 stays
    in "18-25" rather than slipping through the gap between 25 and 26.
    """
    if age < 18:
        return "Under 18"
    if age < 26:
        return "18-25"
    if age < 41:
        return "26-40"
    return "Over 40"

# Load people data
people_filename = '../data/people.json'
with open(people_filename, 'r', encoding='utf-8') as f:
    people_data = json.load(f)

# Index people by Youth_ID for fast lookup. The activity lists are reset (not
# merged) because they are rebuilt from the full activities file below, which
# keeps re-running this cell from duplicating entries.
people_by_id = {}
for person in people_data:
    yid = person.get("Youth_ID")
    if yid:
        people_by_id[yid] = person
        person["activities_rsvpd"] = []
        person["activities_attended"] = []

# Load activities
activities_filename = '../data/activities_data.json'
with open(activities_filename, 'r', encoding='utf-8') as f:
    activities_data = json.load(f)

# Counters for analysis
total_rsvpd = 0
total_attended = 0

gender_count_rsvpd = {"Male": 0, "Female": 0, "Not specified": 0}
gender_count_attended = {"Male": 0, "Female": 0, "Not specified": 0}

age_groups_count_attended = {"Under 18": 0, "18-25": 0, "26-40": 0, "Over 40": 0}

# NOTE(review): hard-coded reference year for computing ages; results drift
# once the calendar year changes. Kept as-is so this cell agrees with the
# per-activity analysis cell, which uses the same value.
current_year = 2024

# Process each activity
for activity in activities_data:
    activity_id = activity.get("activity_id")
    details = activity.get("Activity Details", {})
    rsvpd_list = details.get("rsvpd_list", [])
    attended_list = details.get("attended_people", [])

    # Totals count every listed ID, including IDs not found in people.json.
    total_rsvpd += len(rsvpd_list)
    total_attended += len(attended_list)

    # RSVP'd: record the activity on the person and tally gender.
    for yid in rsvpd_list:
        person = people_by_id.get(yid)
        if not person:
            continue
        person["activities_rsvpd"].append(activity_id)
        gender = person.get("PII", {}).get("gender", "Not specified")
        gender_count_rsvpd[gender] = gender_count_rsvpd.get(gender, 0) + 1

    # Attended: record the activity, tally gender and age group.
    for yid in attended_list:
        person = people_by_id.get(yid)
        if not person:
            continue
        person["activities_attended"].append(activity_id)
        gender = person.get("PII", {}).get("gender", "Not specified")
        gender_count_attended[gender] = gender_count_attended.get(gender, 0) + 1

        # People without a year of birth are left out of the age breakdown.
        yob = person.get("PII", {}).get("year_of_birth")
        if yob:
            age_groups_count_attended[age_group(current_year - yob)] += 1

# Determine the major age group of attendees. With no aged attendees at all,
# max() would just return the first bucket, so report None instead.
if any(age_groups_count_attended.values()):
    major_age_group = max(age_groups_count_attended, key=age_groups_count_attended.get)
else:
    major_age_group = None

# Print analysis results
print("Analysis Results:")
print(f"Total RSVPd: {total_rsvpd}")
print(f"Total Attended: {total_attended}")
print("Gender breakdown for RSVPd:", gender_count_rsvpd)
print("Gender breakdown for Attended:", gender_count_attended)
print("Attended Age Groups:", age_groups_count_attended)
print(f"Major Age Group of Attendees: {major_age_group}")

# Update people.json with the new fields (activities_rsvpd, activities_attended)
with open(people_filename, 'w', encoding='utf-8') as f:
    json.dump(people_data, f, indent=4, ensure_ascii=False)

# Also update people.csv to include these columns. The path uses the same
# ../data directory as every other file in this notebook.
people_csv = '../data/people.csv'
file_exists = os.path.exists(people_csv)
if file_exists:
    # The CSV is regenerated from people_data: every record is re-flattened
    # and the file is rewritten with the union of all resulting columns.

    def flatten_person(person):
        """Flatten one person record: nested keys are joined with '_',
        lists are comma-joined, and None becomes an empty string."""
        items = {}

        def recurse(d, prefix=""):
            for k, v in d.items():
                new_key = f"{prefix}_{k}" if prefix else k
                if isinstance(v, dict):
                    recurse(v, new_key)
                elif isinstance(v, list):
                    items[new_key] = ", ".join(map(str, v))
                else:
                    items[new_key] = v if v is not None else ""

        recurse(person)
        return items

    # Flatten all people
    flattened_people = [flatten_person(p) for p in people_data]

    # Collect fieldnames from flattened data
    all_fields = sorted({key for fp in flattened_people for key in fp})

    # Write updated CSV
    with open(people_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=all_fields, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(flattened_people)

    print("people.csv updated with activities_rsvpd and activities_attended columns.")
else:
    print("people.csv not found, no CSV update performed.")


Analysis Results:
Total RSVPd: 799
Total Attended: 395
Gender breakdown for RSVPd: {'Male': 248, 'Female': 315, 'Not specified': 236}
Gender breakdown for Attended: {'Male': 124, 'Female': 146, 'Not specified': 125}
Attended Age Groups: {'Under 18': 0, '18-25': 46, '26-40': 103, 'Over 40': 246}
Major Age Group of Attendees: Over 40
people.csv not found, no CSV update performed.


In [13]:
# Cell: Save Per-Activity Analysis to a JSON File

import json

# Reference year subtracted from each attendee's year_of_birth to get an age.
# NOTE(review): hard-coded, so computed ages drift as the calendar year
# changes -- confirm whether this should track the current date.
current_year = 2024

def age_group(age):
    """Return the age-bracket label for ``age`` (in years).

    Brackets are "Under 18", "18-25", "26-40" and "Over 40". Upper bounds are
    exclusive on the next whole year, so a fractional age such as 25.5 stays
    in "18-25" rather than slipping through the gap between 25 and 26.
    """
    if age < 18:
        return "Under 18"
    if age < 26:
        return "18-25"
    if age < 41:
        return "26-40"
    return "Over 40"

# Input files and the destination of the per-activity analysis.
activities_filename = '../data/activities_data.json'
people_filename = '../data/people.json'
analysis_filename = '../data/activity_analysis.json'

# Load people data
with open(people_filename, 'r', encoding='utf-8') as f:
    people_data = json.load(f)

# Lookup table: Youth_ID -> person record (records without the key are skipped)
people_by_id = {}
for entry in people_data:
    if "Youth_ID" in entry:
        people_by_id[entry["Youth_ID"]] = entry

# Load activities
with open(activities_filename, 'r', encoding='utf-8') as f:
    activities_data = json.load(f)


def _known_people(youth_id_list):
    """Yield the person record for each listed ID that has a non-empty
    entry in ``people_by_id``; other IDs are skipped."""
    for youth_id in youth_id_list:
        match = people_by_id.get(youth_id)
        if match:
            yield match


def _gender_breakdown(youth_id_list):
    """Count known people per gender, starting from the three default labels."""
    counts = {"Male": 0, "Female": 0, "Not specified": 0}
    for member in _known_people(youth_id_list):
        label = member.get("PII", {}).get("gender", "Not specified")
        counts[label] = counts.get(label, 0) + 1
    return counts


def _age_breakdown(youth_id_list):
    """Count known people per age bracket; people without a year of birth
    are left out."""
    buckets = {"Under 18": 0, "18-25": 0, "26-40": 0, "Over 40": 0}
    for member in _known_people(youth_id_list):
        birth_year = member.get("PII", {}).get("year_of_birth")
        if birth_year:
            buckets[age_group(current_year - birth_year)] += 1
    return buckets


activity_analysis_data = {}

for activity in activities_data:
    details = activity.get("Activity Details", {})
    rsvp_ids = details.get("rsvpd_list", [])
    attended_ids = details.get("attended_people", [])

    age_buckets = _age_breakdown(attended_ids)

    # The dominant bracket is only meaningful when someone was counted.
    if any(age_buckets.values()):
        dominant_bracket = max(age_buckets, key=age_buckets.get)
    else:
        dominant_bracket = None

    activity_analysis_data[activity.get("activity_id")] = {
        "activity_name": details.get("activity_name"),
        "total_rsvpd": len(rsvp_ids),
        "total_attended": len(attended_ids),
        "gender_breakdown_rsvpd": _gender_breakdown(rsvp_ids),
        "gender_breakdown_attended": _gender_breakdown(attended_ids),
        "age_groups_attended": age_buckets,
        "major_age_group_attendees": dominant_bracket
    }

# Save the per-activity analysis
with open(analysis_filename, 'w', encoding='utf-8') as f:
    json.dump(activity_analysis_data, f, indent=4, ensure_ascii=False)

print(f"Per-activity analysis saved to {analysis_filename}")


Per-activity analysis saved to ../data/activity_analysis.json


In [15]:
# Cell: User Input to Query Activity Details and Analysis

import json

activities_filename = '../data/activities_data.json'
analysis_filename = '../data/activity_analysis.json'

# Read the stored activities and their precomputed analysis.
with open(activities_filename, 'r', encoding='utf-8') as f:
    activities_data = json.load(f)

with open(analysis_filename, 'r', encoding='utf-8') as f:
    activity_analysis_data = json.load(f)

# Ask which activity to show; the name must match exactly (after trimming).
search_name = input("Enter activity name/title to search: ").strip()

found_activities = []
for candidate in activities_data:
    if candidate.get("Activity Details", {}).get("activity_name") == search_name:
        found_activities.append(candidate)

if found_activities:
    # When several activities share the name, the first one is shown.
    activity = found_activities[0]
    activity_id = activity.get("activity_id")

    details = activity.get("Activity Details", {})
    analysis = activity_analysis_data.get(activity_id, {})

    # (label, key) pairs printed verbatim from the details section.
    detail_fields = (
        ("Name", "activity_name"),
        ("Type", "activity_type"),
        ("Description", "description"),
        ("Target Audience", "target_audience"),
        ("Start Date", "start_date"),
        ("End Date", "end_date"),
        ("Location", "location"),
        ("Number of Participants Expected", "number_of_participants_expected"),
        ("Key Partners/Sponsors", "key_partners_sponsors"),
        ("Activity Goals/Outcomes", "activity_goals_outcomes"),
    )

    print("\nActivity Details:")
    print(f"ID: {activity_id}")
    for label, key in detail_fields:
        print(f"{label}: {details.get(key)}")
    print(f"RSVP'd List Count: {len(details.get('rsvpd_list', []))}")
    print(f"Attended List Count: {len(details.get('attended_people', []))}")

    # Analysis is optional: an activity may not have been analysed yet.
    if analysis:
        analysis_fields = (
            ("Total RSVPd", "total_rsvpd"),
            ("Total Attended", "total_attended"),
            ("Gender breakdown for RSVPd", "gender_breakdown_rsvpd"),
            ("Gender breakdown for Attended", "gender_breakdown_attended"),
            ("Attended Age Groups", "age_groups_attended"),
            ("Major Age Group of Attendees", "major_age_group_attendees"),
        )
        print("\nActivity Analysis:")
        for label, key in analysis_fields:
            print(f"{label}: {analysis.get(key)}")
    else:
        print("\nNo analysis data found for this activity.")
else:
    print("No activity found with that name.")



Activity Details:
ID: ACT-1737152194-RW1Z79
Name: Cross-group needs-based archive
Type: Networking
Description: Assume glass within budget fly set whom discuss safe nor hope.
Target Audience: Students
Start Date: 2025-02-16T01:16:34.794345
End Date: 2025-02-21T01:16:34.794345
Location: Virtual (Online)
Number of Participants Expected: 168
Key Partners/Sponsors: []
Activity Goals/Outcomes: Particular including change guess evidence clearly close child if compare.
RSVP'd List Count: 9
Attended List Count: 1

Activity Analysis:
Total RSVPd: 9
Total Attended: 1
Gender breakdown for RSVPd: {'Male': 0, 'Female': 4, 'Not specified': 5}
Gender breakdown for Attended: {'Male': 0, 'Female': 1, 'Not specified': 0}
Attended Age Groups: {'Under 18': 0, '18-25': 0, '26-40': 1, 'Over 40': 0}
Major Age Group of Attendees: 26-40
