In [1]:
import os
import pandas as pd
from typing import Dict, List
import xml.etree.ElementTree as ET

In [2]:
import survey_parser as SP

In [3]:
# File names of the two survey exports and of the metadata description.
online_csv_name = 'Online Table.csv'
in_person_csv_name = 'Inperson Table.csv'
metadata_file_name = 'metadata.xml'

# Directory holding the dataset files. The default is the original
# machine-specific location; set the SURVEY_DATA_DIR environment variable to
# point the notebook at another copy of the dataset without editing this cell.
# An unset or empty variable falls back to the default.
directory = os.environ.get('SURVEY_DATA_DIR') or (
    r'C:\Users\clark.roll\python coding\code_personal\Data512\Course Project\Part 2\dataset'
)

# Full paths for each file
online_path = os.path.join(directory, online_csv_name)
in_person_path = os.path.join(directory, in_person_csv_name)
metadata_path = os.path.join(directory, metadata_file_name)

In [4]:
# Parse the survey metadata file; the parser's result unpacks into the
# question-level and option-level metadata used by the cells below.
(metadata_questions,
 metadata_options) = SP.parse_survey_metadata(metadata_path)

# DataFrame view of the option metadata, passed later to the column renamer.
meta_opts_df = pd.DataFrame(data=metadata_options)

In [5]:
# Maps each "Question N" prefix (N = 2..29) to a descriptive snake_case
# column name.
# NOTE(review): the processing loop further down reads
# SP.QUESTION_COL_NAME_MAP rather than this notebook-level dict - confirm the
# two mappings agree, or drop one of them.
QUESTION_COL_NAME_MAP = {
    f'Question {number}': column_name
    for number, column_name in (
        (2, 'age'),
        (3, 'gender'),
        (4, 'race'),
        (5, 'zip_code'),
        (6, 'education_level'),
        (7, 'income'),
        (8, 'general_health_status'),
        (9, 'outside_activity_engagement'),
        (10, 'outside_activity_frequency'),
        (11, 'air_quality_notification_received'),
        (12, 'seek_air_quality_info'),
        (13, 'info_source_for_smoke_notifications'),
        (14, 'days_checked_for_smoke_info'),
        (15, 'reduced_outdoor_activities_due_to_smoke'),
        (16, 'consecutive_days_reduced_outdoor_activity'),
        (17, 'min_aqi_reduce_activity'),
        (18, 'min_aqi_eliminate_activity'),
        (19, 'motivating_info_to_reduce_outdoor_activity'),
        (20, 'motivating_message_type_for_mitigation'),
        (21, 'motivating_message_content'),
        (22, 'preferred_warning_timing'),
        (23, 'future_mitigation_actions'),
        (24, 'perception_of_smoke_as_hazard'),
        (25, 'compare_smoke_with_other_disasters'),
        (26, 'consider_evacuating_due_to_smoke'),
        (27, 'smoke_related_health_experience'),
        (28, 'symptoms_during_smoke_event'),
        (29, 'mitigation_strategies_for_health_issues'),
    )
}

In [6]:
# Read each survey export and tag every row with how it was collected, so the
# two sources stay distinguishable after they are stacked.
in_person_df = SP.load_and_structure_survey_csv(in_person_path).assign(survey_type='in_person')
online_df = SP.load_and_structure_survey_csv(online_path).assign(survey_type='online')

# Stack in-person rows first, then online rows, under a fresh 0..n-1 index.
combined_df = pd.concat((in_person_df, online_df), ignore_index=True)

# Run the metadata mapping on a copy, so `combined_df` itself is never handed
# to the mapper.
combined_df_mapped = SP.apply_metadata_mapping(
    combined_df.copy(), metadata_questions, metadata_options
)

# Last expression of the cell: render the mapped frame for inspection.
combined_df_mapped

Unnamed: 0,participant_id,Question 2 Option 1,Question 3 Option 1,Question 3 Option 2,Question 3 Option 3,Question 4 Option 1,Question 4 Option 2,Question 4 Option 3,Question 4 Option 4,Question 4 Option 5,...,Question 28 Option 15,Question 29 Option 1,Question 29 Option 2,Question 29 Option 3,Question 29 Option 4,Question 29 Option 5,Question 29 Option 6,Question 29 Option 7,Question 29 Option 8,survey_type
0,Participant 1,62.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,in_person
1,Participant 2,62.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,in_person
2,Participant 3,25.0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,in_person
3,Participant 4,53.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,in_person
4,Participant 5,44.0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,in_person
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2355,Participant 1742,39.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,online
2356,Participant 1743,48.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,online
2357,Participant 1744,21.0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,online
2358,Participant 1745,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,online


In [7]:
# Step 2: Split into individual question dataframes
# The result is consumed by the processing cell below via `.items()`, as
# (question prefix, dataframe) pairs.
split_questions = SP.split_all_questions(combined_df_mapped)

In [11]:
# Build one processed frame per mapped question, keyed by its question prefix.
binary_questions = SP.identify_binary_questions(split_questions)
processed_question_dfs = {}

for question_prefix, question_df in split_questions.items():
    # Questions without an entry in the column-name map are reported and skipped.
    if question_prefix not in SP.QUESTION_COL_NAME_MAP:
        print(f"Skipping question: {question_prefix}")
        continue

    # Rename the option columns using the option metadata frame.
    renamed_df = SP.rename_option_columns(question_df, meta_opts_df)
    new_column_name = SP.QUESTION_COL_NAME_MAP[question_prefix]

    if question_prefix in binary_questions:
        # Binary questions: collapse their columns into one column that
        # carries the mapped name.
        processed_df = SP.collapse_binary_columns_to_single(renamed_df, new_column_name)
    else:
        # Non-binary questions: only the LAST column of the renamed frame is
        # given the mapped name; every other column is left as-is.
        # NOTE(review): this relies on the answer column being last - confirm
        # against SP.split_all_questions / SP.rename_option_columns.
        processed_df = renamed_df.rename(
            columns={renamed_df.columns[-1]: new_column_name}
        )

    processed_question_dfs[question_prefix] = processed_df

In [12]:
# Step 4: Combine selected questions into a demographics dataframe
demographic_questions = [
    "Question 2", "Question 3", "Question 4",
    "Question 5", "Question 6", "Question 7",
]

# Start from the first available question frame and outer-merge each further
# one onto it. Both `participant_id` and `survey_type` form the join key.
# `demographic_df` stays None if none of the listed questions were processed.
demographic_df = None
for question in demographic_questions:
    if question not in processed_question_dfs:
        # `question` already reads "Question N", so it is not prefixed again
        # (the previous message printed "Question Question N ...").
        print(f"{question} not found in processed data.")
        continue

    if demographic_df is None:
        demographic_df = processed_question_dfs[question]
    else:
        demographic_df = demographic_df.merge(
            processed_question_dfs[question],
            on=["participant_id", "survey_type"],
            how="outer",
        )

In [16]:
# Questions whose processed frames are combined into the activity dataframe.
activity_questions = ['Question 9', 'Question 10', 'Question 14', 'Question 15',
                      'Question 16', 'Question 17', 'Question 18', 'Question 19']

# Start from the first available question frame and outer-merge each further
# one onto it. Both `participant_id` and `survey_type` form the join key.
# `activity_df` stays None if none of the listed questions were processed.
activity_df = None
for question in activity_questions:
    if question not in processed_question_dfs:
        # `question` already reads "Question N", so it is not prefixed again
        # (the previous message printed "Question Question N ...").
        print(f"{question} not found in processed data.")
        continue

    if activity_df is None:
        activity_df = processed_question_dfs[question]
    else:
        activity_df = activity_df.merge(
            processed_question_dfs[question],
            on=["participant_id", "survey_type"],
            how="outer",
        )

In [18]:
# Render the merged activity dataframe built by the activity merge cell.
activity_df

Unnamed: 0,participant_id,survey_type,outside_activity_engagement,outside_activity_frequency,days_checked_for_smoke_info,reduced_outdoor_activities_due_to_smoke,consecutive_days_reduced_outdoor_activity,min_aqi_reduce_activity,min_aqi_eliminate_activity,motivating_info_to_reduce_outdoor_activity
0,Participant 1,in_person,Yes – Please list the activities _____________...,Daily,0 days,No,0 days,I am not familiar with this rating,I am not familiar with this rating,Your own visual observation (seeing the smoke ...
1,Participant 1,online,Yes – Please list the activities _____________...,Daily,6-7 days,Yes,1 to 2 days,Red – Unhealthy,Purple – Very Unhealthy,Smoke-related health problem statistics
2,Participant 10,in_person,Yes – Please list the activities _____________...,Daily,,No,0 days,I am not familiar with this rating,Maroon – Hazardous,Your own visual observation (seeing the smoke ...
3,Participant 10,online,Yes – Please list the activities _____________...,Daily,6-7 days,Yes,3 days,Orange – Unhealthy for Sensitive Groups,Red – Unhealthy,Your own visual observation (seeing the smoke ...
4,Participant 100,in_person,Yes – Please list the activities _____________...,Daily,,No,0 days,I am not familiar with this rating,I am not familiar with this rating,Your own visual observation (seeing the smoke ...
...,...,...,...,...,...,...,...,...,...,...
2355,Participant 995,online,Yes – Please list the activities _____________...,Once per week,4-5 days,Yes,6 days and more,Orange – Unhealthy for Sensitive Groups,Orange – Unhealthy for Sensitive Groups,Air quality information
2356,Participant 996,online,No (skip to question 11),,,Yes,3 days,Red – Unhealthy,Maroon – Hazardous,Your own visual observation (seeing the smoke ...
2357,Participant 997,online,Yes – Please list the activities _____________...,A few times per week,6-7 days,No,0 days,Purple – Very Unhealthy,Maroon – Hazardous,Air quality information
2358,Participant 998,online,Yes – Please list the activities _____________...,Once per week,0 days,Yes,3 days,Orange – Unhealthy for Sensitive Groups,Red – Unhealthy,Air quality information


In [13]:
# Render the merged demographics dataframe built by the demographics merge cell.
demographic_df

Unnamed: 0,participant_id,survey_type,age,gender,race,zip_code,education_level,income
0,Participant 1,in_person,62.0,A woman,White / Caucasian,83676.0,"Some college, no degree","$75,000 to $99,999"
1,Participant 1,online,40.0,A woman,,83703.0,Associates degree,"$50,000 to $74,999"
2,Participant 10,in_person,30.0,A woman,White / Caucasian,83706.0,"High school graduate, diploma or GED","$25,000 to $49,999"
3,Participant 10,online,42.0,A woman,White / Caucasian,83702.0,"Some college, no degree","$25,000 to $49,999"
4,Participant 100,in_person,25.0,A man,White / Caucasian,,"Some college, no degree","$50,000 to $74,999"
...,...,...,...,...,...,...,...,...
2355,Participant 995,online,54.0,A man,White / Caucasian,83709.0,Associates degree,"$100,000 or more"
2356,Participant 996,online,22.0,A woman,White / Caucasian,83706.0,"Some college, no degree","$25,000 to $49,999"
2357,Participant 997,online,62.0,A man,White / Caucasian,83706.0,Associates degree,
2358,Participant 998,online,18.0,A woman,White / Caucasian,83704.0,8th grade or less,"$50,000 to $74,999"


In [None]:
# Column names grouped under the air-quality / smoke-information theme.
air_quality_questions = [
    'air_quality_notification_received',
    'seek_air_quality_info',
    'info_source_for_smoke_notifications',
    'days_checked_for_smoke_info',
    'reduced_outdoor_activities_due_to_smoke',
    'consecutive_days_reduced_outdoor_activity',
    'min_aqi_reduce_activity',
    'min_aqi_eliminate_activity',
    'motivating_info_to_reduce_outdoor_activity',
    'motivating_message_type_for_mitigation',
    'motivating_message_content',
    'preferred_warning_timing',
]

In [None]:
# Column names grouped under the health / outdoor-activity theme.
# NOTE(review): this rebinds `activity_questions`, which the activity merge
# cell assigns to a list of "Question N" keys - consider a distinct name.
activity_questions = [
    'general_health_status',
    'outside_activity_engagement',
    'outside_activity_frequency',
]

In [None]:
# Column names covering outdoor activity and the smoke-related change in it.
# NOTE(review): the next cell rebinds `air_activity_questions` to a list of
# "Question N" keys, replacing this value.
air_activity_questions = [
    'outside_activity_engagement',
    'outside_activity_frequency',
    'reduced_outdoor_activities_due_to_smoke',
    'consecutive_days_reduced_outdoor_activity',
    'min_aqi_reduce_activity',
    'min_aqi_eliminate_activity',
    'motivating_info_to_reduce_outdoor_activity',
]

In [None]:
# "Question N" keys for the outdoor-activity / smoke-response questions.
# NOTE(review): this rebinds `air_activity_questions`, replacing the list of
# column names assigned in the preceding cell.
air_activity_questions = [
    'Question 9',
    'Question 10',
    'Question 14',
    'Question 15',
    'Question 16',
    'Question 17',
    'Question 18',
    'Question 19',
]