In [34]:
import warnings

import pandas as pd

In [None]:
def _ratings_to_numeric(series):
    """Convert Likert-style answers such as "1 - Not specific at all" to floats.

    The rating is the single digit 1-5 that *starts* the answer and is
    followed by a dash. Anchoring at the start matters: an unanchored search
    would let free text like "(1-5 scale)" inside a "5 - ..." answer be
    mistaken for a rating of 1.

    Values that are already plain numbers (3, "4") are kept as they are.
    Anything else (blank, "N/A", arbitrary text) becomes NaN instead of
    raising, so one malformed response cannot abort the whole summary.

    Parameters
    ----------
    series : pandas.Series
        Raw answers of one rating question.

    Returns
    -------
    pandas.Series
        float64 series aligned with ``series``; NaN where no rating was found.
    """
    answer_text = series.astype(str).str.strip()
    leading_digit = answer_text.str.extract(r"^([1-5])\s*-", expand=False)
    from_labels = pd.to_numeric(leading_digit, errors="coerce")
    # Fallback for cells that hold a bare number rather than "N - label".
    from_numbers = pd.to_numeric(series, errors="coerce")
    return from_labels.fillna(from_numbers).astype(float)


def process_survey_data(file_path,
                        anchor_column='Familiarity with Lane Marking Specifications:'):
    """Summarise a specification survey export, one row per specification group.

    Every column after ``anchor_column`` is taken to be a survey question.
    Questions are consumed in consecutive groups of four, in this order:
    specificity rating, clarity rating, adaptability rating, free-text
    feedback. Ratings are averaged with missing answers ignored.

    Parameters
    ----------
    file_path : str, path-like or file-like
        CSV source, passed straight to ``pandas.read_csv``.
    anchor_column : str, optional
        Header of the last column *before* the specification questions.
        Defaults to the lane-marking survey's familiarity question.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Specification Group'``, ``'Specificity Avg'``,
        ``'Clarity Avg'``, ``'Adaptability Avg'`` (rounded to 2 decimals) and
        ``'Key Feedback Themes'`` (comments joined with ``"; "``, or
        ``"No major comments"``). The columns are present even when no
        complete group exists.

    Raises
    ------
    ValueError
        If ``anchor_column`` is not one of the CSV headers.
    """
    columns_per_group = 4
    summary_columns = [
        'Specification Group',
        'Specificity Avg',
        'Clarity Avg',
        'Adaptability Avg',
        'Key Feedback Themes',
    ]

    survey_data = pd.read_csv(file_path)
    headers = survey_data.columns.tolist()

    if anchor_column not in headers:
        raise ValueError(
            f"Column {anchor_column!r} not found in the survey headers; "
            "cannot locate where the specification questions start."
        )
    spec_questions = headers[headers.index(anchor_column) + 1:]

    # Only complete groups can be summarised; a partial trailing group would
    # otherwise fail with an IndexError when its missing columns are accessed.
    complete_groups = len(spec_questions) // columns_per_group
    leftover = spec_questions[complete_groups * columns_per_group:]
    if leftover:
        warnings.warn(
            f"Ignoring {len(leftover)} trailing column(s) that do not form a "
            f"complete group of {columns_per_group}: {leftover}",
            stacklevel=2,
        )

    summary = []
    for group_number in range(complete_groups):
        first = group_number * columns_per_group
        spec_col, clarity_col, adapt_col, feedback_col = (
            spec_questions[first:first + columns_per_group]
        )

        # Series.mean() skips NaN by default, so unanswered or unparseable
        # ratings are simply excluded from the average.
        spec_avg = _ratings_to_numeric(survey_data[spec_col]).mean()
        clarity_avg = _ratings_to_numeric(survey_data[clarity_col]).mean()
        adapt_avg = _ratings_to_numeric(survey_data[adapt_col]).mean()

        # str() guards against a feedback column that pandas parsed as numeric
        # (str.join would raise TypeError); whitespace-only comments are dropped.
        comments = [
            str(comment)
            for comment in survey_data[feedback_col].dropna()
            if str(comment).strip()
        ]
        feedback_themes = "; ".join(comments) if comments else "No major comments"

        summary.append({
            'Specification Group': f"Specification {group_number + 1}",
            'Specificity Avg': round(spec_avg, 2),
            'Clarity Avg': round(clarity_avg, 2),
            'Adaptability Avg': round(adapt_avg, 2),
            'Key Feedback Themes': feedback_themes,
        })

    # Passing the column list keeps the schema stable for an empty summary.
    return pd.DataFrame(summary, columns=summary_columns)

# Example usage:
# summary_df = process_survey_data('lane_marking.csv')
# summary_df


In [36]:
# Build the per-specification summary for the lane-marking survey, then
# average each rating column over all specification groups.
summary_df = process_survey_data('lane_marking.csv')
overall_averages = summary_df.loc[
    :, ['Specificity Avg', 'Clarity Avg', 'Adaptability Avg']
].mean()

# One feedback string per specification group, gathered into a plain list.
all_feedback_themes = summary_df['Key Feedback Themes'].to_list()

In [37]:
# Display the mean of each rating column across the specification groups of
# the survey most recently passed to process_survey_data.
overall_averages

Specificity Avg     3.753333
Clarity Avg         3.618333
Adaptability Avg    3.647500
dtype: float64

In [33]:
# Display the per-group feedback strings held in `all_feedback_themes`.
# NOTE(review): this cell's execution count (In [33]) is lower than that of
# the cells before it (In [34]-In [37]), so the output shown may come from an
# earlier kernel state; re-run the notebook top to bottom to confirm.
all_feedback_themes

['If the specification is for human then it is clear. If this is for machine vision then it is not; Requirements shown only deal with detection but nothing about reaction.  ',
 'Same as before, the specification is good for human but not for machine vision; Double Yellow Lines means "double solid yellow lines"?  Will that need to be specified?  Does broken yellow line has length information?',
 'Besides width, does length need to be considered to make it a solid continuous edge?',
 'There are various designs of crosswalk marking and it should be captured somewhere in the requirement.  ',
 'Can visibility and reflectivity be combined as one requirement?',
 'Not sure what a chevron pattern is; Width is only part of the marking and other additional parameters might be needed.',
 'No major comments',
 'No definition of what "dotted" line mean?',
 'What color is the line?',
 '"Black for contrast on light-colored pavements." does this mean that black line replaces all other color lines for l

In [38]:
# Summarise the roadway-users survey.
# NOTE: this rebinds summary_df, overall_averages and all_feedback_themes, so
# the lane-marking results held under those names are no longer available
# once this cell has run.
summary_df = process_survey_data('roadway_users.csv')
overall_averages = summary_df.loc[
    :, ['Specificity Avg', 'Clarity Avg', 'Adaptability Avg']
].mean()

# One feedback string per specification group, gathered into a plain list.
all_feedback_themes = summary_df['Key Feedback Themes'].to_list()

# Last expression of the cell: rendered as the cell's output.
overall_averages

Specificity Avg     3.600667
Clarity Avg         3.611333
Adaptability Avg    3.444000
dtype: float64

In [39]:
# Display the per-group feedback strings for the roadway-users survey
# (`all_feedback_themes` was rebound from 'roadway_users.csv' in the cell before).
all_feedback_themes

['stop distance is based on decel.  Need to define what kind of braking is required.  2meter buffer is from what object to what object?  How close does the pedestrian need to be close to the shoulder to warn the driver?  After you pass the crosswalk, does "within 5 radius" still apply?  Better to define what "small size" mean.  Nothing mentioned about visibility and other factors.',
 'Does 1.5 meter apply for the condition to overtake or while overtake or after overtake?  What response is required for the "response time", braking, alert driver, etc?',
 'Is "detect" and "identify" interchangeable?  Nothing about weather or other conditions.',
 'How close to the shoulder.  How fast to reduce to 10km/h.  Nothing about weather or other conditions.',
 'What is the "following distance"?  Reaction time refers to braking?  20 meter range is for same lane of travel?  Nothing about the weather and other conditions.',
 '10 meter range is for same lane of travel?  Nothing about the weather and oth