In [1]:
import pandas as pd
import numpy as np
import os
import io

In [4]:
def calculate_majority_answers(
    persona_filepath='data/2022_India_persona_groups_cleaned.csv',
    raw_data_filepath='data/2022_india_cleaned.csv',
    output_filepath='data/2022_indian_majority_answers_by_persona.csv'
):
    """
    Calculates the majority answer for each persona group and saves the result.

    Parameters
    ----------
    persona_filepath : str
        CSV of persona groups. Every column except 'Counts' is used as a
        demographic grouping key.
    raw_data_filepath : str
        CSV of raw responses. It must contain every demographic column of the
        persona file. Columns whose name contains ':' and that are not
        demographic columns are treated as question columns.
    output_filepath : str
        Destination CSV (written without the index). Its parent directory is
        created if it does not exist yet.

    Returns
    -------
    pandas.DataFrame
        The persona rows (left join, so their order and count are kept) with
        one extra column per question holding the most frequent answer of the
        matching respondents. The value is NaN when a group has no matching
        respondents or only missing answers. Ties are resolved by taking the
        first value returned by ``Series.mode()``.

    Raises
    ------
    KeyError
        If the persona file has no 'Counts' column, or the raw data lacks one
        of the demographic columns.
    """
    print("Loading data files...")
    personas_df = pd.read_csv(persona_filepath)
    raw_data_df = pd.read_csv(raw_data_filepath)

    # Identify the demographic columns to group by
    demographic_columns = personas_df.columns.drop('Counts').tolist()

    # Fail early with a readable message instead of an opaque groupby KeyError.
    missing_columns = [col for col in demographic_columns if col not in raw_data_df.columns]
    if missing_columns:
        raise KeyError(f"Raw data is missing demographic columns: {missing_columns}")

    # Identify the question columns to find the majority answer for
    demographic_lookup = set(demographic_columns)
    question_columns = [
        col for col in raw_data_df.columns
        if ':' in col and col not in demographic_lookup
    ]

    print(f"Grouping by {len(demographic_columns)} demographic columns.")
    print(f"Analyzing {len(question_columns)} question columns.")

    # This function finds the most frequent value (the mode)
    def get_majority(series):
        modes = series.mode()
        # .iloc makes the "first mode" lookup positional rather than label-based.
        return modes.iloc[0] if not modes.empty else np.nan

    # Create the aggregation rules for each question
    agg_dict = {col: get_majority for col in question_columns}

    print("Calculating majority answers for each persona group...")
    # dropna=False keeps respondents whose demographic value is missing. The
    # merge below matches NaN keys to NaN keys, so without it a persona row
    # with a blank demographic would silently receive NaN for every question.
    majority_answers_df = (
        raw_data_df
        .groupby(demographic_columns, dropna=False)
        .agg(agg_dict)
        .reset_index()
    )

    # Merge with the original personas dataframe to ensure the output matches perfectly
    final_df = pd.merge(personas_df, majority_answers_df, on=demographic_columns, how='left')

    # Save the final result, creating the target directory when needed
    output_dir = os.path.dirname(output_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    final_df.to_csv(output_filepath, index=False)
    print(f"\nSuccessfully created output file at: {output_filepath}")

    return final_df

In [5]:
# Run the pipeline with its default input and output paths.
result_df = calculate_majority_answers()

# Emit the banner and the head of the result together; with sep="\n" the
# printed text is the same as two consecutive print() calls.
print(
    "\n--- Preview of the first 5 rows of the output ---",
    result_df.head(),
    sep="\n",
)

Loading data files...
Grouping by 10 demographic columns.
Analyzing 160 question columns.
Calculating majority answers for each persona group...

Successfully created output file at: data/2022_indian_majority_answers_by_persona.csv

--- Preview of the first 5 rows of the output ---
   A_YEAR: Year of survey B_COUNTRY: ISO 3166-1 numeric country code  \
0                    2023                                      India   
1                    2023                                      India   
2                    2023                                      India   
3                    2023                                      India   
4                    2023                                      India   

  N_REGION_ISO: Region ISO 3166-2 H_URBRURAL: Urban-Rural Q260: Sex  \
0                     IN-BR Bihar                   Rural    Female   
1                     IN-BR Bihar                   Rural    Female   
2                     IN-BR Bihar                   Rural    Female   
