In [30]:
import pandas as pd
wvs_filepath='data/2022_india_cleaned.csv'
df = pd.read_csv(wvs_filepath)

In [33]:
import pandas as pd
import numpy as np
import json
import os
import io

def process_questions_config(filepath='data/questions.json'):
    # (This helper function remains the same)
    with open(filepath, 'r') as f:
        questions_data = json.load(f)
    answer_mappings = {}
    num_options_map = {}
    for qid, details in questions_data.items():
        if details.get("scale", False):
            num_options_map[qid] = 10
            answer_mappings[qid] = {details["options"][0]: 1, details["options"][-1]: 10}
        else:
            valid_options = [opt for opt in details["options"] if opt.lower() != "don't know"]
            num_options_map[qid] = len(valid_options)
            answer_mappings[qid] = {option: i + 1 for i, option in enumerate(valid_options)}
    return answer_mappings, num_options_map

def analyze_survey_alignment(
    wvs_filepath='data/2022_indian_majority_answers_by_persona.csv',
    gemma_filepath='survey_answers_wide.csv',
    questions_filepath='data/questions.json',
    metric_type='soft',
    region_wise=False 
):
    """
    Loads, aligns, and calculates survey metrics, either for the whole dataset or region-wise.
    """
    if metric_type not in ['hard', 'soft']:
        raise ValueError("Metric type must be either 'hard' or 'soft'")

    # --- Data Loading and Standardization ---
    answer_mappings_by_q, num_options_map = process_questions_config(questions_filepath)
    flat_answer_mapping = {}
    for q_map in answer_mappings_by_q.values():
        flat_answer_mapping.update(q_map)

    wvs_df = pd.read_csv(wvs_filepath)
    gemma_df = pd.read_csv(gemma_filepath)
    
    #print(wvs_df['N_REGION_ISO: Region ISO 3166-2'].unique())
    
    #print(gemma_df['region'].unique())
    

    print("Standardizing WVS column names...")
    rename_map = {col: col.split(':')[0].strip() for col in wvs_df.columns if ':' in col}
    wvs_df.rename(columns=rename_map, inplace=True)

    demographic_mapping = {
        'N_REGION_ISO': 'region', 'H_URBRURAL': 'urban_rural', 'X003R': 'age',
        'Q260': 'gender', 'Q272': 'language', 'Q273': 'marital_status',
        'Q275R': 'education_level', 'Q287': 'social_class'
    }
    wvs_df.rename(columns=demographic_mapping, inplace=True)

    #print(wvs_df['region'].unique())

    not_scale_questions = ["Q42", "Q90", "Q149", "Q150", "Q151"]
    demographic_cols = list(demographic_mapping.values())
    selected_questions = [q for q in gemma_df.columns if q.startswith('Q')]

    print("Converting text answers to numeric codes...")
    for df in [wvs_df, gemma_df]:
        for col in selected_questions:
            if col in df.columns:
                df[col] = df[col].apply(
                    lambda x: flat_answer_mapping.get(str(x).strip(), x) if isinstance(x, str) else x
                )

    # --- Persona Alignment ---
    merge_cols = [col for col in demographic_cols if col in gemma_df.columns and col in wvs_df.columns]
    print(f"Aligning survey and model data on columns: {merge_cols}")
    merged_df = pd.merge(wvs_df, gemma_df, on=merge_cols, how='inner')
    print(f"Found {len(merged_df)} matching personas between the two datasets.")

    if len(merged_df) == 0:
        print("No matching personas found.")
        return {}

    # --- Metric Calculation ---

    # NEW: Main conditional block for region-wise vs. complete analysis
    if region_wise:
        print("\nCalculating metrics region-wise...")
        results_by_region = {}
        # Ensure the 'region' column exists before grouping
        if 'region' not in merged_df.columns:
            raise ValueError("Region column not found in merged data. Cannot perform region-wise analysis.")
            
        unique_regions = merged_df['region'].unique()
        #print(unique_regions)
        
        for region in unique_regions:
            region_df = merged_df[merged_df['region'] == region]
            
            hard_metric_scores, soft_metric_scores = [], []
            for q in selected_questions:
                survey_col, gemma_col = f"{q}_x", f"{q}_y"
                if survey_col not in region_df.columns or gemma_col not in region_df.columns:
                    continue
                
                # Perform calculation on the region-specific dataframe
                survey_answers = pd.to_numeric(region_df[survey_col], errors='coerce')
                model_answers = pd.to_numeric(region_df[gemma_col], errors='coerce')
                valid_indices = (survey_answers.notna()) & (model_answers.notna()) & (survey_answers >= 0)
                if not valid_indices.any(): continue
                
                survey_answers = survey_answers[valid_indices]
                model_answers = model_answers[valid_indices]

                # Metric Logic (same as before)
                if metric_type == 'hard':
                    scores = (survey_answers == model_answers).astype(int)
                    hard_metric_scores.extend(scores)
                else:
                    if q in not_scale_questions:
                        scores = (survey_answers == model_answers).astype(int)
                        soft_metric_scores.extend(scores)
                    else:
                        num_options = num_options_map.get(q)
                        if not num_options or num_options <= 1: continue
                        error = np.abs(survey_answers - model_answers)
                        normalized_error = error / (num_options - 1)
                        scores = 1 - normalized_error
                        soft_metric_scores.extend(scores)
            
            # Store results for the current region
            region_results = {}
            if hard_metric_scores: region_results['hard_metric'] = np.mean(hard_metric_scores)
            if soft_metric_scores: region_results['soft_metric_unified'] = np.mean(soft_metric_scores)
            results_by_region[region] = region_results
            
        return results_by_region

    else: # MODIFIED: Original logic is now in the else block
        print(f"\nCalculating metrics for complete dataset with mode: '{metric_type}'...")
        hard_metric_scores, soft_metric_scores = [], []
        for q in selected_questions:
            # (Calculation logic is the same as the inner loop above, but on merged_df)
            survey_col, gemma_col = f"{q}_x", f"{q}_y"
            if survey_col not in merged_df.columns or gemma_col not in merged_df.columns: continue
            survey_answers = pd.to_numeric(merged_df[survey_col], errors='coerce')
            model_answers = pd.to_numeric(merged_df[gemma_col], errors='coerce')
            valid_indices = (survey_answers.notna()) & (model_answers.notna()) & (survey_answers >= 0)
            if not valid_indices.any(): continue
            survey_answers = survey_answers[valid_indices]
            model_answers = model_answers[valid_indices]

            if metric_type == 'hard':
                scores = (survey_answers == model_answers).astype(int)
                hard_metric_scores.extend(scores)
            else:
                if q in not_scale_questions:
                    scores = (survey_answers == model_answers).astype(int)
                    soft_metric_scores.extend(scores)
                else:
                    num_options = num_options_map.get(q)
                    if not num_options or num_options <= 1: continue
                    error = np.abs(survey_answers - model_answers)
                    normalized_error = error / (num_options - 1)
                    scores = 1 - normalized_error
                    soft_metric_scores.extend(scores)

        results = {}
        if hard_metric_scores: results['hard_metric'] = np.mean(hard_metric_scores)
        if soft_metric_scores: results['soft_metric_unified'] = np.mean(soft_metric_scores)
        if not results: print("\nNo scores were calculated.")
        return results

In [34]:
results_soft = analyze_survey_alignment(region_wise=True)
print("\n--- FINAL RESULTS (Region-Wise) ---")
print(json.dumps(results_soft, indent=4))

Standardizing WVS column names...
Converting text answers to numeric codes...
Aligning survey and model data on columns: ['region', 'urban_rural', 'age', 'gender', 'language', 'marital_status', 'education_level', 'social_class']
Found 74 matching personas between the two datasets.

Calculating metrics region-wise...

--- FINAL RESULTS (Region-Wise) ---
{
    "IN-BR Bihar": {
        "soft_metric_unified": 0.49566531456295243
    },
    "IN-DL Delhi": {
        "soft_metric_unified": 0.5384818691193574
    },
    "IN-HR Haryana": {
        "soft_metric_unified": 0.5542461653572766
    },
    "IN-MH Maharashtra": {
        "soft_metric_unified": 0.6021673291690327
    },
    "IN-PB Punjab": {
        "soft_metric_unified": 0.5697480267152399
    },
    "IN-TG Telangana": {
        "soft_metric_unified": 0.5008028259473346
    },
    "IN-UP Uttar Pradesh": {
        "soft_metric_unified": 0.5821424873149011
    },
    "IN-WB West Bengal": {
        "soft_metric_unified": 0.579252577319587

In [35]:
results_hard = analyze_survey_alignment(metric_type='hard',  region_wise=True)
print("\n--- FINAL RESULTS (Region-Wise) ---")
print(json.dumps(results_hard, indent=4))


Standardizing WVS column names...
Converting text answers to numeric codes...
Aligning survey and model data on columns: ['region', 'urban_rural', 'age', 'gender', 'language', 'marital_status', 'education_level', 'social_class']
Found 74 matching personas between the two datasets.

Calculating metrics region-wise...

--- FINAL RESULTS (Region-Wise) ---
{
    "IN-BR Bihar": {
        "hard_metric": 0.3185397279885469
    },
    "IN-DL Delhi": {
        "hard_metric": 0.3691722169362512
    },
    "IN-HR Haryana": {
        "hard_metric": 0.3400673400673401
    },
    "IN-MH Maharashtra": {
        "hard_metric": 0.3577512776831346
    },
    "IN-PB Punjab": {
        "hard_metric": 0.34972677595628415
    },
    "IN-TG Telangana": {
        "hard_metric": 0.3815028901734104
    },
    "IN-UP Uttar Pradesh": {
        "hard_metric": 0.37698042870456666
    },
    "IN-WB West Bengal": {
        "hard_metric": 0.34536082474226804
    }
}
