In [None]:
import json
with open('data/qsns_mapping.json', 'r') as f:
    qsns_mapping_data = json.load(f)
qsns_mapping_1 = qsns_mapping_data['2012']
qsns_mapping_2 = qsns_mapping_data['2006']

with open('data/chosen_cols_updated.json', 'r') as f:
    chosen_cols = json.load(f)['chosen_cols']

qsns_to_take = [q for q, k in chosen_cols.items() if k == True and (q in qsns_mapping_1.values() and q in qsns_mapping_2.values())]
print(len(qsns_to_take))


FileNotFoundError: [Errno 2] No such file or directory: 'data/chosen_cols_col.json'

In [38]:
import pandas as pd
import numpy as np
import json


def process_questions_config(filepath='data/questions.json'):
    """Process the questions configuration file to create answer mappings and option counts."""
    with open(filepath, 'r') as f:
        questions_data = json.load(f)
    answer_mappings = {}
    num_options_map = {}
    for qid, details in questions_data.items():
        if details.get("scale", False):
            num_options_map[qid] = 10
            answer_mappings[qid] = {details["options"][0]: 1, details["options"][-1]: 10}
        else:
            valid_options = [opt for opt in details["options"] if opt.lower() != "don't know"]
            num_options_map[qid] = len(valid_options)
            answer_mappings[qid] = {option: i + 1 for i, option in enumerate(valid_options)}
    return answer_mappings, num_options_map


def get_demographic_mapping(year='2022'):
    """Get demographic column mapping for a given year."""
    with open("data/chosen_cols_updated.json", "r") as f:
        persona_cols_json = json.load(f)

    if year not in persona_cols_json['persona_cols']:
        raise ValueError(f"No demographic mapping found for year {year}")
    
    year_mapping = persona_cols_json['persona_cols'][year]
    
    demographic_mapping = {}
    for key, val in year_mapping.items():
        col_name = val.split(":")[0].strip()
        if key == 'sex':
            final_key = 'gender'
        elif key == 'education':
            final_key = 'education_level'
        else:
            final_key = key
        demographic_mapping[col_name] = final_key
    return demographic_mapping


def analyze_survey_alignment(
    year='2022',
    state='bengal',
    language='en',
    metric_type='soft',
    region_wise=False 
):
    # File paths
    wvs_filepath=f'data/india/{year}/{year}_india_majority_answers_by_persona_{language}.csv'
    filepath=f'llama_responses/most_frequent_answers_{state}_{language}.csv'
    questions_filepath='data/questions.json'
    config = 'data/chosen_cols_gemma_updated.json'
    
    if metric_type not in ['hard', 'soft']:
        raise ValueError("Metric type must be either 'hard' or 'soft'")

    # Process questions config
    answer_mappings_by_q, num_options_map = process_questions_config(questions_filepath)
    flat_answer_mapping = {}
    for q_map in answer_mappings_by_q.values():
        flat_answer_mapping.update(q_map)

    # Load data
    wvs_df = pd.read_csv(wvs_filepath)
    _df = pd.read_csv(filepath)
    
    # Splot columns with ':' and rename
    rename_map = {col: col.split(':')[0].strip() for col in wvs_df.columns if ':' in col}
    wvs_df.rename(columns=rename_map, inplace=True)
    _df.rename(columns=rename_map, inplace=True)
    
    # Standardize demographic columns
    demographic_mapping_responses = get_demographic_mapping()
    demographic_mapping_wvs = get_demographic_mapping(year)

    wvs_df.rename(columns=demographic_mapping_wvs, inplace=True)
    _df.rename(columns=demographic_mapping_responses, inplace=True)

    # Rename WVS question columns from V to Q if needed
    if year != '2022':
        with open('data/qsns_mapping.json', 'r') as f:
            qsns_mapping_data = json.load(f)
        qsns_mapping = qsns_mapping_data.get(str(year), {})
        
        # Get columns that have valid (non-null) mappings
        valid_columns = []
        rename_map_v_to_q = {}
        
        for col in wvs_df.columns:
            if col in qsns_mapping:
                if qsns_mapping[col] is not None:
                    rename_map_v_to_q[col] = qsns_mapping[col]
                    valid_columns.append(col)
            else:
                valid_columns.append(col)
        
        # Filter and rename
        wvs_df = wvs_df[valid_columns]
        wvs_df.rename(columns=rename_map_v_to_q, inplace=True)
        
    # Keep qsns that are present across years
    cols_to_drop_wvs = [col for col in wvs_df.columns if col[0] == 'Q' and col not in qsns_to_take]
    cols_to_drop_df = [col for col in _df.columns if col[0] == 'Q' and col not in qsns_to_take]
    wvs_df.drop(columns=cols_to_drop_wvs, inplace=True, errors='ignore')
    _df.drop(columns=cols_to_drop_df, inplace=True, errors='ignore')

    # Standardize answers
    not_scale_questions = ["Q42", "Q90", "Q149", "Q150", "Q151"]
    
    with open(config, "r") as f:
        data = json.load(f)
    selected_questions = [q for q, k in data['chosen_cols'].items() if k == True and q in wvs_df.columns and q in _df.columns]

    for df in [wvs_df, _df]:
        for col in selected_questions:
            if col in df.columns:
                df[col] = df[col].apply(
                    lambda x: flat_answer_mapping.get(str(x).strip(), x) if isinstance(x, str) else x
                )
                
    if year != '2022':
        # Map town size to urban/rural
        townsize_map = {
            "Under 5,000": "Rural",
            "5000-20000": "Urban",
            "20000-100000": "Urban"
        }
        wvs_df['urban_rural'] = wvs_df['urban_rural'].map(townsize_map)
        
        # Bin age
        age_bins = [0, 15, 24, 34, 44, 54, 64, 100]
        age_labels = ['0-15', '16-24', '25-34', '35-44', '45-54', '55-64', '65+']
        wvs_df['age'] = pd.to_numeric(wvs_df['age'], errors='coerce')
        wvs_df['age'] = pd.cut(wvs_df['age'], bins=age_bins, labels=age_labels, right=True)
        
    # print(wvs_df.columns)
    # print(_df.columns)
    
    # Merge datasets
    merge_cols = ['region', 'urban_rural', 'age', 'gender', 'marital_status', 'education_level', 'social_class']

    # Convert to string and normalize
    for df_name, df in [("WVS", wvs_df), ("Model", _df)]:
        for col in merge_cols:
            if col in df.columns:
                df[col] = df[col].astype(str).str.strip().str.lower()
                uniques = df[col].dropna().unique()[:10]

    # Try merging
    merged_df = pd.merge(wvs_df, _df, on=merge_cols, how='inner')
    
    # If merged_df is empty, return empty results
    if len(merged_df) == 0:
        print("triggered")
        return {}

    # Calculate metrics
    if region_wise:
        results_by_region = {}
        if 'region' not in merged_df.columns:
            raise ValueError("Region column not found in merged data. Cannot perform region-wise analysis.")
            
        unique_regions = merged_df['region'].unique()
        
        for region in unique_regions:
            region_df = merged_df[merged_df['region'] == region]
            
            hard_metric_scores, soft_metric_scores = [], []
            for q in selected_questions:
                survey_col, _col = f"{q}_x", f"{q}_y"
                if survey_col not in region_df.columns or _col not in region_df.columns:
                    continue
                
                # Perform calculation on the region-specific dataframe
                survey_answers = pd.to_numeric(region_df[survey_col], errors='coerce')
                model_answers = pd.to_numeric(region_df[_col], errors='coerce')
                valid_indices = (survey_answers.notna()) & (model_answers.notna()) & (survey_answers >= 0)
                if not valid_indices.any(): continue
                
                survey_answers = survey_answers[valid_indices]
                model_answers = model_answers[valid_indices]

                # Metric Logic (same as before)
                if metric_type == 'hard':
                    scores = (survey_answers == model_answers).astype(int)
                    hard_metric_scores.extend(scores)
                else:
                    if q in not_scale_questions:
                        scores = (survey_answers == model_answers).astype(int)
                        soft_metric_scores.extend(scores)
                    else:
                        num_options = num_options_map.get(q)
                        if not num_options or num_options <= 1: continue
                        error = np.abs(survey_answers - model_answers)
                        normalized_error = error / (num_options - 1)
                        scores = 1 - normalized_error
                        soft_metric_scores.extend(scores)
            
            # Store results for the current region
            region_results = {}
            if hard_metric_scores: region_results['hard_metric'] = np.mean(hard_metric_scores)
            if soft_metric_scores: region_results['soft_metric_unified'] = np.mean(soft_metric_scores)
            results_by_region[region] = region_results
            
        return results_by_region

    else: 
        hard_metric_scores, soft_metric_scores = [], []
        for q in selected_questions:
            survey_col, _col = f"{q}_x", f"{q}_y"
            if survey_col not in merged_df.columns or _col not in merged_df.columns: continue
            survey_answers = pd.to_numeric(merged_df[survey_col], errors='coerce')
            model_answers = pd.to_numeric(merged_df[_col], errors='coerce')
            valid_indices = (survey_answers.notna()) & (model_answers.notna()) & (survey_answers >= 0)
            if not valid_indices.any(): continue
            survey_answers = survey_answers[valid_indices]
            model_answers = model_answers[valid_indices]

            if metric_type == 'hard':
                scores = (survey_answers == model_answers).astype(int)
                hard_metric_scores.extend(scores)
            else:
                if q in not_scale_questions:
                    scores = (survey_answers == model_answers).astype(int)
                    soft_metric_scores.extend(scores)
                else:
                    num_options = num_options_map.get(q)
                    if not num_options or num_options <= 1: continue
                    error = np.abs(survey_answers - model_answers)
                    normalized_error = error / (num_options - 1)
                    scores = 1 - normalized_error
                    soft_metric_scores.extend(scores)

        results = {}
        if hard_metric_scores: results['hard_metric'] = np.mean(hard_metric_scores)
        if soft_metric_scores: results['soft_metric_unified'] = np.mean(soft_metric_scores)
        if not results: print("\nNo scores were calculated.")
        return results

## Temporal Similarity

In [42]:
import pandas as pd
import numpy as np

states = ['bengal', 'telangana', 'punjab', 'maharashtra', 'bihar', 'delhi', 'up']

all_results = []

for state in states:
    results_soft = analyze_survey_alignment(state=state, region_wise=True)
    results_hard = analyze_survey_alignment(state=state, metric_type='hard', region_wise=True)

    # Convert dicts to DataFrames
    df_soft = pd.DataFrame.from_dict(results_soft, orient='index')
    df_soft.index.name = 'Region'
    df_soft.reset_index(inplace=True)

    df_hard = pd.DataFrame.from_dict(results_hard, orient='index')
    df_hard.index.name = 'Region'
    df_hard.reset_index(inplace=True)

    # Merge soft and hard metrics side by side
    df_combined = pd.merge(df_soft, df_hard, on='Region', how='outer', suffixes=('_soft', '_hard'))
    df_combined['State'] = state
    all_results.append(df_combined)

# Concatenate all states
final_table = pd.concat(all_results, ignore_index=True)

# Add missing metric columns if needed
for col in ['soft_metric_unified', 'hard_metric']:
    if col not in final_table.columns:
        final_table[col] = np.nan

# Reorder columns safely
final_table = final_table[['State', 'Region', 'soft_metric_unified', 'hard_metric']]

# Display
print(final_table)

         State               Region  soft_metric_unified  hard_metric
0       bengal    in-wb west bengal             0.596840     0.201835
1    telangana      in-tg telangana             0.633617     0.284672
2       punjab         in-pb punjab             0.666667     0.309353
3  maharashtra    in-mh maharashtra             0.617512     0.253456
4        bihar          in-br bihar             0.655235     0.300000
5        delhi          in-dl delhi             0.630703     0.248792
6           up  in-up uttar pradesh             0.628272     0.272251


In [40]:
import pandas as pd
import numpy as np

states = ['bengal', 'telangana', 'punjab', 'maharashtra']

all_results = []

for state in states:
    results_soft = analyze_survey_alignment(state=state, year='2012', region_wise=True)
    results_hard = analyze_survey_alignment(state=state, year='2012', metric_type='hard', region_wise=True)

    # Convert dicts to DataFrames
    df_soft = pd.DataFrame.from_dict(results_soft, orient='index')
    df_soft.index.name = 'Region'
    df_soft.reset_index(inplace=True)

    df_hard = pd.DataFrame.from_dict(results_hard, orient='index')
    df_hard.index.name = 'Region'
    df_hard.reset_index(inplace=True)

    # Merge soft and hard metrics side by side
    df_combined = pd.merge(df_soft, df_hard, on='Region', how='outer', suffixes=('_soft', '_hard'))
    df_combined['State'] = state
    all_results.append(df_combined)

# Concatenate all states
final_table = pd.concat(all_results, ignore_index=True)

# Add missing metric columns if needed
for col in ['soft_metric_unified', 'hard_metric']:
    if col not in final_table.columns:
        final_table[col] = np.nan

# Reorder columns safely
final_table = final_table[['State', 'Region', 'soft_metric_unified', 'hard_metric']]

# Display
print(final_table)

triggered
triggered
triggered
triggered
         State             Region  soft_metric_unified  hard_metric
0       bengal  in-wb west bengal             0.642720     0.303448
1  maharashtra  in-mh maharashtra             0.664493     0.304348


## Language Analysis

In [23]:
import pandas as pd
import numpy as np

states = {'bengal':'bn'}

all_results = []

for state, language in states.items():
    results_soft = analyze_survey_alignment(state=state, language=language, region_wise=True)
    results_hard = analyze_survey_alignment(state=state, language=language, metric_type='hard', region_wise=True)

    # Convert dicts to DataFrames
    df_soft = pd.DataFrame.from_dict(results_soft, orient='index')
    df_soft.index.name = 'Region'
    df_soft.reset_index(inplace=True)

    df_hard = pd.DataFrame.from_dict(results_hard, orient='index')
    df_hard.index.name = 'Region'
    df_hard.reset_index(inplace=True)

    # Merge soft and hard metrics side by side
    df_combined = pd.merge(df_soft, df_hard, on='Region', how='outer', suffixes=('_soft', '_hard'))
    df_combined['State'] = state
    all_results.append(df_combined)

# Concatenate all states
final_table = pd.concat(all_results, ignore_index=True)

# Add missing metric columns if needed
for col in ['soft_metric_unified', 'hard_metric']:
    if col not in final_table.columns:
        final_table[col] = np.nan

# Reorder columns safely
final_table = final_table[['State', 'Region', 'soft_metric_unified', 'hard_metric']]

# Display
print(final_table)

Index(['year', 'country', 'region', 'urban_rural', 'gender', 'age', 'language',
       'marital_status', 'education_level', 'social_class',
       ...
       'G46_3', 'G46_4', 'G46_5', 'G47_1', 'G47_2', 'G47_3', 'G47_4', 'G47_5',
       'G1_2', 'G1_3'],
      dtype='object', length=456)
Index(['language', 'marital_status', 'gender', 'urban_rural', 'region', 'age',
       'education_level', 'social_class', 'Q106', 'Q107', 'Q108', 'Q110',
       'Q150', 'Q151', 'Q158', 'Q159', 'Q160', 'Q161', 'Q162', 'Q176', 'Q177',
       'Q178', 'Q179', 'Q180', 'Q181', 'Q184', 'Q185', 'Q187', 'Q188', 'Q189',
       'Q190', 'Q191', 'Q192', 'Q194', 'Q195', 'Q224', 'Q225', 'Q227', 'Q228',
       'Q229', 'Q230', 'Q231', 'Q232', 'Q233', 'Q234', 'Q235', 'Q236', 'Q238',
       'Q239', 'Q241', 'Q242', 'Q244', 'Q247', 'Q248', 'Q33', 'Q34', 'Q35',
       'Q37', 'Q38', 'Q39', 'Q40', 'Q41', 'Q42', 'Q43', 'Q44', 'Q45', 'Q64',
       'Q66', 'Q67', 'Q68', 'Q72', 'Q73', 'Q74', 'Q75', 'Q77', 'Q78', 'Q79',
       'Q80',