In [1]:
import pandas as pd

# Load the full Pew "India Religion" public export (all variables).
#
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed inferred
# types. NOTE(review): the saved output of this cell echoes the read_csv line
# the way a Python warning does, presumably a mixed-type DtypeWarning; confirm
# on a fresh run. If memory becomes a concern, pass an explicit `dtype=`
# mapping for the affected columns instead.
df = pd.read_csv(
    "pew/India Religion Public Data - Pew Research Center (All Vars).csv",
    low_memory=False,
)

# Report the size of the raw frame.
rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")

Rows: 29999, Columns: 308


  df = pd.read_csv("pew/India Religion Public Data - Pew Research Center (All Vars).csv")


In [2]:
# Column count of the raw frame, unpacked from df.shape in the previous cell.
cols

308

In [5]:
import json

# Read the column-selection config.
#   data["chosen_cols"]  maps a name to a flag; only entries whose flag is
#                        exactly True are kept (truthy non-bool values are not).
#   data["persona_cols"] is used as-is.
with open("data/chosen_cols.json", "r") as f:
    data = json.load(f)

chosen_cols = [colname for colname, keep in data["chosen_cols"].items() if keep is True]
persona_cols = data["persona_cols"]

## Chosen Persona Features

In [6]:
import pandas as pd

# Count the rows that share each distinct combination of persona column
# values, then persist the table with the count in a 'Counts' column.
# NOTE(review): groupby's default (dropna=True) leaves out rows that have a
# missing value in any persona column; confirm that is intended.
persona_group_sizes = df[persona_cols].groupby(persona_cols).size()
grouped = persona_group_sizes.reset_index(name='Counts')
grouped.to_csv("data/2022_india_persona_groups.csv", index=False)

## Chosen Columns

In [7]:
from IPython.display import display
import pandas as pd

def is_chosen_col(col):
    """Return True when the column's question id is listed in ``chosen_cols``.

    The question id is the text before the first ':' in ``col``, with
    surrounding whitespace removed. ``chosen_cols`` is read from the notebook's
    global namespace at call time.
    """
    question_id, _sep, _label = col.partition(':')
    return question_id.strip() in chosen_cols

# Persona columns come first, followed by every column of df whose question id
# was selected; the resulting subset is written out as the cleaned dataset.
selected_question_cols = list(filter(is_chosen_col, df.columns))
chosen_col_names = persona_cols + selected_question_cols
chosen = df[chosen_col_names]
chosen.to_csv("data/2022_india_cleaned.csv", index=False)

# Report the size of the cleaned subset (this rebinds `rows` and `cols`).
rows, cols = chosen.shape
print(f"Rows: {rows}, Columns: {cols}")

Rows: 1692, Columns: 178


In [8]:
# Sizes of the persona column list and of the selected question-id list.
len(persona_cols), len(chosen_cols)

(14, 164)

In [9]:
# Number of rows of the cleaned subset per region code.
region_col = 'N_REGION_ISO: Region ISO 3166-2'
region_sizes = chosen.groupby(region_col).size()
users_per_region = region_sizes.reset_index(name='User Count')
print(users_per_region)

  N_REGION_ISO: Region ISO 3166-2  User Count
0                     IN-BR Bihar         241
1                     IN-DL Delhi         229
2                   IN-HR Haryana         115
3               IN-MH Maharashtra         202
4                    IN-PB Punjab         122
5                 IN-TG Telangana         210
6             IN-UP Uttar Pradesh         360
7               IN-WB West Bengal         213


## Charts

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# Output directory for the generated heatmaps.
os.makedirs("plots2", exist_ok=True)

# Load each country's cleaned export and tag its rows with a Country label.
# NOTE(review): the "Chosen Columns" cell writes data/2022_india_cleaned.csv,
# but India is read from data/2022_cleaned.csv here; confirm which file is
# intended.
df1 = pd.read_csv("data/2022_cleaned.csv").assign(Country="India")
df2 = pd.read_csv("data/2022_russia_cleaned.csv").assign(Country="Russia")
df3 = pd.read_csv("data/2022_US_cleaned.csv").assign(Country="US")
df4 = pd.read_csv("data/2022_japan_cleaned.csv").assign(Country="Japan")
df5 = pd.read_csv("data/2022_australia_cleaned.csv").assign(Country="Australia")

# Stack all countries into one frame with a fresh 0..n-1 index.
# NOTE(review): this rebinds `df`, the name earlier cells use for the raw
# survey frame; re-running those cells afterwards will see the combined frame.
country_frames = [df1, df2, df3, df4, df5]
df = pd.concat(country_frames, ignore_index=True)

def sanitize_filename(name: str) -> str:
    """Return ``name`` with every character outside ``A-Z a-z 0-9 _ -`` replaced by ``_``.

    Each offending character is replaced one-for-one, so the result has the
    same length as the input.
    """
    unsafe_char = re.compile(r'[^A-Za-z0-9_\-]')
    return unsafe_char.sub('_', name)

# Columns to plot (those whose question id was selected) and the grouping
# column(s) whose categories become the heatmap rows.
cols_to_show = [col for col in df.columns if is_chosen_col(col)]
cols_to_compare = [
    'N_REGION_ISO: Region ISO 3166-2'
]

for persona_value in cols_to_compare:
    print(f"Comparing Different {persona_value} Options")
    safe_persona = sanitize_filename(persona_value)

    for col in cols_to_show:
        # Never plot the grouping column against itself.
        if col == persona_value:
            continue

        # Rows: groups of persona_value; columns: answer options of col.
        # Each cell is the option's share of its row total, in percent.
        option_counts = df.groupby(persona_value)[col].value_counts().unstack(fill_value=0)
        row_totals = option_counts.sum(axis=1)
        option_pct = option_counts.div(row_totals, axis=0).mul(100)

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.heatmap(option_pct, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
        ax.set_title(f'Option Percentage Heatmap for {col} by {persona_value}')
        ax.set_xlabel(col)
        ax.set_ylabel(persona_value)
        fig.tight_layout()

        safe_col = sanitize_filename(col)
        filename = f"plots2/heatmap_{safe_col}_by_{safe_persona}.png"

        fig.savefig(filename, dpi=300)
        # Close each figure so open figures do not pile up across iterations.
        plt.close(fig)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# Output directory for the generated heatmaps.
os.makedirs("plots2", exist_ok=True)

# Load each country's cleaned export and tag its rows with a Country label.
# NOTE(review): this repeats the loading code of the previous charts cell.
# NOTE(review): the "Chosen Columns" cell writes data/2022_india_cleaned.csv,
# but India is read from data/2022_cleaned.csv here; confirm which file is
# intended.
df1 = pd.read_csv("data/2022_cleaned.csv").assign(Country="India")
df2 = pd.read_csv("data/2022_russia_cleaned.csv").assign(Country="Russia")
df3 = pd.read_csv("data/2022_US_cleaned.csv").assign(Country="US")
df4 = pd.read_csv("data/2022_japan_cleaned.csv").assign(Country="Japan")
df5 = pd.read_csv("data/2022_australia_cleaned.csv").assign(Country="Australia")

# Stack all countries into one frame with a fresh 0..n-1 index.
# NOTE(review): this rebinds `df`, the name earlier cells use for the raw
# survey frame; re-running those cells afterwards will see the combined frame.
country_frames = [df1, df2, df3, df4, df5]
df = pd.concat(country_frames, ignore_index=True)

def sanitize_filename(name: str) -> str:
    """Replace every character that is not an ASCII letter, digit, ``_`` or ``-`` with ``_``.

    NOTE(review): an identical definition exists in the previous charts cell;
    running this cell rebinds the same name.
    """
    return re.sub(pattern=r'[^A-Za-z0-9_\-]', repl='_', string=name)

# Columns to plot (those whose question id was selected) and the grouping
# column(s) whose categories become the heatmap rows. Uncomment further
# entries to produce heatmaps for those groupings as well.
cols_to_show = [col for col in df.columns if is_chosen_col(col)]
cols_to_compare = [
    'Country',
    # 'N_REGION_ISO: Region ISO 3166-2',
    # 'H_URBRURAL: Urban-Rural',
    # 'Q260: Sex',
    # 'X003R: Age recoded (6 intervals)',
    # 'Q272: Language at home',
    # 'Q273: Marital status',
    # 'Q275R: Highest educational level: Respondent (recoded into 3 groups)',
    # 'Q279: Employment status',
    # 'Q287: Social class (subjective)',
    # 'Q289: Religious denominations - major groups'
]

for persona_value in cols_to_compare:
    print(f"Comparing Different {persona_value} Options")
    safe_persona = sanitize_filename(persona_value)

    for col in cols_to_show:
        # Never plot the grouping column against itself.
        if col == persona_value:
            continue

        # Rows: groups of persona_value; columns: answer options of col.
        # Each cell is the option's share of its row total, in percent.
        option_counts = df.groupby(persona_value)[col].value_counts().unstack(fill_value=0)
        row_totals = option_counts.sum(axis=1)
        option_pct = option_counts.div(row_totals, axis=0).mul(100)

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.heatmap(option_pct, annot=True, fmt=".1f", cmap="YlGnBu", ax=ax)
        ax.set_title(f'Option Percentage Heatmap for {col} by {persona_value}')
        ax.set_xlabel(col)
        ax.set_ylabel(persona_value)
        fig.tight_layout()

        safe_col = sanitize_filename(col)
        filename = f"plots2/heatmap_{safe_col}_by_{safe_persona}.png"

        fig.savefig(filename, dpi=300)
        # Close each figure so open figures do not pile up across iterations.
        plt.close(fig)

Comparing Different Country Options
