In [1]:
import pandas as pd

# Load the raw survey extract into a DataFrame.
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere until it is edited.
RAW_CSV_PATH = r"C:\Users\Antoni\OneDrive\Pulpit\CuBoulder\2 sem\Study\github\llm-cultural-bias\data\raw\test.csv"
df = pd.read_csv(RAW_CSV_PATH)

# Recode age into 4 groups
def recode_age(age):
    """Map a numeric age to one of four age-band labels.

    Bands are half-open so that every age of 18 or more lands in exactly one
    band, including non-integer values (e.g. 29.5 -> '18-29').

    Parameters
    ----------
    age : int or float
        Respondent age in years. May be NaN/None.

    Returns
    -------
    str or None
        '18-29', '30-49', '50-64' or '65+'. Returns None when the age is
        missing or below 18, so such rows are treated as missing by
        value_counts()/groupby() instead of being counted as '65+'.
    """
    # Missing values and anything under 18 are not a valid band.
    # Presumably negative numbers are survey missing-value codes -- confirm
    # against the codebook.
    if pd.isna(age) or age < 18:
        return None
    if age < 30:
        return '18-29'
    if age < 50:
        return '30-49'
    if age < 65:
        return '50-64'
    return '65+'

# Age: bucket the raw Q262 value with recode_age.
df['AGE_GROUP'] = df['Q262'].apply(recode_age)

# Sex and education: translate numeric codes to labels.
# Each entry is label column -> (source column, code-to-label mapping).
# Codes that are not in a mapping come out as NaN from Series.map.
label_specs = {
    'SEX_LABEL': ('Q260', {1: 'Male', 2: 'Female'}),
    'EDU_LABEL': ('Q275R', {1: 'Lower', 2: 'Middle', 3: 'Higher'}),
}
for label_col, (code_col, code_to_label) in label_specs.items():
    df[label_col] = df[code_col].map(code_to_label)

# ============================================
# PART 1: Count observations per country
# ============================================
# Number of rows for each country code, largest first (value_counts default).
country_totals = df['B_COUNTRY_ALPHA'].value_counts()

header_rule = "=" * 60
print(header_rule, "TOTAL OBSERVATIONS PER COUNTRY", header_rule, sep="\n")
print(country_totals)

# ============================================
# PART 2: Count by each demographic feature
# ============================================
section_rule = "=" * 60
print(f"\n{section_rule}")
print("BREAKDOWN BY DEMOGRAPHIC FEATURES")
print(section_rule)

# Fixed display orders so every country's table lists categories the same way.
# reindex keeps a category with no rows in the table (shown as NaN).
age_order = ['18-29', '30-49', '50-64', '65+']
edu_order = ['Lower', 'Middle', 'Higher']

for country in ('CHN', 'SVK', 'USA'):
    country_df = df[df['B_COUNTRY_ALPHA'] == country]

    # Rows with a missing label are left out by value_counts.
    sex_counts = country_df['SEX_LABEL'].value_counts().sort_index()
    age_counts = country_df['AGE_GROUP'].value_counts().reindex(age_order)
    edu_counts = country_df['EDU_LABEL'].value_counts().reindex(edu_order)

    print(f"\n--- {country} ---")
    print(f"Total N: {len(country_df)}")
    print("\nSex:")
    print(sex_counts)
    print("\nAge Group:")
    print(age_counts)
    print("\nEducation:")
    print(edu_counts)

# ============================================
# PART 3: Count all persona combinations
# ============================================
title_rule = "=" * 60
print(f"\n{title_rule}")
print("PERSONA BASKET COUNTS (Country × Sex × Age × Edu)")
print(title_rule)

# One row per observed (country, sex, age group, education) combination.
# NOTE(review): groupby only lists combinations that occur in the data, and it
# drops rows with a missing key, so a persona with zero respondents never
# appears here and is not counted in the "N < ..." lines below.
persona_keys = ['B_COUNTRY_ALPHA', 'SEX_LABEL', 'AGE_GROUP', 'EDU_LABEL']
persona_counts = (
    df.groupby(persona_keys)
    .size()
    .reset_index(name='N')
)
basket_sizes = persona_counts['N']

print(f"\nTotal unique personas: {len(persona_counts)}")
print(f"Expected max: 3 × 2 × 4 × 3 = {3 * 2 * 4 * 3}")

# How many baskets fall under each size threshold.
print()
for cutoff in (10, 30, 50):
    print(f"Personas with N < {cutoff}: {int((basket_sizes < cutoff).sum())}")
print(f"Personas with N >= 50: {int((basket_sizes >= 50).sum())}")

# Summary statistics of basket size.
print(f"\nMin N: {basket_sizes.min()}")
print(f"Max N: {basket_sizes.max()}")
print(f"Mean N: {basket_sizes.mean():.1f}")
print(f"Median N: {basket_sizes.median():.1f}")

print("\n--- Smallest persona baskets ---")
smallest_baskets = persona_counts.nsmallest(15, 'N')
print(smallest_baskets.to_string(index=False))

print("\n--- All personas sorted by N ---")
baskets_by_size = persona_counts.sort_values('N')
print(baskets_by_size.to_string(index=False))

TOTAL OBSERVATIONS PER COUNTRY
B_COUNTRY_ALPHA
CHN    2956
USA    2506
SVK    1065
Name: count, dtype: int64

BREAKDOWN BY DEMOGRAPHIC FEATURES

--- CHN ---
Total N: 2956

Sex:
SEX_LABEL
Female    1622
Male      1334
Name: count, dtype: int64

Age Group:
AGE_GROUP
18-29     553
30-49    1247
50-64     841
65+       315
Name: count, dtype: int64

Education:
EDU_LABEL
Lower     1614
Middle     675
Higher     667
Name: count, dtype: int64

--- SVK ---
Total N: 1065

Sex:
SEX_LABEL
Female    579
Male      486
Name: count, dtype: int64

Age Group:
AGE_GROUP
18-29     95
30-49    342
50-64    337
65+      291
Name: count, dtype: int64

Education:
EDU_LABEL
Lower     382
Middle    484
Higher    199
Name: count, dtype: int64

--- USA ---
Total N: 2506

Sex:
SEX_LABEL
Female    1160
Male      1346
Name: count, dtype: int64

Age Group:
AGE_GROUP
18-29     605
30-49    1035
50-64     534
65+       332
Name: count, dtype: int64

Education:
EDU_LABEL
Lower       42
Middle    1174
Higher    1290
Nam