In [1]:
#How does financial literacy vary across age groups and genders in the 2024 NFCS data?
#How does financial literacy vary across regions and divisions in the 2024 NFCS data?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# === 1) Load original dataset ===
# cleaned_df = pd.read_csv("2009 Cleaned.csv", na_values=["", " ", "NA", "N/A", "null", ".", "na"])
original_df = pd.read_csv("NFCS 2009 State Data 220712 v2.csv", na_values=["", " ", "NA", "N/A", "null", ".", "na"])

YES_NO = {1: "Yes", 2: "No", 98: "Don't know", 99: "Prefer not to say"}
AGREE_1_TO_7 = {
    1: "1 - Strongly disagree", 2: "2", 3: "3",
    4: "4 - Neither agree nor disagree", 5: "5", 6: "6",
    7: "7 - Strongly agree", 98: "Don't know", 99: "Prefer not to say"
}
CONFIDENCE_1_TO_10 = {
    1: "1 - Not at all confident", 2: "2", 3: "3", 4: "4", 5: "5",
    6: "6", 7: "7", 8: "8", 9: "9", 10: "10 - Extremely confident",
    98: "Don't know", 99: "Prefer not to say"
}
FREQUENCY_4PT = {
    1: "Very difficult / At least once a year*",  # Use only when appropriate
    2: "Somewhat difficult / Once every few years*",
    3: "Not at all difficult / Rarely*",
    4: "Never*",  # starred to remind: some items only use 1–3 + DK/PNTS
    98: "Don't know", 99: "Prefer not to say"
}
TEN_SCALE_SATISFACTION = {
    1: "1 - Not at all satisfied", 2: "2", 3: "3", 4: "4", 5: "5",
    6: "6", 7: "7", 8: "8", 9: "9", 10: "10 - Extremely satisfied",
    98: "Don't know", 99: "Prefer not to say"
}
RISK_TOLERANCE_4 = {
    1: "Take substantial risks for substantial returns",
    2: "Above-average risks for above-average returns",
    3: "Average risks for average returns",
    4: "Not willing to take any risks",
    98: "Don't know", 99: "Prefer not to say"
}
TRUE_FALSE = {1: "True", 2: "False", 98: "Don't know", 99: "Prefer not to say"}

# ----------------------
# Variable-specific scales
# ----------------------
# Geography
STATEQ = {
    1:"Alabama",2:"Alaska",3:"Arizona",4:"Arkansas",5:"California",6:"Colorado",7:"Connecticut",8:"Delaware",
    9:"District of Columbia",10:"Florida",11:"Georgia",12:"Hawaii",13:"Idaho",14:"Illinois",15:"Indiana",16:"Iowa",
    17:"Kansas",18:"Kentucky",19:"Louisiana",20:"Maine",21:"Maryland",22:"Massachusetts",23:"Michigan",24:"Minnesota",
    25:"Mississippi",26:"Missouri",27:"Montana",28:"Nebraska",29:"Nevada",30:"New Hampshire",31:"New Jersey",
    32:"New Mexico",33:"New York",34:"North Carolina",35:"North Dakota",36:"Ohio",37:"Oklahoma",38:"Oregon",
    39:"Pennsylvania",40:"Rhode Island",41:"South Carolina",42:"South Dakota",43:"Tennessee",44:"Texas",45:"Utah",
    46:"Vermont",47:"Virginia",48:"Washington",49:"West Virginia",50:"Wisconsin",51:"Wyoming"
}
CENSUSDIV = {
    1:"New England",2:"Middle Atlantic",3:"East North Central",4:"West North Central",
    5:"South Atlantic",6:"East South Central",7:"West South Central",8:"Mountain",9:"Pacific"
}
CENSUSREG = {1:"Northeast",2:"Midwest",3:"South",4:"West"}

# Demographics
A3_GENDER = {1: "Male", 2: "Female"}  # A3
A3Ar_w_AGE = {1:"18-24",2:"25-34",3:"35-44",4:"45-54",5:"55-64",6:"65+"}
A3B_GENDER_AGE = {
    1:"Male 18-24",2:"Male 25-34",3:"Male 35-44",4:"Male 45-54",5:"Male 55-64",6:"Male 65+",
    7:"Female 18-24",8:"Female 25-34",9:"Female 35-44",10:"Female 45-54",11:"Female 55-64",12:"Female 65+"
}
A4A_new_w_ETHNICITY = {1:"White Alone NH", 2:"Non-White"}
A5_EDUCATION = {
    1:"Did not complete high school",2:"High school graduate",3:"Some college",
    4:"College graduate",5:"Post graduate education",99:"Prefer not to say"
}
A6_MARITAL = {1:"Married",2:"Single",3:"Separated",4:"Divorced",5:"Widowed/widower",99:"Prefer not to say"}
A7_HH_COMPOSITION = {
    1:"Only adult in household",2:"Live with spouse/partner",3:"Live in parents' home",
    4:"Live with other family/friends/roommates",99:"Prefer not to say"
}
A7A_MARITAL_NET = {1:"Married",2:"Living with partner",3:"Single"}
A8_INCOME = {
    1:"< $15,000",2:"$15k–< $25k",3:"$25k–< $35k",4:"$35k–< $50k",5:"$50k–< $75k",
    6:"$75k–< $100k",7:"$100k–< $150k",8:"$150k or more",98:"Don't know",99:"Prefer not to say"
}
A9_WORK = {
    1:"Self-employed",2:"Work full-time for employer",3:"Work part-time for employer",4:"Homemaker",
    5:"Full-time student",6:"Permanently sick/disabled/unable to work",7:"Unemployed or temporarily laid off",
    8:"Retired",99:"Prefer not to say"
}
A10_SPOUSE_WORK = A9_WORK.copy()
A10A_RETIREMENT_HH = {
    1:"Non-retired household",2:"Retired household - respondent retired",
    3:"Retired household - respondent not working; spouse retired"
}
A11_DEPENDENT_CHILDREN = {1:"1",2:"2",3:"3",4:"4 or more",5:"No financially dependent children",6:"Do not have any children",99:"Prefer not to say"}
A14_MOST_KNOWLEDGEABLE = {1:"You",2:"Someone else",3:"You and someone else equally",98:"Don't know",99:"Prefer not to say"}
A15_PAYS_BILLS = {1:"You",2:"Someone else",3:"Share responsibility",98:"Don't know",99:"Prefer not to say"}

# Banking / accounts (B*)
B1_CHECKING = YES_NO.copy()
B2_SAVINGS = YES_NO.copy()
B3_DEBIT = YES_NO.copy()
B4_OVERDRAW = YES_NO.copy()
B11_CCCASH = YES_NO.copy()
B12_GROCER_CASH = YES_NO.copy()
B13_MONEY_ORDERS = YES_NO.copy()
B14_NONRET_INV = YES_NO.copy()

# Retirement (C*)
C1_EMPLOYER_PLAN = YES_NO.copy()
C2_WHOSE_EMPLOYER = {
    1:"Your employer",2:"Spouse/partner's employer",3:"Both your and spouse/partner's employer",
    98:"Don't know",99:"Prefer not to say"
}
C3_SELF_SELECT_INV = YES_NO.copy()
C4_OTHER_RET_ACCTS = YES_NO.copy()
C5_CONTRIBUTE_REG = YES_NO.copy()
C7_SHARE_IN_STOCKS = {1:"More than half",2:"Less than half",3:"None",98:"Don't know",99:"Prefer not to say"}
C8_TARGET_DATE_FUND = YES_NO.copy()
C9_CHECK_ACCTS = {1:"At least once a year",2:"Once every few years",3:"Rarely",4:"Never",98:"Don't know",99:"Prefer not to say"}
C10_RET_LOAN_12MO = YES_NO.copy()
C11_HARDSHIP_WITHDRAW_12MO = YES_NO.copy()

# Housing / credit (E*, F*, G*)
E7_HAS_MORTGAGE = YES_NO.copy()
E8_HELOC = YES_NO.copy()
E12_MORTGAGE_TYPE = {1:"Fixed-rate mortgage",2:"Adjustable-rate mortgage",98:"Don't know",99:"Prefer not to say"}
E14_INT_ONLY = {1:"Yes - Interest-only mortgage or interest-only option",2:"No - Neither",98:"Don't know",99:"Prefer not to say"}
E15_BEHIND_MORT = {1:"Never",2:"Once",3:"More than once",98:"Don't know",99:"Prefer not to say"}

F1_NUM_CREDIT_CARDS = {
    1:"1",2:"2–3",3:"4–8",4:"9–12",5:"13–20",6:"More than 20",7:"No credit cards",
    98:"Don't know",99:"Prefer not to say"
}
G1_AUTO_LOAN_NOW = YES_NO.copy()
G2_COMPARE_AUTO_LENDERS = YES_NO.copy()
G4_BANKRUPTCY_2YRS = YES_NO.copy()

# Objective knowledge & attitudes (M*)
M1_1_CONSISTENT_SPENDER = AGREE_1_TO_7.copy()
M1_2_PAYS_BILLS_ON_TIME = AGREE_1_TO_7.copy()
M1_3_BUDGETER = AGREE_1_TO_7.copy()
M4_SELF_ASSESS_KNOWLEDGE = {
    1:"1 - Very low",2:"2",3:"3",4:"4",5:"5",6:"6",7:"7 - Very high",
    98:"Don't know",99:"Prefer not to say"
}
# Five objective knowledge items (classic 2009 set)
M6_INTEREST = {1:"More than $102",2:"Exactly $102",3:"Less than $102",98:"Don't know",99:"Prefer not to say"}
M7_INFLATION = {1:"More than today",2:"Exactly the same",3:"Less than today",98:"Don't know",99:"Prefer not to say"}
M8_BONDS = {
    1:"They will rise",2:"They will fall",3:"They will stay the same",
    4:"No relationship between bond prices and interest rates",
    98:"Don't know",99:"Prefer not to say"
}
M9_MORTGAGE = TRUE_FALSE.copy()
M10_DIVERSIFICATION = TRUE_FALSE.copy()

# Weights & IDs
NFCSID = None
WGT_N2 = None
WGT_D2 = None
WGT_S3 = None

# ----------------------
# Column → labels mapping
# ----------------------
LABELS_BY_COLUMN = {
    # IDs / weights
    "NFCSID": NFCSID,
    "wgt_n2": WGT_N2,
    "wgt_d2": WGT_D2,
    "wgt_s3": WGT_S3,

    # Geography
    "STATEQ": STATEQ,
    "CENSUSDIV": CENSUSDIV,
    "CENSUSREG": CENSUSREG,

    # Demographics
    "A3": A3_GENDER,
    "A3Ar_w": A3Ar_w_AGE,
    "A3B": A3B_GENDER_AGE,
    "A4A_new_w": A4A_new_w_ETHNICITY,
    "A5": A5_EDUCATION,
    "A6": A6_MARITAL,
    "A7": A7_HH_COMPOSITION,
    "A7A": A7A_MARITAL_NET,
    "A8": A8_INCOME,
    "A9": A9_WORK,
    "A10": A10_SPOUSE_WORK,
    "A10A": A10A_RETIREMENT_HH,
    "A11": A11_DEPENDENT_CHILDREN,
    "A14": A14_MOST_KNOWLEDGEABLE,
    "A15": A15_PAYS_BILLS,

    # Banking / accounts (B*)
    "B1": B1_CHECKING,
    "B2": B2_SAVINGS,
    "B3": B3_DEBIT,
    "B4": B4_OVERDRAW,
    "B11": B11_CCCASH,
    "B12": B12_GROCER_CASH,
    "B13": B13_MONEY_ORDERS,
    "B14": B14_NONRET_INV,

    # Retirement (C*)
    "C1": C1_EMPLOYER_PLAN,
    "C2": C2_WHOSE_EMPLOYER,
    "C3": C3_SELF_SELECT_INV,
    "C4": C4_OTHER_RET_ACCTS,
    "C5": C5_CONTRIBUTE_REG,
    "C7": C7_SHARE_IN_STOCKS,
    "C8": C8_TARGET_DATE_FUND,
    "C9": C9_CHECK_ACCTS,
    "C10": C10_RET_LOAN_12MO,
    "C11": C11_HARDSHIP_WITHDRAW_12MO,

    # Housing / credit (E*, F*, G*)
    "E7": E7_HAS_MORTGAGE,
    "E8": E8_HELOC,
    "E12": E12_MORTGAGE_TYPE,
    "E14": E14_INT_ONLY,
    "E15": E15_BEHIND_MORT,

    "F1": F1_NUM_CREDIT_CARDS,

    "G1": G1_AUTO_LOAN_NOW,
    "G2": G2_COMPARE_AUTO_LENDERS,
    "G4": G4_BANKRUPTCY_2YRS,

    # Attitudes & knowledge (M*)
    "M1_1": M1_1_CONSISTENT_SPENDER,
    "M1_2": M1_2_PAYS_BILLS_ON_TIME,
    "M1_3": M1_3_BUDGETER,
    "M4": M4_SELF_ASSESS_KNOWLEDGE,

    # Objective knowledge – literacy quiz (5 items)
    "M6": M6_INTEREST,
    "M7": M7_INFLATION,
    "M8": M8_BONDS,
    "M9": M9_MORTGAGE,
    "M10": M10_DIVERSIFICATION,
}


# Financial Literacy

### Financial Literacy is defined by the following columns
"J5": "Emergency Funds",
"J6": "College Savings",
"J20": "Emergency Confidence",
"J32": "Credit Record",
"B1": "Checking Account",
"B2": "Savings Account",
"B4": "Overdraw",
"C1_2012": "Retirement Plan",
"C5_2012": "Other Retirement Accounts",
"B14A_1": "Other Investments",
"E15_2015": "Mortgage Payment",
"F1": "Credit Card",
"G20": "Past Due Payments",
"G35": "Late Student Loan",
"G38": "Debt Agency Contact",
"M1_1": "Financial Confidence",
"M4": "Financial Knowledge"

### To score a person on financial literacy we will be scoring them weighting their scores based on the format of the questions 
(To discuss with group)
Current Scheme:
J5: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
J6: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
J20: Certain could come up with full $2,000 (5), Probably could (4), Probably could not (3), certain could not (2), Don't know (0), Prefer not to say (0), No Response Given (0)
J32: Very Good (5), Good (4), About Average (3), Bad (2), Very Bad (1), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B1: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B2: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B4: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
C1_2012: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
C5_2012: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B14A_1: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
E15_2015: Never (?), Once (?), More than once (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
F1: 2 to 3 (?), 3 to 4 (?), 1 (?), 4 to 8 (?), 9 to 12 (?), 13 to 20 (?), More than 20, No credit cards (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
G20: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
G35: Never, payments not due (?), Never (?), repaying on time each month (?), Once (?), More than once (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
G38: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)


In [2]:
# ----- Bucket 1: Critical Stability (max 5 each) -----
# Measures immediate financial resilience and payment reliability.

J5_w = {  # Emergency/rainy-day funds (Yes = best)
    1: 5,  # Yes
    2: 1,  # No
    98: 0, 99: 0, "No Response Given": 0
}

E15_w = {  # Missed mortgage payment (in last 2 years)
    1: 5,  # Never
    2: 3,  # Once
    3: 1,  # More than once
    98: 0, 99: 0, "No Response Given": 0
}

J10_w = {  # Large unexpected drop in income (past 12 months)
    2: 5,  # No
    1: 1,  # Yes
    98: 0, 99: 0, "No Response Given": 0
}

# ----- Bucket 2: Core Access & Assets (max 3 each) -----
# Captures fundamental financial inclusion and asset accumulation.

B1_w = {  # Checking account
    1: 3,  # Yes
    2: 1,  # No
    98: 0, 99: 0, "No Response Given": 0
}

B2_w = {  # Savings / money market / CDs
    1: 3,  # Yes
    2: 1,
    98: 0, 99: 0, "No Response Given": 0
}

C1_w = {  # Any employer retirement plan
    1: 3,
    2: 1,
    98: 0, 99: 0, "No Response Given": 0
}

C5_w = {  # Contribute regularly to retirement plan
    1: 3,
    2: 1,
    98: 0, 99: 0, "No Response Given": 0
}

B14_w = {  # Other (non-retirement) investments
    1: 3,
    2: 1,
    98: 0, 99: 0, "No Response Given": 0
}

# ----- Bucket 3: Credit Health & Debt Behavior (max 3 each) -----
# Evaluates responsible borrowing and credit management behavior.

F1_w = {  # Number of credit cards
    2: 3,  # 2–3 cards (optimal)
    3: 3,  # 4–8 cards (still good)
    1: 2,  # 1 card
    4: 1,  # 9–12 cards
    5: 0,  # 13–20 cards
    6: 0,  # >20 cards
    7: 1,  # None
    98: 0, 99: 0, "No Response Given": 0
}

B4_w = {  # Overdraw checking account
    2: 3,  # No
    1: 1,  # Yes
    98: 0, 99: 0, "No Response Given": 0
}

E8_w = {  # Home equity loan (proxy for debt exposure)
    2: 3,  # No (less debt exposure)
    1: 2,  # Yes
    98: 0, 99: 0, "No Response Given": 0
}

# ----- Bucket 4: Financial Planning & Preparedness (max 2 each) -----
# Focuses on forward-looking saving and planning behavior.

J6_w = {  # Saving for children’s college
    1: 2,  # Yes
    2: 1,
    98: 0, 99: 0, "No Response Given": 0
}

J8_w = {  # Figured out how much needed for retirement
    1: 2,  # Yes
    2: 1,
    98: 0, 99: 0, "No Response Given": 0
}

J9_w = {  # Pre-retirement planning done
    1: 2,  # Yes
    2: 1,
    98: 0, 99: 0, "No Response Given": 0
}

# ================================================
# Combined WEIGHTS dictionary (2009-only columns)
# ================================================
WEIGHTS = {
    # Critical Stability
    "Emergency Funds": J5_w,
    "Mortgage Payment": E15_w,
    "Income Drop": J10_w,

    # Core Access & Assets
    "Checking Account": B1_w,
    "Savings Account": B2_w,
    "Retirement Plan": C1_w,
    "Retirement Contributions": C5_w,
    "Other Investments": B14_w,

    # Credit Health & Debt Behavior
    "Credit Card": F1_w,
    "Overdraw": B4_w,
    "Home Equity Loan": E8_w,

    # Planning & Preparedness
    "College Savings": J6_w,
    "Retirement Planning": J8_w,
    "Pre-Retirement Actions": J9_w,
}


In [3]:
# Rename dictionary for predictors (2009 columns)
predictor_rename_dict = {
    # Demographics
    "A5": "Education",
    "A11": "Dependent Children",
    "A8": "Annual Income",
    "A3": "Gender",
    "A3Ar_w": "Age Group",
    "A3B": "Gender&Age",

    # Optional: geography helpers (handy for grouping/plots)
    "STATEQ": "State",
    "CENSUSDIV": "Census Division",
    "CENSUSREG": "Census Region",
}

# Rename dictionary for financial literacy outcomes / behaviors (2009 columns)
finlit_rename_dict = {
    # Stability & resilience
    "J5": "Emergency Funds",
    "E15": "Mortgage Payment",          # (lateness in last 2 years)
    "J10": "Income Drop",               # (large unexpected drop in income, 12 mo)

    # Access & assets
    "B1": "Checking Account",
    "B2": "Savings Account",
    "C1": "Retirement Plan",            # any employer plan
    "C4": "Other Retirement Accounts",  # (IRAs/other, distinct from contributions)
    "C5": "Retirement Contributions",   # contribute regularly
    "B14": "Other Investments",         # non-retirement investments

    # Credit & debt behavior
    "F1": "Credit Card",                # number of cards
    "B4": "Overdraw",                   # overdraw checking
    "E8": "Home Equity Loan",

    # Planning (present in 2009; useful for readiness)
    "J6": "College Savings",
    "J8": "Retirement Planning",        # figured out how much needed
    "J9": "Pre-Retirement Actions",     # took concrete steps

    # Self-assessed confidence/knowledge
    "M1_1": "Financial Confidence",
    "M4": "Financial Knowledge",
}

# Combine both into one dictionary
rename_dict = {**predictor_rename_dict, **finlit_rename_dict}

# Rename columns in dataframe
original_df = original_df.rename(columns=rename_dict)


In [4]:
# Defining Financial Literacy
financial_literacy = original_df.copy()
print(financial_literacy.head(10).to_string)

for col, weight_dict in WEIGHTS.items():
    # financial_literacy[f"{col}_score"] = financial_literacy[col].map(weight_dict)
    original_df[f"{col}_score"] = original_df[col].map(weight_dict)



<bound method DataFrame.to_string of        NFCSID  State  Census Division  Census Region  Gender  Age Group  \
0  2009010001     43                6              3       2          2   
1  2009010002     44                7              3       1          5   
2  2009010003     44                7              3       1          5   
3  2009010004     33                2              1       2          3   
4  2009010005      5                9              4       1          6   
5  2009010006      5                9              4       2          3   
6  2009010007     15                3              2       2          4   
7  2009010008     39                2              1       1          4   
8  2009010009     33                2              1       2          3   
9  2009010010     33                2              1       1          3   

   Gender&Age  A4A_new_w  Education  A6  ...  M1_3  Financial Knowledge  M6  \
0           8          2          5   1  ...     6        

In [5]:
original_df["Total_Score"] = original_df[[f"{col}_score" for col in WEIGHTS.keys()]].sum(axis=1)

conditions = [
    (original_df["Total_Score"] == 45),                                     
    (original_df["Total_Score"] >= 38) & (original_df["Total_Score"] < 45), 
    (original_df["Total_Score"] >= 28) & (original_df["Total_Score"] < 38), 
    (original_df["Total_Score"] >= 18) & (original_df["Total_Score"] < 28), 
    (original_df["Total_Score"] >= 9)  & (original_df["Total_Score"] < 18), 
    (original_df["Total_Score"] >= 1)  & (original_df["Total_Score"] < 9),  
    (original_df["Total_Score"] == 0)                                       
]

choices = ["Perfect", "High", "Average", "Below Average", "Low", "Bad", "None"]

original_df["FinLit_Level"] = np.select(conditions, choices, default="No Response Given")



In [6]:

print("Pre-mapping", original_df.isna().sum().sum())

for col, mapping in LABELS_BY_COLUMN.items():
    if col in original_df.columns and mapping is not None:
        original_df[col] = original_df[col].map(mapping)

# print("Post-mapping", cleaned_df.isna().sum().sum())

original_df.fillna("No Response Given", inplace=True)

# print("Post-fillna", cleaned_df.isna().sum().sum())


Pre-mapping 1202696


  original_df.fillna("No Response Given", inplace=True)


In [7]:


original_df.to_csv("2009 Cleaned.csv", index=False)
# Define new lists with human-readable names
predictors = list(predictor_rename_dict.values())
financial_lit_cols = list(finlit_rename_dict.values())

print("Post-cleaning", original_df.isna().sum().sum())

Post-cleaning 0
