In [None]:
#How does financial literacy vary across age groups and genders in the 2024 NFCS data?
#How does financial literacy vary across regions and divisions in the 2024 NFCS data?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# === 1) Load original dataset ===
cleaned_df = pd.read_csv("2021 Cleaned.csv", na_values=["", " ", "NA", "N/A", "null", ".", "na"])
original_df = pd.read_csv("NFCS 2021 Investor Data 221121.csv", na_values=["", " ", "NA", "N/A", "null", ".", "na"])

# ----------------------
# Reusable value scales
# ----------------------
YES_NO = {1: "Yes", 2: "No", 98: "Don't know", 99: "Prefer not to say"}
YES_NO_SELECTED = {0: "NOT SELECTED", 1: "Selected"}
NEVER_SOMETIMES_FREQ = {1: "Never", 2: "Sometimes", 3: "Frequently", 98: "Don't know", 99: "Prefer not to say"}
AGREE_1_TO_7 = {1: "1 - Strongly disagree", 2: "2", 3: "3", 4: "4 - Neither agree nor disagree", 5: "5", 6: "6", 7: "7 - Strongly agree", 98: "Don't know", 99: "Prefer not to say"}
CONFIDENCE_1_TO_10 = {1: "1 - Not at all confident", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7", 8: "8", 9: "9", 10: "10 - Extremely confident", 98: "Don't know", 99: "Prefer not to say"}
IMPORTANCE_1_TO_10 = {1: "1 - Not at all important", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7", 8: "8", 9: "9", 10: "10 - Extremely important", 98: "Don't know", 99: "Prefer not to say"}
RELY_NOT_AT_ALL_SOME_A_LOT = {1: "Not at all", 2: "Somewhat", 3: "A great deal", 98: "Don't know", 99: "Prefer not to say"}

# ----------------------
# Variable-specific scales
# ----------------------
A1 = {1: "Primary decision-maker", 2: "Share decision-making", 3: "Do not participate in decisions", 98: "Don't know", 99: "Prefer not to say"}
A2 = YES_NO.copy()
A3 = YES_NO.copy()

B2_OWNERSHIP = YES_NO.copy()  # B2_1 .. individual items below

B30 = {1: "Less than a year ago", 2: "1 to <2 years ago", 3: "2 to <5 years ago", 4: "5 to <10 years ago", 5: "10 years ago or more", 98: "Don't know", 99: "Prefer not to say"}
B31 = {1: "2018 or earlier", 2: "2019", 3: "2020", 4: "2021", 98: "Don't know", 99: "Prefer not to say"}
B3 = {1: "None", 2: "1 to 3 times", 3: "4 to 10 times", 4: "11 times or more", 98: "Don't know", 99: "Prefer not to say"}
B32 = YES_NO.copy()
B4 = {1: "< $2,000", 2: "$2,000 to < $5,000", 3: "$5,000 to < $10,000", 4: "$10,000 to < $25,000", 5: "$25,000 to < $50,000", 6: "$50,000 to < $100,000", 7: "$100,000 to < $250,000", 8: "$250,000 to < $500,000", 9: "$500,000 to < $1,000,000", 10: "$1,000,000 or more", 98: "Don't know", 99: "Prefer not to say"}
B5 = YES_NO.copy()
B6 = YES_NO.copy()
B20 = YES_NO.copy()
B33 = YES_NO.copy()
B34 = YES_NO.copy()
B10 = {1: "Take substantial risks for substantial returns", 2: "Above-average risks for above-average returns", 3: "Average risks for average returns", 4: "Not willing to take any risks", 98: "Don't know", 99: "Prefer not to say"}
B11 = {1: "More than half", 2: "Less than half", 3: "None", 98: "Don't know", 99: "Prefer not to say"}
B35 = YES_NO.copy()
B23 = YES_NO.copy()
B24 = {1: "Not at all risky", 2: "Slightly risky", 3: "Moderately risky", 4: "Very risky", 5: "Extremely risky", 98: "Don't know", 99: "Prefer not to say"}
B25 = YES_NO.copy()
B26 = YES_NO.copy()

C22 = NEVER_SOMETIMES_FREQ.copy()  # 1..4 items share this scale
C23 = YES_NO.copy()  # 1..4 items share this scale
C24 = {1: "Do not pay any fees", 2: "< 0.5%", 3: "0.5% to < 1%", 4: "1% to < 2%", 5: "2% to < 4%", 6: "4% or more", 98: "Don't know", 99: "Prefer not to say"}
C25 = CONFIDENCE_1_TO_10.copy()
C26 = YES_NO.copy()
C30 = YES_NO.copy()
C7 = YES_NO.copy()

D1 = CONFIDENCE_1_TO_10.copy()  # 1..2 items share this scale
D2 = {1: "< 0% (negative)", 2: "0% to 4.9%", 3: "5% to 9.9%", 4: "10% to 14.9%", 5: "15% to 19.9%", 6: "20% or more", 98: "Don't know", 99: "Prefer not to say"}
D3 = {1: "Worse than the market", 2: "About the same as the market", 3: "Better than the market", 98: "Don't know", 99: "Prefer not to say"}
D21 = {1: "Buy stocks or stock funds", 2: "Sell stocks or stock funds", 3: "Neither", 98: "Don't know", 99: "Prefer not to say"}
D30 = D21.copy()
D31 = AGREE_1_TO_7.copy()

E1_1 = CONFIDENCE_1_TO_10.copy()
E20 = YES_NO.copy()
E5 = {1: "Protecting investors", 2: "Protecting institutions", 3: "Both", 4: "Neither", 98: "Don't know", 99: "Prefer not to say"}
E6 = {1: "In-person meeting with broker/advisor", 2: "Paper documents by mail", 3: "Electronic by email", 4: "Documents accessed on the Internet", 5: "None of the above", 98: "Don't know", 99: "Prefer not to say"}

F30 = RELY_NOT_AT_ALL_SOME_A_LOT.copy()  # 1..12 items share this scale
F31 = YES_NO.copy()  # 1..11 items share this scale

G1 = CONFIDENCE_1_TO_10.copy()
G2 = {1: "1 - Very low", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7 - Very high", 98: "Don't know", 99: "Prefer not to say"}
G30 = {1: "Does not describe at all", 2: "Describes somewhat", 3: "Describes very well", 98: "Don't know", 99: "Prefer not to say"}  # 1..6 items
G31 = IMPORTANCE_1_TO_10.copy()
G4 = {1: "You own a part of the company", 2: "You have lent money to the company", 3: "You are liable for the company's debts", 4: "The company will return your original investment with interest", 98: "Don't know", 99: "Prefer not to say"}
G5 = {1: "You own a part of the company", 2: "You have lent money to the company", 3: "You are liable for the company's debts", 4: "You can vote on shareholder resolutions", 98: "Don't know", 99: "Prefer not to say"}
G6 = {1: "Preferred stock", 2: "Common stock", 3: "Bonds", 98: "Don't know", 99: "Prefer not to say"}
G7 = {1: "True", 2: "False", 98: "Don't know", 99: "Prefer not to say"}
G21 = G7.copy()
G8 = {1: "Stocks", 2: "Bonds", 3: "CDs", 4: "Money market accounts", 5: "Precious metals", 98: "Don't know", 99: "Prefer not to say"}
G22 = {1: "Lower fees and expenses", 2: "Generally less risky in the short term", 3: "Generally less likely to decline in value", 98: "Don't know", 99: "Prefer not to say"}
G11 = {1: "Municipal bonds are lower risk", 2: "Greater demand for municipal bonds", 3: "Municipal bonds can be tax-free", 98: "Don't know", 99: "Prefer not to say"}
G12 = {1: "$500", 2: "$250", 3: "$0", 98: "Don't know", 99: "Prefer not to say"}
G13 = {1: "Selling shares shortly after buying", 2: "Selling before peak", 3: "Selling at a loss", 4: "Selling borrowed shares", 98: "Don't know", 99: "Prefer not to say"}
G23 = {1: "$10", 2: "$0", 3: "-$10", 98: "Don't know", 99: "Prefer not to say"}

H31 = YES_NO.copy()

# Weights & IDs (no categorical labels)
WGT1 = None
NFCSID = None

# State-by-State demographic carry-overs
S_Gender2 = {1: "Male", 2: "Female"}
S_Age = {1: "18-34", 2: "35-54", 3: "55+"}
S_Ethnicity = {1: "White non-Hispanic", 2: "Non-White"}
S_Education = {1: "Some college or less (incl. Associate's)", 2: "College grad (Bachelor's) or more"}
S_Income = {1: "<$50K", 2: "$50-$100K", 3: "$100K+"}

# ----------------------
# Column → labels mapping
# ----------------------
LABELS_BY_COLUMN = {
    # IDs / weights
    "NFCSID": NFCSID,
    "WGT1": WGT1,

    # A — Household decision making & ownership
    "A1": A1,
    "A2": A2,
    "A3": A3,

    # B2 ownership by instrument (non-retirement)
    "B2_1": B2_OWNERSHIP,  # Individual stocks
    "B2_2": B2_OWNERSHIP,  # Individual bonds
    "B2_3": B2_OWNERSHIP,  # Mutual funds
    "B2_4": B2_OWNERSHIP,  # ETFs
    "B2_5": B2_OWNERSHIP,  # Annuities
    "B2_7": B2_OWNERSHIP,  # Commodities/futures
    "B2_20": B2_OWNERSHIP, # Whole life insurance
    "B2_21": B2_OWNERSHIP, # REITs
    "B2_23": B2_OWNERSHIP, # Microcap/penny stocks
    "B2_24": B2_OWNERSHIP, # Structured notes
    "B2_25": B2_OWNERSHIP, # Private placements

    # B — Investing timeline / activity / portfolio
    "B30": B30,
    "B31": B31,
    "B3": B3,
    "B32": B32,
    "B4": B4,
    "B5": B5,   # Margin allowed
    "B6": B6,   # Purchased on margin
    "B20": B20, # Margin call ever
    "B33": B33, # Options allowed
    "B34": B34, # Ever traded options
    "B10": B10, # Risk tolerance
    "B11": B11, # Stock share of portfolio
    "B35": B35, # Meme stocks (2021)

    # Crypto awareness / views / behavior
    "B23": B23,
    "B24": B24,
    "B25": B25,
    "B26": B26,

    # C — Channels, fees, disclosures
    "C22_1": C22, "C22_2": C22, "C22_3": C22, "C22_4": C22,
    "C23_1": C23, "C23_2": C23, "C23_3": C23, "C23_4": C23,
    "C24": C24,
    "C25": C25,
    "C26": C26,
    "C30": C30,
    "C7": C7,

    # D — Expectations, reactions, fraud worry
    "D1_1": D1, "D1_2": D1,
    "D2": D2,
    "D3": D3,
    "D21": D21,
    "D30": D30,
    "D31": D31,

    # E — Regulation & disclosures
    "E1_1": E1_1,
    "E20": E20,
    "E5": E5,
    "E6": E6,

    # F — Info sources & platforms
    "F30_1": F30, "F30_2": F30, "F30_3": F30, "F30_4": F30, "F30_5": F30, "F30_6": F30,
    "F30_7": F30, "F30_8": F30, "F30_9": F30, "F30_10": F30, "F30_11": F30, "F30_12": F30,
    "F31_1": F31, "F31_2": F31, "F31_3": F31, "F31_4": F31, "F31_5": F31, "F31_6": F31,
    "F31_7": F31, "F31_8": F31, "F31_9": F31, "F31_10": F31, "F31_11": F31,

    # G — Comfort, knowledge, motivations, literacy items
    "G1": G1,
    "G2": G2,
    "G30_1": G30, "G30_2": G30, "G30_3": G30, "G30_4": G30, "G30_5": G30, "G30_6": G30,
    "G31": G31,
    "G4": G4, "G5": G5, "G6": G6, "G7": G7, "G21": G21, "G8": G8, "G22": G22,
    "G11": G11, "G12": G12, "G13": G13, "G23": G23,

    # H — Planning
    "H31": H31,

    # Carry-over demo vars
    "S_Gender2": S_Gender2,
    "S_Age": S_Age,
    "S_Ethnicity": S_Ethnicity,
    "S_Education": S_Education,
    "S_Income": S_Income,
}



# Financial Literacy

### Financial Literacy is defined by the following columns
"J5": "Emergency Funds",
"J6": "College Savings",
"J20": "Emergency Confidence",
"J32": "Credit Record",
"B1": "Checking Account",
"B2": "Savings Account",
"B4": "Overdraw",
"C1_2012": "Retirement Plan",
"C5_2012": "Other Retirement Accounts",
"B14A_1": "Other Investments",
"E15_2015": "Mortgage Payment",
"F1": "Credit Card",
"G20": "Past Due Payments",
"G35": "Late Student Loan",
"G38": "Debt Agency Contact",
"M1_1": "Financial Confidence",
"M4": "Financial Knowledge"

### To score a person on financial literacy we will be scoring them weighting their scores based on the format of the questions 
(To discuss with group)
Current Scheme:
J5: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
J6: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
J20: Certain could come up with full $2,000 (5), Probably could (4), Probably could not (3), certain could not (2), Don't know (0), Prefer not to say (0), No Response Given (0)
J32: Very Good (5), Good (4), About Average (3), Bad (2), Very Bad (1), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B1: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B2: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B4: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
C1_2012: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
C5_2012: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B14A_1: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
E15_2015: Never (?), Once (?), More than once (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
F1: 2 to 3 (?), 3 to 4 (?), 1 (?), 4 to 8 (?), 9 to 12 (?), 13 to 20 (?), More than 20, No credit cards (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
G20: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
G35: Never, payments not due (?), Never (?), repaying on time each month (?), Once (?), More than once (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
G38: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)


In [27]:
# ----- Bucket 1: Critical Stability (max 5 each) -----
C25_w = {10:5, 9:4, 8:4, 7:3, 6:3, 5:2, 4:2, 3:1, 2:1, 1:1, 98:0, 99:0, "No Response Given":0}
G1_w  = C25_w.copy()  # Overall investing confidence
G2_w  = {7:5, 6:4, 5:3, 4:2, 3:2, 2:1, 1:1, 98:0, 99:0, "No Response Given":0}
G30_w = {3:5, 2:3, 1:1, 98:0, 99:0, "No Response Given":0}  # Behavioral statements

# ----- Bucket 2: Core Access & Assets (max 3 each) -----
OWNERSHIP_w = {1:3, 2:1, 98:0, 99:0, "No Response Given":0}  # applies to B2_*

# ----- Bucket 3: Credit Health & Risk Behavior (max 3 each) -----
B10_w = {1:5, 2:4, 3:3, 4:1, 98:0, 99:0, "No Response Given":0}  # Risk tolerance
B11_w = {1:5, 2:3, 3:1, 98:0, 99:0, "No Response Given":0}       # % portfolio stocks
B24_w = {5:5, 4:4, 3:3, 2:2, 1:1, 98:0, 99:0, "No Response Given":0}  # Crypto risk perception

# ----- Bucket 4: Supplementary Planning (max 2 each) -----
F30_w = {3:2, 2:1, 1:0, 98:0, 99:0, "No Response Given":0}  # Reliance on info sources
F31_w = {1:2, 2:1, 98:0, 99:0, "No Response Given":0}        # Platform usage
H31_w = {1:2, 2:0, 98:0, 99:0, "No Response Given":0}        # Has a will

# ----- Combined dictionary -----
WEIGHTS = {
    # Critical Stability
    "Emergency Confidence": C25_w,
    "Investing Confidence": G1_w,
    "Financial Comfort": G2_w,
    "Stay Calm During Downturns": G30_w,
    "Avoid Panic Selling": G30_w,
    "Monitor Investments Regularly": G30_w,
    "Diversify Portfolio": G30_w,
    "Save Consistently": G30_w,
    "Long-Term Focus": G30_w,

    # Core Access & Assets
    "Own Stocks": OWNERSHIP_w,
    "Own Mutual Funds": OWNERSHIP_w,
    "Own ETFs": OWNERSHIP_w,
    "Own Annuities": OWNERSHIP_w,
    "Own REITs": OWNERSHIP_w,
    "Own Private Placements": OWNERSHIP_w,

    # Credit / Risk Behavior
    "Risk Tolerance": B10_w,
    "Stock Share of Portfolio": B11_w,
    "Crypto Risk Perception": B24_w,

    # Planning / Information
    "Financial Advisors": F30_w,
    "Friends Family": F30_w,
    "Online Articles": F30_w,
    "Financial TV": F30_w,
    "Podcasts": F30_w,
    "Social Media": F30_w,
    "Government Websites": F30_w,
    "Company Publications": F30_w,
    "Financial Newsletters": F30_w,
    "Employer Resources": F30_w,
    "Influencers": F30_w,
    "Other Info Sources": F30_w,

    "Use Brokerage Apps": F31_w,
    "Use Robo Advisors": F31_w,
    "Use Banking Apps": F31_w,
    "Use Social Platforms": F31_w,
    "Use Spreadsheets": F31_w,
    "Use Forums": F31_w,
    "Use Investment Clubs": F31_w,
    "Use Financial News Apps": F31_w,
    "Use Podcasts": F31_w,
    "Use YouTube": F31_w,
    "Use Other Tools": F31_w,

    "Estate Planning": H31_w,
}

In [28]:
# === 3) Sorting Variables ===

# Rename dictionary for predictors
predictor_rename_dict = {
    "S_Education": "Education",
    "S_Age": "Age Group",
    "S_Gender2": "Gender",
    "S_Income": "Annual Income",
    "S_Ethnicity": "Ethnicity",
}


# Rename dictionary for financial literacy outcomes
# === Financial literacy & stability outcomes ===
finlit_rename_dict = {
    # Critical Stability
    "C25": "Emergency Confidence",
    "G1": "Investing Confidence",
    "G2": "Financial Comfort",
    "G30_1": "Stay Calm During Downturns",
    "G30_2": "Avoid Panic Selling",
    "G30_3": "Monitor Investments Regularly",
    "G30_4": "Diversify Portfolio",
    "G30_5": "Save Consistently",
    "G30_6": "Long-Term Focus",

    # Core Access & Assets
    "B2_1": "Own Stocks",
    "B2_3": "Own Mutual Funds",
    "B2_4": "Own ETFs",
    "B2_5": "Own Annuities",
    "B2_21": "Own REITs",
    "B2_25": "Own Private Placements",

    # Credit / Risk Behavior
    "B10": "Risk Tolerance",
    "B11": "Stock Share of Portfolio",
    "B24": "Crypto Risk Perception",

    # Planning / Information
    "F30_1": "Financial Advisors",
    "F30_2": "Friends Family",
    "F30_3": "Online Articles",
    "F30_4": "Financial TV",
    "F30_5": "Podcasts",
    "F30_6": "Social Media",
    "F30_7": "Government Websites",
    "F30_8": "Company Publications",
    "F30_9": "Financial Newsletters",
    "F30_10": "Employer Resources",
    "F30_11": "Influencers",
    "F30_12": "Other Info Sources",
    "F31_1": "Use Brokerage Apps",
    "F31_2": "Use Robo Advisors",
    "F31_3": "Use Banking Apps",
    "F31_4": "Use Social Platforms",
    "F31_5": "Use Spreadsheets",
    "F31_6": "Use Forums",
    "F31_7": "Use Investment Clubs",
    "F31_8": "Use Financial News Apps",
    "F31_9": "Use Podcasts",
    "F31_10": "Use YouTube",
    "F31_11": "Use Other Tools",
    "H31": "Estate Planning",
}


# Combine both into one dictionary
rename_dict = {**predictor_rename_dict, **finlit_rename_dict}

# Rename columns in dataframe
original_df = original_df.rename(columns=rename_dict)

In [29]:
# Defining Financial Literacy
financial_literacy = original_df.copy()
print(financial_literacy.head(10).to_string)

for col, weight_dict in WEIGHTS.items():
    # financial_literacy[f"{col}_score"] = financial_literacy[col].map(weight_dict)
    original_df[f"{col}_score"] = original_df[col].map(weight_dict)



<bound method DataFrame.to_string of        NFCSID  A1  A2  A3  Own Stocks  B2_2  Own Mutual Funds  Own ETFs  \
0  2021010001   1   1   1           1     2                 1         2   
1  2021010028   1   1   1           1     2                 1         2   
2  2021010039   1   1   1           1     1                 1        98   
3  2021010045   2   2   1           2     2                98        98   
4  2021010048   1   1   1           1     2                 1         2   
5  2021010050   1   1   1           1     2                 2         1   
6  2021010085   1   1   1           1     1                 1        98   
7  2021010088   1   1   1           2     2                 1         2   
8  2021010090   1   1   1           1     2                 2         2   
9  2021010095   1   1   1           1     2                 2         2   

   Own Annuities  B2_7  ...  G12  G13  G23  Estate Planning      WGT1  Gender  \
0              2     2  ...    2    3   98              

In [30]:
# sum the *_score columns for the NEW weights dict
original_df["Total_Score"] = original_df[[f"{col}_score" for col in WEIGHTS.keys()]].sum(axis=1)

conditions = [
    (original_df["Total_Score"] >= 97),                                                  
    (original_df["Total_Score"] >= 81) & (original_df["Total_Score"] < 97),              
    (original_df["Total_Score"] >= 54) & (original_df["Total_Score"] < 81),              
    (original_df["Total_Score"] >= 36) & (original_df["Total_Score"] < 54),              
    (original_df["Total_Score"] >= 18) & (original_df["Total_Score"] < 36),              
    (original_df["Total_Score"] >= 1)  & (original_df["Total_Score"] < 18),              
    (original_df["Total_Score"] == 0)                                                    
]

choices = ["Perfect", "High", "Average", "Below Average", "Low", "Bad", "None"]

original_df["FinLit_Level"] = np.select(conditions, choices, default="No Response Given")


In [None]:

print("Pre-mapping", original_df.isna().sum().sum())

for col, mapping in LABELS_BY_COLUMN.items():
    if col in original_df.columns and mapping is not None:
        original_df[col] = original_df[col].map(mapping)

print("Post-mapping", cleaned_df.isna().sum().sum())

original_df.fillna("No Response Given", inplace=True)

print("Post-fillna", cleaned_df.isna().sum().sum())


Pre-mapping 9233


  original_df.fillna("No Response Given", inplace=True)


In [32]:


original_df.to_csv("2021 Cleaned.csv", index=False)
# Define new lists with human-readable names
predictors = list(predictor_rename_dict.values())
financial_lit_cols = list(finlit_rename_dict.values())

print("Post-cleaning", original_df.isna().sum().sum())

Post-cleaning 0
