In [31]:
#How does financial literacy vary across age groups and genders in the 2024 NFCS data?
#How does financial literacy vary across regions and divisions in the 2024 NFCS data?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# === 1) Load original dataset ===
# cleaned_df = pd.read_csv("2015 Cleaned.csv", na_values=["", " ", "NA", "N/A", "null", ".", "na"])
original_df = pd.read_csv("NFCS 2015 Investor Data 161108.csv", na_values=["", " ", "NA", "N/A", "null", ".", "na"])

# ----------------------
# Shared Scales
# ----------------------
YES_NO = {1: "Yes", 2: "No", 98: "Don't know", 99: "Prefer not to say"}
YES_NO_SELECTED = {0: "NOT SELECTED", 1: "Selected"}
NEVER_SOMETIMES_FREQ = {1: "Never", 2: "Sometimes", 3: "Frequently", 98: "Don't know", 99: "Prefer not to say"}
AGREE_1_TO_7 = {1: "1 - Strongly disagree", 2: "2", 3: "3", 4: "4 - Neither agree nor disagree",
                 5: "5", 6: "6", 7: "7 - Strongly agree", 98: "Don't know", 99: "Prefer not to say"}
CONFIDENCE_1_TO_10 = {1: "1 - Not at all confident", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7",
                       8: "8", 9: "9", 10: "10 - Extremely confident", 98: "Don't know", 99: "Prefer not to say"}
IMPORTANCE_1_TO_10 = {1: "1 - Not at all important", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7",
                       8: "8", 9: "9", 10: "10 - Extremely important", 98: "Don't know", 99: "Prefer not to say"}
CLEARNESS_1_TO_10 = {1: "1 - Not at all clear", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7",
                      8: "8", 9: "9", 10: "10 - Extremely clear", 98: "Don't know", 99: "Prefer not to say"}
RELY_NOT_AT_ALL_SOME_A_LOT = {1: "Not at all", 2: "Somewhat", 3: "A great deal", 98: "Don't know", 99: "Prefer not to say"}


# ----------------------
# Variable-Specific Scales
# ----------------------

A1 = {1: "Primary decision-maker", 2: "Share decision-making",
      3: "Do not participate in decisions", 98: "Don't know", 99: "Prefer not to say"}
A2 = YES_NO.copy()
A3 = YES_NO.copy()

B2_OWNERSHIP = YES_NO.copy()
B3 = {1: "None", 2: "1 to 3 times", 3: "4 to 10 times", 4: "11 times or more", 98: "Don't know", 99: "Prefer not to say"}
B4 = {1: "< $2,000", 2: "$2,000 to < $5,000", 3: "$5,000 to < $10,000", 4: "$10,000 to < $25,000",
      5: "$25,000 to < $50,000", 6: "$50,000 to < $100,000", 7: "$100,000 to < $250,000",
      8: "$250,000 to < $500,000", 9: "$500,000 to < $1,000,000", 10: "$1,000,000 or more",
      98: "Don't know", 99: "Prefer not to say"}
B5 = YES_NO.copy()
B6 = YES_NO.copy()
B8 = YES_NO.copy()
B10 = {1: "Take substantial risks for substantial returns", 2: "Above-average risks for above-average returns",
       3: "Average risks for average returns", 4: "Not willing to take any risks",
       98: "Don't know", 99: "Prefer not to say"}
B11 = {1: "More than half", 2: "Less than half", 3: "None", 98: "Don't know", 99: "Prefer not to say"}

C1 = {1: "Make all decisions myself", 2: "Some with professional help",
      3: "Professional makes all decisions", 98: "Don't know", 99: "Prefer not to say"}
C2 = YES_NO.copy()
C3 = {1: "None", 2: "1 to 3 times", 3: "4 to 10 times", 4: "11 times or more", 98: "Don't know", 99: "Prefer not to say"}
C4 = YES_NO.copy()
C5 = {1: "Not at all important", 2: "Somewhat important", 3: "Very important",
      98: "Don't know", 99: "Prefer not to say"}
C6 = C5.copy()
C7 = YES_NO.copy()
C8 = CLEARNESS_1_TO_10.copy()
C9 = YES_NO.copy()
C10 = RELY_NOT_AT_ALL_SOME_A_LOT.copy()
C11 = YES_NO.copy()
C12 = {1: "Do not read", 2: "Skim", 3: "Read entire", 98: "Don't know", 99: "Prefer not to say"}
C13 = IMPORTANCE_1_TO_10.copy()
C14 = CLEARNESS_1_TO_10.copy()
C15 = AGREE_1_TO_7.copy()
C16 = YES_NO.copy()

D1 = CONFIDENCE_1_TO_10.copy()
D2 = {1: "<0% (negative)", 2: "0–4.9%", 3: "5–9.9%", 4: "10–14.9%", 5: "15–19.9%", 6: "20%+",
      98: "Don't know", 99: "Prefer not to say"}
D3 = {1: "Worse than market", 2: "Same as market", 3: "Better than market", 98: "Don't know", 99: "Prefer not to say"}
D4 = AGREE_1_TO_7.copy()

E1 = CONFIDENCE_1_TO_10.copy()
E2 = YES_NO.copy()
E3 = {1: "Do not read disclosure", 2: "Skim disclosure", 3: "Read entire disclosure",
      98: "Don't know", 99: "Prefer not to say"}
E4 = {1: "Very valuable", 2: "Somewhat valuable", 3: "Not at all valuable", 98: "Don't know", 99: "Prefer not to say"}
E5 = {1: "Protecting investors", 2: "Protecting institutions", 3: "Both", 4: "Neither",
      98: "Don't know", 99: "Prefer not to say"}
E6 = {1: "In-person meeting", 2: "Paper by mail", 3: "Electronic by email", 4: "Online portal",
      5: "None of the above", 98: "Don't know", 99: "Prefer not to say"}

F1 = YES_NO.copy()
F2 = YES_NO.copy()
F3 = YES_NO.copy()
F4 = YES_NO.copy()
F5 = YES_NO.copy()

G1 = CONFIDENCE_1_TO_10.copy()
G2 = {1: "1 - Very low", 2: "2", 3: "3", 4: "4", 5: "5", 6: "6", 7: "7 - Very high", 98: "Don't know", 99: "Prefer not to say"}
G4 = {1: "Own a part of the company", 2: "Lent money to the company", 3: "Liable for debts",
      4: "Company returns investment + interest", 98: "Don't know", 99: "Prefer not to say"}
G5 = {1: "Own a part of the company", 2: "Lent money to the company", 3: "Liable for debts",
      4: "Can vote on resolutions", 98: "Don't know", 99: "Prefer not to say"}
G6 = {1: "Preferred stock", 2: "Common stock", 3: "Bonds", 98: "Don't know", 99: "Prefer not to say"}
G7 = {1: "True", 2: "False", 98: "Don't know", 99: "Prefer not to say"}
G8 = {1: "Stocks", 2: "Bonds", 3: "CDs", 4: "Money market accounts", 5: "Precious metals", 98: "Don't know", 99: "Prefer not to say"}
G9 = {1: "-10%", 2: "-5%", 3: "+5%", 4: "+10%", 5: "+15%", 6: "+20%", 98: "Don't know", 99: "Prefer not to say"}
G10 = {1: "Nominal = not inflation-adjusted; Real = inflation-adjusted",
       2: "Nominal = expected; Real = actual",
       3: "Nominal = pre-tax; Real = after-tax",
       4: "Nominal = pre-fee; Real = post-fee", 98: "Don't know", 99: "Prefer not to say"}
G11 = {1: "Municipal bonds lower risk", 2: "Greater demand", 3: "Tax-free", 98: "Don't know", 99: "Prefer not to say"}
G12 = {1: "$500", 2: "$250", 3: "$0", 98: "Don't know", 99: "Prefer not to say"}
G13 = {1: "Selling soon after buying", 2: "Selling before peak", 3: "Selling at a loss",
       4: "Selling borrowed shares", 98: "Don't know", 99: "Prefer not to say"}

H2 = YES_NO.copy()
H3 = YES_NO.copy()


# ----------------------
# Master Mapping
# ----------------------

LABELS_BY_COLUMN = {
    # A section
    "A1": A1, "A2": A2, "A3": A3,

    # B section
    "B2_1": B2_OWNERSHIP, "B2_2": B2_OWNERSHIP, "B2_3": B2_OWNERSHIP,
    "B2_4": B2_OWNERSHIP, "B2_5": B2_OWNERSHIP, "B2_6": B2_OWNERSHIP,
    "B2_7": B2_OWNERSHIP, "B2_8": B2_OWNERSHIP,
    "B3": B3, "B4": B4, "B5": B5, "B6": B6, "B8": B8, "B10": B10, "B11": B11,

    # C section
    "C1": C1, "C2": C2, "C3": C3, "C4": C4,
    "C5_1": C5, "C5_2": C5, "C5_3": C5, "C5_4": C5, "C5_5": C5,
    "C6": C6, "C7": C7, "C8": C8, "C9": C9,
    "C10_1": C10, "C10_2": C10, "C10_3": C10,
    "C11": C11, "C12": C12, "C13": C13, "C14": C14, "C15": C15,
    "C16_1": C16, "C16_2": C16, "C16_3": C16,

    # D section
    "D1_1": D1, "D1_2": D1, "D2": D2, "D3": D3, "D4": D4,

    # E section
    "E1_1": E1, "E1_2": E1, "E1_3": E1, "E2": E2, "E3": E3,
    "E4": E4, "E5": E5, "E6": E6,

    # F section
    "F1_1": F1, "F1_2": F1, "F1_3": F1, "F1_4": F1, "F1_5": F1, "F1_6": F1, "F1_7": F1, "F1_8": F1, "F1_9": F1,
    "F2_1": F2, "F2_2": F2, "F2_3": F2, "F2_4": F2, "F2_5": F2, "F2_6": F2, "F2_7": F2,
    "F3_1": F3, "F3_2": F3, "F3_3": F3, "F3_4": F3, "F3_5": F3, "F3_6": F3, "F3_7": F3, "F3_8": F3,
    "F4": F4, "F5": F5,

    # G section
    "G1": G1, "G2": G2, "G4": G4, "G5": G5, "G6": G6, "G7": G7, "G8": G8,
    "G9": G9, "G10": G10, "G11": G11, "G12": G12, "G13": G13,

    # H section
    "H2": H2, "H3": H3
}


# Financial Literacy

### Financial Literacy is defined by the following columns
"J5": "Emergency Funds",
"J6": "College Savings",
"J20": "Emergency Confidence",
"J32": "Credit Record",
"B1": "Checking Account",
"B2": "Savings Account",
"B4": "Overdraw",
"C1_2012": "Retirement Plan",
"C5_2012": "Other Retirement Accounts",
"B14A_1": "Other Investments",
"E15_2015": "Mortgage Payment",
"F1": "Credit Card",
"G20": "Past Due Payments",
"G35": "Late Student Loan",
"G38": "Debt Agency Contact",
"M1_1": "Financial Confidence",
"M4": "Financial Knowledge"

### To score a person on financial literacy we will be scoring them weighting their scores based on the format of the questions 
(To discuss with group)
Current Scheme:
J5: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
J6: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
J20: Certain could come up with full $2,000 (5), Probably could (4), Probably could not (3), certain could not (2), Don't know (0), Prefer not to say (0), No Response Given (0)
J32: Very Good (5), Good (4), About Average (3), Bad (2), Very Bad (1), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B1: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B2: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B4: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
C1_2012: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
C5_2012: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
B14A_1: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
E15_2015: Never (?), Once (?), More than once (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
F1: 2 to 3 (?), 3 to 4 (?), 1 (?), 4 to 8 (?), 9 to 12 (?), 13 to 20 (?), More than 20, No credit cards (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
G20: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0) 
G35: Never, payments not due (?), Never (?), repaying on time each month (?), Once (?), More than once (?), Don't Know (0), Prefer Not to Say (0), No Response Given (0)
G38: Yes (3), No (2), Don't Know (0), Prefer Not to Say (0), No Response Given (0)


In [32]:
# ======================================================
# NFCS 2015 Investor Survey – Financial Literacy Weights
# ======================================================

# --- Shared utility weight template ---
BASE_W = {"No Response Given": 0, 98: 0, 99: 0}

# -------------------------------
#  Bucket 1: Core Access & Assets
# -------------------------------
# These parallel your 2021 "Core Access & Assets" block

# Checking account (F1_1)
F1_1_w = {1: 3, 2: 1, **BASE_W}   # Yes=3, No=1

# Savings account (F1_2)
F1_2_w = {1: 3, 2: 1, **BASE_W}

# Retirement plan (C2: yes/no)
C2_w = {1: 3, 2: 1, **BASE_W}

# Contribute to retirement (C3: frequency)
C3_w = {1: 0, 2: 1, 3: 2, 4: 3, **BASE_W}  # More contributions = higher weight

# Other investment ownership (B2_1 Individual stocks; B2_3 Mutual funds; B2_4 Bonds)
B2_1_w = {1: 3, 2: 1, **BASE_W}
B2_3_w = {1: 3, 2: 1, **BASE_W}
B2_4_w = {1: 3, 2: 1, **BASE_W}

# -------------------------------
#  Bucket 2: Investment Knowledge
# -------------------------------
# Gauges conceptual understanding and objective literacy items

# Self-rated knowledge (G2)
G2_w = {7: 5, 6: 4, 5: 3, 4: 2, 3: 1, 2: 0, 1: 0, **BASE_W}

# Investment confidence (G1)
G1_w = {10: 5, 9: 4, 8: 3, 7: 2, 6: 1, 5: 0, 4: 0, 3: 0, 2: 0, 1: 0, **BASE_W}

# Objective knowledge questions (G4–G13)
# Each question right (coded 1 in Investor file) = full points
KNOWLEDGE_CORRECT = {1: 3, 2: 0, 3: 0, 4: 0, **BASE_W}
G4_w = KNOWLEDGE_CORRECT.copy()
G5_w = KNOWLEDGE_CORRECT.copy()
G6_w = KNOWLEDGE_CORRECT.copy()
G7_w = {1: 3, 2: 0, **BASE_W}          # True/False
G8_w = {1: 3, 2: 3, 3: 0, 4: 0, 5: 0, **BASE_W}  # Bonds/stocks choice
G9_w = {4: 3, 5: 2, 6: 1, **BASE_W}    # Return expectation question
G10_w = {3: 3, **BASE_W}               # Real vs nominal return (correct=3)
G11_w = {3: 3, **BASE_W}               # Municipal bonds tax-free (correct=3)
G12_w = {1: 3, **BASE_W}               # Math question: correct=$500
G13_w = {4: 3, **BASE_W}               # Short selling definition (correct=4)

# -------------------------------
#  Bucket 3: Financial Behavior
# -------------------------------
# Measures habits such as risk tolerance, diversification, planning

# Risk tolerance (B10)
B10_w = {1: 5, 2: 4, 3: 3, 4: 1, **BASE_W}

# Decision-making autonomy (C1)
C1_w = {1: 3, 2: 2, 3: 1, **BASE_W}

# Reading statements/disclosures (C12, E3)
C12_w = {3: 3, 2: 2, 1: 1, **BASE_W}
E3_w = {3: 3, 2: 2, 1: 1, **BASE_W}

# Regulatory awareness (E5)
E5_w = {1: 3, 2: 1, 3: 3, 4: 0, **BASE_W}

# -------------------------------
#  Bucket 4: Confidence & Planning
# -------------------------------
# Uses perception and preparedness indicators

# Confidence in understanding investments (D1_1)
D1_1_w = {10: 5, 9: 4, 8: 3, 7: 2, 6: 1, 5: 0, **BASE_W}

# Confidence comparing returns (D1_2)
D1_2_w = D1_1_w.copy()

# Confidence in disclosures (E1_1)
E1_1_w = D1_1_w.copy()

# Confidence avoiding fraud (E1_2)
E1_2_w = D1_1_w.copy()

# Satisfaction with information clarity (C14)
C14_w = {10: 5, 9: 4, 8: 3, 7: 2, 6: 1, 5: 0, **BASE_W}

# -------------------------------
# Master Dictionary
# -------------------------------
WEIGHTS = {
    "Checking Account": F1_1_w,
    "Savings Account": F1_2_w,
    "Retirement Plan": C2_w,
    "Retirement Contributions": C3_w,
    "Individual Stocks": B2_1_w,
    "Mutual Funds": B2_3_w,
    "Bonds": B2_4_w,

    # Investment Knowledge
    "Self-Rated Knowledge": G2_w,
    "Investment Confidence": G1_w,
    "Objective Knowledge 1 (Company Ownership)": G4_w,
    "Objective Knowledge 2 (Shareholder Rights)": G5_w,
    "Objective Knowledge 3 (Security Type)": G6_w,
    "Objective Knowledge 4 (True/False)": G7_w,
    "Objective Knowledge 5 (Diversification)": G8_w,
    "Objective Knowledge 6 (Expected Return)": G9_w,
    "Objective Knowledge 7 (Real vs Nominal)": G10_w,
    "Objective Knowledge 8 (Municipal Bonds)": G11_w,
    "Objective Knowledge 9 (Math Problem)": G12_w,
    "Objective Knowledge 10 (Short Selling)": G13_w,

    # Financial Behavior
    "Risk Tolerance": B10_w,
    "Decision-Making Autonomy": C1_w,
    "Reads Statements": C12_w,
    "Reads Disclosures": E3_w,
    "Regulatory Awareness": E5_w,

    # Confidence & Planning
    "Confidence in Investing": D1_1_w,
    "Confidence Comparing Returns": D1_2_w,
    "Confidence Reading Disclosures": E1_1_w,
    "Confidence Avoiding Fraud": E1_2_w,
    "Clarity Satisfaction": C14_w
}


In [33]:
# === 3) Sorting & Renaming Variables (2015 Investor Dataset) ===

# ---------------------------------
# Predictor / Demographic Variables
# ---------------------------------
predictor_rename_dict = {
    "A4": "Education",              # 2015 equivalent to A5_2015
    "A6": "Dependent Children",
    "A7": "Annual Income",
    "A40": "Gender",
    "A9": "Age Group",
    "A41": "Gender&Age"
}

# ---------------------------------
# Financial Literacy & Behavior Variables
# ---------------------------------
finlit_rename_dict = {
    # --- Core Access & Assets ---
    "F1_1": "Checking Account",
    "F1_2": "Savings Account",
    "C2": "Retirement Plan",
    "C3": "Retirement Contributions",
    "B2_1": "Individual Stocks",
    "B2_3": "Mutual Funds",
    "B2_4": "Bonds",

    # --- Investment Knowledge ---
    "G1": "Investment Confidence",
    "G2": "Self-Rated Knowledge",
    "G4": "Objective Knowledge 1 (Company Ownership)",
    "G5": "Objective Knowledge 2 (Shareholder Rights)",
    "G6": "Objective Knowledge 3 (Security Type)",
    "G7": "Objective Knowledge 4 (True/False)",
    "G8": "Objective Knowledge 5 (Diversification)",
    "G9": "Objective Knowledge 6 (Expected Return)",
    "G10": "Objective Knowledge 7 (Real vs Nominal)",
    "G11": "Objective Knowledge 8 (Municipal Bonds)",
    "G12": "Objective Knowledge 9 (Math Problem)",
    "G13": "Objective Knowledge 10 (Short Selling)",

    # --- Financial Behavior ---
    "B10": "Risk Tolerance",
    "C1": "Decision-Making Autonomy",
    "C12": "Reads Statements",
    "E3": "Reads Disclosures",
    "E5": "Regulatory Awareness",

    # --- Confidence & Planning ---
    "D1_1": "Confidence in Investing",
    "D1_2": "Confidence Comparing Returns",
    "E1_1": "Confidence Reading Disclosures",
    "E1_2": "Confidence Avoiding Fraud",
    "C14": "Clarity Satisfaction"
}

# ---------------------------------
# Combine both dictionaries
# ---------------------------------
rename_dict = {**predictor_rename_dict, **finlit_rename_dict}

# ---------------------------------
# Apply renaming
# ---------------------------------
original_df = original_df.rename(columns=rename_dict)


In [34]:
# Defining Financial Literacy
financial_literacy = original_df.copy()
print(financial_literacy.head(10).to_string)

for col, weight_dict in WEIGHTS.items():
    # financial_literacy[f"{col}_score"] = financial_literacy[col].map(weight_dict)
    original_df[f"{col}_score"] = original_df[col].map(weight_dict)



<bound method DataFrame.to_string of        NFCSID  A1  A2  A3  Individual Stocks  B2_2  Mutual Funds  Bonds  B2_5  \
0  2015010006   1   1   1                  1     1             1      1     2   
1  2015010014   1   1   1                  1     1             1      1     1   
2  2015010023   2   1   1                  1     1             2      1     2   
3  2015010033   2   1   1                  1     1             1      2     1   
4  2015010034   1   1   1                  1     1             2      1     2   
5  2015010040   1   1   1                  2     2             1      2     2   
6  2015010041   2   1   1                  1     2             2      2     2   
7  2015010044   1   1   1                  1     2             2      2     2   
8  2015010046   1   1   1                  1     1             1      2     2   
9  2015010054   1   1   1                  2     2             1      2     2   

   B2_6  ...  G14_98  G14_99  H2  H3      WGT1  S_Gender  S_Age  S_Ethn

In [35]:
original_df["Total_Score"] = original_df[[f"{col}_score" for col in WEIGHTS.keys()]].sum(axis=1)

conditions = [
    (original_df["Total_Score"] >= 100),
    (original_df["Total_Score"] >= 85) & (original_df["Total_Score"] < 100),
    (original_df["Total_Score"] >= 60) & (original_df["Total_Score"] < 85),
    (original_df["Total_Score"] >= 40) & (original_df["Total_Score"] < 60),
    (original_df["Total_Score"] >= 20) & (original_df["Total_Score"] < 40),
    (original_df["Total_Score"] >= 1) & (original_df["Total_Score"] < 20),
    (original_df["Total_Score"] == 0)
]

choices = ["Perfect", "High", "Average", "Below Average", "Low", "Bad", "None"]

original_df["FinLit_Level"] = np.select(conditions, choices, default="No Response Given")



In [36]:

print("Pre-mapping", original_df.isna().sum().sum())

for col, mapping in LABELS_BY_COLUMN.items():
    if col in original_df.columns and isinstance(mapping, dict):
        original_df[col] = pd.Series.map(original_df[col], mapping)

# print("Post-mapping", cleaned_df.isna().sum().sum())

original_df.fillna("No Response Given", inplace=True)

# print("Post-fillna", cleaned_df.isna().sum().sum())


Pre-mapping 18174


  original_df.fillna("No Response Given", inplace=True)


In [37]:


original_df.to_csv("2015 Cleaned.csv", index=False)
# Define new lists with human-readable names
predictors = list(predictor_rename_dict.values())
financial_lit_cols = list(finlit_rename_dict.values())

print("Post-cleaning", original_df.isna().sum().sum())

Post-cleaning 0
