In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# ==== Column to Keep ====
"""
filtered_columns = {
    "A3Ar_w": "Age Group",
    "A8_2021": "Annual Income",
    "CENSUSDIV": "Census Division",
    "CENSUSREG": "Census Region",
    "B1": "Checking Account",
    "J8": "Retirement Planning",
    "F1": "Credit Card",
    "F2_2": "Credit Card Balance Interest",
    "A5_2015": "Education",
    "J5": "Emergency Funds",
    "M1_1": "Financial Confidence",
    "M4": "Financial Knowledge",
    "A50A": "Gender",
    "A50B": "Gender&Age",
    "B14A_1": "Other Investments",
    "C5_2012": "Other Retirement Accounts",
    "B4": "Overdraw",
    "B2": "Savings Account",
    "STATEQ": "STATEQ",
    "H1": "Health Insurance",
    "M20": "Financial Education"
}

"""

'\nfiltered_columns = {\n    "A3Ar_w": "Age Group",\n    "A8_2021": "Annual Income",\n    "CENSUSDIV": "Census Division",\n    "CENSUSREG": "Census Region",\n    "B1": "Checking Account",\n    "J6": "College Savings",\n    "F1": "Credit Card",\n    "J32": "Credit Record",\n    "G38": "Debt Agency Contact",\n    "A5_2015": "Education",\n    "J20": "Emergency Confidence",\n    "J5": "Emergency Funds",\n    "M1_1": "Financial Confidence",\n    "M4": "Financial Knowledge",\n    "A50A": "Gender",\n    "A50B": "Gender&Age",\n    "G35": "Late Student Loan",\n    "E15_2015": "Mortgage Payment",\n    "B14A_1": "Other Investments",\n    "C5_2012": "Other Retirement Accounts",\n    "B4": "Overdraw",\n    "G20": "Past Due Payments",\n    "C1_2012": "Retirement Plan",\n    "B2": "Savings Account",\n    "STATEQ": "STATEQ"\n}\n'

In [None]:
# ========= Work on 2024 dataset =========

# Set relative path to the dataset on GitHub
df2024_path = 'https://raw.githubusercontent.com/Fasty8/ba780-fall25-a08/refs/heads/cleaning/FINRA%20National%20Financial%20Capability%20Study/2024-SxS-Data-and-Data-Info/NFCS%202024%20State%20Data%20250623.csv'

# Load 2024 dataset and replace missing values with NaN
df2024 = pd.read_csv(df2024_path, na_values=["", " ", "NA", "N/A", "null", ".", "na"])

In [15]:
df2024.head()

Unnamed: 0,NFCSID,STATEQ,CENSUSDIV,CENSUSREG,A50A,A3Ar_w,A50B,A4A_new_w,A5_2015,A6,...,M6,M7,M8,M31,M50,M9,M10,wgt_n2,wgt_d2,wgt_s3
0,2024010001,36,3,2,2,3,9,1,6,1,...,1,3,98,98,98,1,2,1.153548,1.123949,0.859644
1,2024010002,48,9,4,1,6,6,1,5,5,...,1,3,98,3,3,1,2,1.398688,0.86344,0.975078
2,2024010003,38,9,4,1,6,6,1,4,1,...,1,3,1,98,98,2,2,1.398688,0.472645,0.893974
3,2024010004,48,9,4,2,6,12,1,7,1,...,1,3,1,4,98,1,2,1.250293,0.614156,0.778748
4,2024010005,44,7,3,2,6,12,2,7,4,...,1,3,2,3,2,1,1,1.250076,2.228957,0.783507


In [None]:
# Filter df2024 to keep only the selected columns and stored in a new DataFrame 'filtered_df2024'
filtered_df2024 = df2024[[
                        'A3Ar_w', 'A8_2021', 'CENSUSDIV', 'CENSUSREG', 'B1', 'J8', 'F1', 'F2_2',
                        'A5_2015', 'J5', 'M1_1', 'M4', 'A50A', 'A50B',
                        'B14A_1', 'C5_2012', 'B4', 'B2',
                        'STATEQ', 'H1', 'M20'
                        ]]

In [20]:
filtered_df2024.head(3)

Unnamed: 0,A3Ar_w,A8_2021,CENSUSDIV,CENSUSREG,B1,J8,F1,F2_2,G38,A5_2015,...,E15_2015,B14A_1,C5_2012,B4,G20,C1_2012,B2,STATEQ,H1,M20
0,3,8,3,2,1,1.0,2,1.0,2,6,...,1.0,1.0,1.0,2.0,2,1,1,36,1,2
1,6,5,9,4,1,,3,2.0,2,5,...,,2.0,2.0,2.0,2,1,1,48,1,1
2,6,7,9,4,1,,3,1.0,2,4,...,1.0,2.0,,2.0,2,1,1,38,1,2


In [None]:
# ======= Create dictionary list to map values in each column =======

# A3Ar_w (6) Age group
age_group = {
    1: "18-24",
    2: "25-34",
    3: "35-44",
    4: "45-54",
    5: "55-64",
    6: "65+"
}

# A8_2021 (14) Household approximate annual income
annual_income = {
    1: "Less than $15,000",
    2: "At least $15,000 but less than $25,000",
    3: "At least $25,000 but less than $35,000",
    4: "At least $35,000 but less than $50,000",
    5: "At least $50,000 but less than $75,000",
    6: "At least $75,000 but less than $100,000",
    7: "At least $100,000 but less than $150,000",
    8: "At least $150,000 but less than $200,000",
    9: "At least $200,000 but less than $300,000",
    10: "$300,000 or more",
    98: "Don't know",
    99: "Prefer not to say"
}

# CENSUSDIV (3) Census Division
census_div = {
    1: "New England",
    2: "Middle Atlantic",
    3: "East North Central",
    4: "West North Central",
    5: "South Atlantic",
    6: "East South Central",
    7: "West South Central",
    8: "Mountain",
    9: "Pacific"
}

# CENSUSREG (4) Census Region
census_reg = {
    1: "Northeast",
    2: "Midwest",
    3: "South",
    4: "West"
}

# B1 (60) Do you [Does your household] have a checking account?
checking_account = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# J8 (27) Have you ever tried to figure out how much you need to save for retirement?
retirement_planning = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# F1 (88) Number of credit cards
num_credit_cards = {
    1: "1",
    2: "2 to 3",
    3: "4 to 8",
    4: "9 to 12",
    5: "13 to 20",
    6: "More than 20",
    7: "No credit cards",
    98: "Don't know",
    99: "Prefer not to say"
}

# F2_2 (98) In the past 12 months, which of the following describes your experience with credit cards? - IN SOME MONTHS, I CARRIED OVER A BALANCE AND WAS CHARGED INTEREST.
credit_card_balance_interest = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# A5_2015 (9) Highest education completed
education_2015 = {
    1: "Did not complete high school",
    2: "High school graduate - regular high school diploma",
    3: "High school graduate - GED or alternative credential",
    4: "Some college, no degree",
    5: "Associate's degree",
    6: "Bachelor's degree",
    7: "Post graduate degree",
    99: "Prefer not to say"
}

# J5 (30) Emergency or rainy day funds
emergency_funds = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# M1_1 (117) How strongly do you agree or disagree with the following statement? - I am good at dealing with day-to-day financial matters, such as checking accounts, credit and debit cards, and tracking expenses
financial_confidence = {
    1: "Strongerly disagree",
    2: "Moderately disagree",
    3: "Slightly disagree",
    4: "Neither agree nor disagree",
    5: "Slightly agree",
    6: "Moderately agree",
    7: "Strongly agree",
    98: "Don't know",
    99: "Prefer not to say"
}

# M4 (118) Self-assessed financial knowledge (1-7)
financial_knowledge = {
    1: "1", # very low
    2: "2",
    3: "3",
    4: "4",
    5: "5",
    6: "6",
    7: "7", # very high
    98: "Don't know",
    99: "Prefer not to say"
}

# A50A (5) Gender (nonbinary randomly assigned)
gender = {
    1: "Male",
    2: "Female"
}

# A50B (7) Gender/Age net (nonbinary randomly assigned)
gender_age = {
    1: "Male 18-24",
    2: "Male 25-34",
    3: "Male 35-44",
    4: "Male 45-54",
    5: "Male 55-64",
    6: "Male 65+",
    7: "Female 18-24",
    8: "Female 25-34",
    9: "Female 35-44",
    10: "Female 45-54",
    11: "Female 55-64",
    12: "Female 65+"
}

# B14A_1 (77) Not including retirement accounts, do you [does your household] have any investments in... - Stocks, bonds, mutual funds, or other securities
other_investments = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

