In [31]:
import pandas as pd
import numpy as np
import seaborn as sns

In [87]:
# ==== Columns to Keep (reference list)====
"""
filtered_columns = {
    "STATEQ": "State",
    "CENSUSDIV": "Census Division",
    "CENSUSREG": "Census Region",
    "A3": "Gender",
    "A3Ar_w": "Age Group",
    "A3B": "Gender&Age",
    "A5_2012": "Education",
    "A8": "Annual Income",
    "B1": "Checking Account",
    "B2": "Savings Account",
    "B4": "Overdraw",
    "B14": "Other Investments",
    "C5_2012": "Other Retirement Accounts",
    "F1": "Number of Credit Cards",
    "F2_2": "Credit Card Balance Interest",
    "J5": "Emergency Funds",
    "J8": "Retirement Planning",
    "H1": "Health Insurance",
    "M1_1": "Financial Confidence",
    "M4": "Financial Knowledge"
}
"""

'\nfiltered_columns = {\n    "STATEQ": "State",\n    "CENSUSDIV": "Census Division",\n    "CENSUSREG": "Census Region",\n    "A3": "Gender",\n    "A3Ar_w": "Age Group",\n    "A3B": "Gender&Age",\n    "A5_2012": "Education",\n    "A8": "Annual Income",\n    "B1": "Checking Account",\n    "B2": "Savings Account",\n    "B4": "Overdraw",\n    "B14": "Other Investments",\n    "C5_2012": "Other Retirement Accounts",\n    "F1": "Number of Credit Cards",\n    "F2_2": "Credit Card Balance Interest",\n    "J5": "Emergency Funds",\n    "J8": "Retirement Planning",\n    "H1": "Health Insurance",\n    "M1_1": "Financial Confidence",\n    "M4": "Financial Knowledge"\n}\n'

In [77]:
# ======================== Work on 2012 dataset ========================

# Set relative path to the dataset on GitHub
df2012_path = 'https://raw.githubusercontent.com/Fasty8/ba780-fall25-a08/refs/heads/cleaning/FINRA%20National%20Financial%20Capability%20Study/2012-SxS-Data-and-Data-Info/NFCS%202012%20State%20Data%20130503.csv'

# Load 2012 dataset and replace missing values with NaN
df2012 = pd.read_csv(df2012_path, na_values=["", " ", "NA", "N/A", "null", ".", "na"])

# Filter df2012 to keep only the selected columns and stored in a new DataFrame 'filtered_df2012'
filtered_df2012 = df2012[[
                        'STATEQ', 'CENSUSDIV', 'CENSUSREG', 'A3', 'A3Ar_w', 'A3B', 'A5_2012', 'A8', 'B1',
                        'B2', 'B4', 'B14', 'C5_2012', 'F1', 'F2_2', 'J5', 'J8', 'H1', 'M1_1', 'M4'
                        ]]


In [60]:
# === Pre-Cleaning Phase: Display the first few rows of the original and filtered DataFrames ===
print(df2012.head())
print(filtered_df2012.head())

       NFCSID  STATEQ  CENSUSDIV  CENSUSREG  A3  A3Ar_w  A3B  A4A_new_w  \
0  2012010001      24          4          2   2       5   11          1   
1  2012010002      10          5          3   2       5   11          1   
2  2012010003      23          3          2   2       6   12          1   
3  2012010004      14          3          2   2       6   12          1   
4  2012010005      44          7          3   2       2    8          2   

   A5_2012  A6  ...  M21_4  M22  M6  M7  M8  M9  M10    wgt_n2    wgt_d2  \
0        6   4  ...    NaN    1   1   3   4   1    2  0.363417  1.260305   
1        4   1  ...    NaN    1   1  98  98  98   98  1.173593  2.956766   
2        4   4  ...    NaN    1   1   3  98   1    2  1.577671  1.168197   
3        2   5  ...    NaN    1   2  98  98  98   98  1.577671  1.531504   
4        4   1  ...    NaN    1   1   2  98   1    1  2.167569  2.524668   

     wgt_s3  
0  0.631540  
1  1.005697  
2  1.025873  
3  1.179334  
4  0.926108  

[5 rows

In [78]:
# ======================== Create dictionary list to map values in each column =========================

# STATEQ (2) State
state = {
    1: "Alabama", 2: "Alaska", 3: "Arizona", 4: "Arkansas", 5: "California",
    6: "Colorado", 7: "Connecticut", 8: "Delaware", 9: "District of Columbia",
    10: "Florida", 11: "Georgia", 12: "Hawaii", 13: "Idaho", 14: "Illinois",
    15: "Indiana", 16: "Iowa", 17: "Kansas", 18: "Kentucky", 19: "Louisiana",
    20: "Maine", 21: "Maryland", 22: "Massachusetts", 23: "Michigan",
    24: "Minnesota", 25: "Mississippi", 26: "Missouri", 27: "Montana",
    28: "Nebraska", 29: "Nevada", 30: "New Hampshire", 31: "New Jersey",
    32: "New Mexico", 33: "New York", 34: "North Carolina", 35: "North Dakota",
    36: "Ohio", 37: "Oklahoma", 38: "Oregon", 39: "Pennsylvania", 40: "Rhode Island",
    41: "South Carolina", 42: "South Dakota", 43: "Tennessee", 44: "Texas",
    45: "Utah", 46: "Vermont", 47: "Virginia", 48: "Washington", 49: "West Virginia",
    50: "Wisconsin", 51: "Wyoming"
}

# CENSUSDIV (3) Census Division
census_div = {
    1: "New England",
    2: "Middle Atlantic",
    3: "East North Central",
    4: "West North Central",
    5: "South Atlantic",
    6: "East South Central",
    7: "West South Central",
    8: "Mountain",
    9: "Pacific"
}

# CENSUSREG (4) Census Region
census_reg = {
    1: "Northeast",
    2: "Midwest",
    3: "South",
    4: "West"
}

# A3 (5) Gender (nonbinary randomly assigned)
gender = {
    1: "Male",
    2: "Female"
}

# A3Ar_w (6) Age group
age_group = {
    1: "18-24",
    2: "25-34",
    3: "35-44",
    4: "45-54",
    5: "55-64",
    6: "65+"
}

# A3B (7) Gender/Age net (nonbinary randomly assigned)
gender_age = {
    1: "Male 18-24",
    2: "Male 25-34",
    3: "Male 35-44",
    4: "Male 45-54",
    5: "Male 55-64",
    6: "Male 65+",
    7: "Female 18-24",
    8: "Female 25-34",
    9: "Female 35-44",
    10: "Female 45-54",
    11: "Female 55-64",
    12: "Female 65+"
}

# A5_2012 (9) Highest education completed
education = {
    1: "Did not complete high school",
    2: "High school graduate - regular high school diploma",
    3: "High school graduate - GED or alternative credential",
    4: "Some college, no degree",
    5: "Associate's degree",
    6: "Bachelor's degree",
    7: "Post graduate degree",
    99: "Prefer not to say"
}

# A8 (14) Household approximate annual income
annual_income = {
    1: "Less than $15,000",
    2: "At least $15,000 but less than $25,000",
    3: "At least $25,000 but less than $35,000",
    4: "At least $35,000 but less than $50,000",
    5: "At least $50,000 but less than $75,000",
    6: "At least $75,000 but less than $100,000",
    7: "At least $100,000 but less than $150,000",
    8: "At least $150,000 but less than $200,000",
    9: "At least $200,000 but less than $300,000",
    10: "$300,000 or more",
    98: "Don't know",
    99: "Prefer not to say"
}

# B1 (60) Do you [Does your household] have a checking account?
checking_account = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# B2 (61) Do you [Does your household] have a savings account, money market account, or CDs?
savings_account = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# B4 (62) Do you [or your spouse/partner] overdraw your checking account occasionally?
overdraw = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# B14 (77) Not including retirement accounts, do you [does your household] have any investments in... - Stocks, bonds, mutual funds, or other securities
other_investments = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# C5_2012 (74) Do you [or your spouse/partner] regularly contribute to a retirement account like a [Thrift Savings Plan (TSP),] 401(k) or IRA?
other_retirement_accounts = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# F1 (88) Number of credit cards
num_credit_cards = {
    1: "1",
    2: "2 to 3",
    3: "4 to 8",
    4: "9 to 12",
    5: "13 to 20",
    6: "More than 20",
    7: "No credit cards",
    98: "Don't know",
    99: "Prefer not to say"
}

# F2_2 (98) In the past 12 months, which of the following describes your experience with credit cards? - IN SOME MONTHS, I CARRIED OVER A BALANCE AND WAS CHARGED INTEREST.
credit_card_balance_interest = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# J5 (30) Emergency or rainy day funds
emergency_funds = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# J8 (27) Have you ever tried to figure out how much you need to save for retirement?
retirement_planning = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# H1 (104) Are you covered by health insurance?
health_insurance = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# M1_1 (117) How strongly do you agree or disagree with the following statement? - I am good at dealing with day-to-day financial matters, such as checking accounts, credit and debit cards, and tracking expenses
financial_confidence = {
    1: "Strongerly disagree",
    2: "Moderately disagree",
    3: "Slightly disagree",
    4: "Neither agree nor disagree",
    5: "Slightly agree",
    6: "Moderately agree",
    7: "Strongly agree",
    98: "Don't know",
    99: "Prefer not to say"
}

# M4 (118) Self-assessed financial knowledge (1-7)
financial_knowledge = {
    1: "1", # very low
    2: "2",
    3: "3",
    4: "4",
    5: "5",
    6: "6",
    7: "7", # very high
    98: "Don't know",
    99: "Prefer not to say"
}


In [88]:
# ======== RENAME AND REPLACE ========

# ============= Re-name columns to more readable names =============
column_names_mapping = {
    "STATEQ": "State",
    "CENSUSDIV": "Census Division",
    "CENSUSREG": "Census Region",
    "A3": "Gender",
    "A3Ar_w": "Age Group",
    "A3B": "Gender&Age",
    "A5_2012": "Education",
    "A8": "Annual Income",
    "B1": "Checking Account",
    "B2": "Savings Account",
    "B4": "Overdraw",
    "B14": "Other Investments",
    "C5_2012": "Other Retirement Accounts",
    "F1": "Number of Credit Cards",
    "F2_2": "Credit Card Balance Interest",
    "J5": "Emergency Funds",
    "J8": "Retirement Planning",
    "H1": "Health Insurance",
    "M1_1": "Financial Confidence",
    "M4": "Financial Knowledge"
}



# ============= Match column names to coded values dicts =============
column_values_mapping = {
    "STATEQ": state,
    "CENSUSDIV": census_div,
    "CENSUSREG": census_reg,
    "A3": gender,
    "A3Ar_w": age_group,
    "A3B": gender_age,
    "A5_2012": education,
    "A8": annual_income,
    "B1": checking_account,
    "B2": savings_account,
    "B4": overdraw,
    "B14": other_investments,
    "C5_2012": other_retirement_accounts,
    "J5": emergency_funds,
    "J8": retirement_planning,
    "F1": num_credit_cards,
    "F2_2": credit_card_balance_interest,
    "H1": health_insurance,
    "M1_1": financial_confidence,
    "M4": financial_knowledge
}


# ========= Replace values and Rename columns. Saved to a new df 'cleaned_df2012' ===========
cleaned_df2012 = filtered_df2012.replace(column_values_mapping).rename(columns=column_names_mapping)

In [68]:
cleaned_df2012.head()

Unnamed: 0,State,Census Division,Census Region,Gender,Age Group,Gender&Age,Education,Annual Income,Checking Account,Savings Account,Overdraw,Other Investments,Other Retirement Accounts,Credit Card,Credit Card Balance Interest,Emergency Funds,Retirement Planning,Health Insurance,Financial Confidence,Financial Knowledge
0,Minnesota,West North Central,Midwest,Female,55-64,Female 55-64,Bachelor's degree,"At least $50,000 but less than $75,000",Yes,Yes,No,Yes,Yes,2 to 3,No,Don't know,Don't know,Yes,Strongly agree,6
1,Florida,South Atlantic,South,Female,55-64,Female 55-64,"Some college, no degree","At least $15,000 but less than $25,000",Yes,Yes,No,No,,No credit cards,,No,,No,Moderately agree,5
2,Michigan,East North Central,Midwest,Female,65+,Female 65+,"Some college, no degree","At least $25,000 but less than $35,000",Yes,Yes,No,No,No,9 to 12,No,Yes,,Yes,Strongly agree,5
3,Illinois,East North Central,Midwest,Female,65+,Female 65+,High school graduate - regular high school dip...,"At least $15,000 but less than $25,000",Yes,Yes,No,Yes,No,4 to 8,Yes,No,,Yes,Neither agree nor disagree,Don't know
4,Texas,West South Central,South,Female,25-34,Female 25-34,"Some college, no degree","At least $35,000 but less than $50,000",Yes,Yes,No,No,,No credit cards,,Yes,No,No,Neither agree nor disagree,6


In [90]:
# Total missing values in each column
cleaned_df2012.isna().sum().sort_values(ascending=False)

cleaned_df2012.to_csv("Cleaned 2012.csv", index=False)

In [91]:
# ==== Columns to Keep (reference list)====
"""
filtered_columns = {
    "STATEQ": "State",
    "CENSUSDIV": "Census Division",
    "CENSUSREG": "Census Region",
    "A3": "Gender",
    "A3Ar_w": "Age Group",
    "A3B": "Gender&Age",
    "A5": "Education",
    "A8": "Annual Income",
    "B1": "Checking Account",
    "B2": "Savings Account",
    "B4": "Overdraw",
    "B14": "Other Investments",
    "C5": "Other Retirement Accounts",
    "J5": "Emergency Funds",
    "J8": "Retirement Planning",
    "F1": "Number of Credit Cards",
    "F2_2": "Credit Card Balance Interest",
    "H1": "Health Insurance",
    "M1_1": "Financial Confidence",
    "M4": "Financial Knowledge"
}
"""

'\nfiltered_columns = {\n    "STATEQ": "State",\n    "CENSUSDIV": "Census Division",\n    "CENSUSREG": "Census Region",\n    "A3": "Gender",\n    "A3Ar_w": "Age Group",\n    "A3B": "Gender&Age",\n    "A5": "Education",\n    "A8": "Annual Income",\n    "B1": "Checking Account",\n    "B2": "Savings Account",\n    "B4": "Overdraw",\n    "B14": "Other Investments",\n    "C5": "Other Retirement Accounts",\n    "J5": "Emergency Funds",\n    "J8": "Retirement Planning",\n    "F1": "Number of Credit Cards",\n    "F2_2": "Credit Card Balance Interest",\n    "H1": "Health Insurance",\n    "M1_1": "Financial Confidence",\n    "M4": "Financial Knowledge"\n}\n'

In [92]:
# ======================== Work on 2009 dataset ========================

# Set relative path to the dataset on GitHub
df2009_path = 'https://raw.githubusercontent.com/Fasty8/ba780-fall25-a08/refs/heads/main/FINRA%20National%20Financial%20Capability%20Study/2009-SxS-v2/NFCS%202009%20State%20Data%20220712%20v2.csv'

# Load 2009 dataset and replace missing values with NaN
df2009 = pd.read_csv(df2009_path, na_values=["", " ", "NA", "N/A", "null", ".", "na"])

# Filter df2012 to keep only the selected columns and stored in a new DataFrame 'filtered_df2012'
filtered_df2009 = df2009[[
                        'STATEQ', 'CENSUSDIV', 'CENSUSREG', 'A3', 'A3Ar_w', 'A3B', 'A5', 'A8', 'B1',
                        'B2', 'B4', 'B14', 'C5', 'F1', 'F2_2', 'J5', 'J8', 'H1', 'M1_1', 'M4'
                        ]]


In [71]:
# === Pre-Cleaning Phase: Display the first few rows of the original and filtered DataFrames ===
print(df2009.head())
print(filtered_df2009.head())

       NFCSID  STATEQ  CENSUSDIV  CENSUSREG  A3  A3Ar_w  A3B  A4A_new_w  A5  \
0  2009010001      43          6          3   2       2    8          2   5   
1  2009010002      44          7          3   1       5    5          1   3   
2  2009010003      44          7          3   1       5    5          1   3   
3  2009010004      33          2          1   2       3    9          2   3   
4  2009010005       5          9          4   1       6    6          1   3   

   A6  ...  M1_3  M4  M6  M7  M8  M9  M10    wgt_n2    wgt_d2    wgt_s3  
0   1  ...     6   5   1   3   3   1   98  0.207777  0.393232  0.351114  
1   4  ...     7   6   1   3   2   1    2  1.593761  3.634348  1.282727  
2   1  ...     5   5   1   3   2   1    2  1.593761  3.634348  1.282727  
3   2  ...     2   6   1  98  98   1    2  2.547156  1.891680  1.366110  
4   4  ...     4   4   1   3   1   1    2  2.121042  4.461678  1.384670  

[5 rows x 128 columns]
   STATEQ  CENSUSDIV  CENSUSREG  A3  A3Ar_w  A3B  A5  A8 

In [93]:
# ======================== Create dictionary list to map values in each column =========================

# STATEQ (2) State
state = {
    1: "Alabama", 2: "Alaska", 3: "Arizona", 4: "Arkansas", 5: "California",
    6: "Colorado", 7: "Connecticut", 8: "Delaware", 9: "District of Columbia",
    10: "Florida", 11: "Georgia", 12: "Hawaii", 13: "Idaho", 14: "Illinois",
    15: "Indiana", 16: "Iowa", 17: "Kansas", 18: "Kentucky", 19: "Louisiana",
    20: "Maine", 21: "Maryland", 22: "Massachusetts", 23: "Michigan",
    24: "Minnesota", 25: "Mississippi", 26: "Missouri", 27: "Montana",
    28: "Nebraska", 29: "Nevada", 30: "New Hampshire", 31: "New Jersey",
    32: "New Mexico", 33: "New York", 34: "North Carolina", 35: "North Dakota",
    36: "Ohio", 37: "Oklahoma", 38: "Oregon", 39: "Pennsylvania", 40: "Rhode Island",
    41: "South Carolina", 42: "South Dakota", 43: "Tennessee", 44: "Texas",
    45: "Utah", 46: "Vermont", 47: "Virginia", 48: "Washington", 49: "West Virginia",
    50: "Wisconsin", 51: "Wyoming"
}

# CENSUSDIV (3) Census Division
census_div = {
    1: "New England",
    2: "Middle Atlantic",
    3: "East North Central",
    4: "West North Central",
    5: "South Atlantic",
    6: "East South Central",
    7: "West South Central",
    8: "Mountain",
    9: "Pacific"
}

# CENSUSREG (4) Census Region
census_reg = {
    1: "Northeast",
    2: "Midwest",
    3: "South",
    4: "West"
}

# A3 (5) Gender (nonbinary randomly assigned)
gender = {
    1: "Male",
    2: "Female"
}

# A3Ar_w (6) Age group
age_group = {
    1: "18-24",
    2: "25-34",
    3: "35-44",
    4: "45-54",
    5: "55-64",
    6: "65+"
}

# A3B (7) Gender/Age net (nonbinary randomly assigned)
gender_age = {
    1: "Male 18-24",
    2: "Male 25-34",
    3: "Male 35-44",
    4: "Male 45-54",
    5: "Male 55-64",
    6: "Male 65+",
    7: "Female 18-24",
    8: "Female 25-34",
    9: "Female 35-44",
    10: "Female 45-54",
    11: "Female 55-64",
    12: "Female 65+"
}

# A5 (9) Highest education completed
education = {
    1: "Did not complete high school",
    2: "High school graduate - regular high school diploma",
    3: "High school graduate - GED or alternative credential",
    4: "Some college, no degree",
    5: "Associate's degree",
    6: "Bachelor's degree",
    7: "Post graduate degree",
    99: "Prefer not to say"
}

# A8 (14) Household approximate annual income
annual_income = {
    1: "Less than $15,000",
    2: "At least $15,000 but less than $25,000",
    3: "At least $25,000 but less than $35,000",
    4: "At least $35,000 but less than $50,000",
    5: "At least $50,000 but less than $75,000",
    6: "At least $75,000 but less than $100,000",
    7: "At least $100,000 but less than $150,000",
    8: "At least $150,000 but less than $200,000",
    9: "At least $200,000 but less than $300,000",
    10: "$300,000 or more",
    98: "Don't know",
    99: "Prefer not to say"
}

# B1 (60) Do you [Does your household] have a checking account?
checking_account = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# B2 (61) Do you [Does your household] have a savings account, money market account, or CDs?
savings_account = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# B4 (62) Do you [or your spouse/partner] overdraw your checking account occasionally?
overdraw = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# B14 (77) Not including retirement accounts, do you [does your household] have any investments in... - Stocks, bonds, mutual funds, or other securities
other_investments = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# C5 (74) Do you [or your spouse/partner] regularly contribute to a retirement account like a [Thrift Savings Plan (TSP),] 401(k) or IRA?
other_retirement_accounts = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# F1 (88) Number of credit cards
num_credit_cards = {
    1: "1",
    2: "2 to 3",
    3: "4 to 8",
    4: "9 to 12",
    5: "13 to 20",
    6: "More than 20",
    7: "No credit cards",
    98: "Don't know",
    99: "Prefer not to say"
}

# F2_2 (98) In the past 12 months, which of the following describes your experience with credit cards? - IN SOME MONTHS, I CARRIED OVER A BALANCE AND WAS CHARGED INTEREST.
credit_card_balance_interest = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# J5 (30) Emergency or rainy day funds
emergency_funds = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# J8 (27) Have you ever tried to figure out how much you need to save for retirement?
retirement_planning = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# H1 (104) Are you covered by health insurance?
health_insurance = {
    1: "Yes",
    2: "No",
    98: "Don't know",
    99: "Prefer not to say"
}

# M1_1 (117) How strongly do you agree or disagree with the following statement? - I am good at dealing with day-to-day financial matters, such as checking accounts, credit and debit cards, and tracking expenses
financial_confidence = {
    1: "Strongerly disagree",
    2: "Moderately disagree",
    3: "Slightly disagree",
    4: "Neither agree nor disagree",
    5: "Slightly agree",
    6: "Moderately agree",
    7: "Strongly agree",
    98: "Don't know",
    99: "Prefer not to say"
}

# M4 (118) Self-assessed financial knowledge (1-7)
financial_knowledge = {
    1: "1", # very low
    2: "2",
    3: "3",
    4: "4",
    5: "5",
    6: "6",
    7: "7", # very high
    98: "Don't know",
    99: "Prefer not to say"
}


In [99]:
# ======== RENAME AND REPLACE ========

# ============= Re-name columns to more readable names =============
column_names_mapping = {
    "STATEQ": "State",
    "CENSUSDIV": "Census Division",
    "CENSUSREG": "Census Region",
    "A3": "Gender",
    "A3Ar_w": "Age Group",
    "A3B": "Gender&Age",
    "A5": "Education",
    "A8": "Annual Income",
    "B1": "Checking Account",
    "B2": "Savings Account",
    "B4": "Overdraw",
    "B14": "Other Investments",
    "C5": "Other Retirement Accounts",
    "F1": " Number of Credit Cards",
    "F2_2": "Credit Card Balance Interest",
    "J5": "Emergency Funds",
    "J8": "Retirement Planning",
    "H1": "Health Insurance",
    "M1_1": "Financial Confidence",
    "M4": "Financial Knowledge"
}



# ============= Match column names to coded values dicts =============
column_values_mapping = {
    "STATEQ": state,
    "CENSUSDIV": census_div,
    "CENSUSREG": census_reg,
    "A3": gender,
    "A3Ar_w": age_group,
    "A3B": gender_age,
    "A5": education,
    "A8": annual_income,
    "B1": checking_account,
    "B2": savings_account,
    "B4": overdraw,
    "B14": other_investments,
    "C5": other_retirement_accounts,
    "F1": num_credit_cards,
    "F2_2": credit_card_balance_interest,
    "J5": emergency_funds,
    "J8": retirement_planning,
    "H1": health_insurance,
    "M1_1": financial_confidence,
    "M4": financial_knowledge
}


# ========= Replace values and Rename columns. Saved to a new df 'cleaned_df2009' ===========
cleaned_df2009 = filtered_df2009.replace(column_values_mapping).rename(columns=column_names_mapping)

In [98]:
cleaned_df2009.head()

Unnamed: 0,State,Census Division,Census Region,Gender,Age Group,Gender&Age,Education,Annual Income,Checking Account,Savings Account,Overdraw,Other Investments,Other Retirement Accounts,Number of Credit Cards,Credit Card Balance Interest,Emergency Funds,Retirement Planning,Health Insurance,Financial Confidence,Financial Knowledge
0,Tennessee,East South Central,South,Female,25-34,Female 25-34,5,"At least $100,000 but less than $150,000",Yes,Yes,No,Yes,,1,Yes,No,No,Yes,Strongly agree,5
1,Texas,West South Central,South,Male,55-64,Male 55-64,3,"At least $25,000 but less than $35,000",Yes,Yes,No,Yes,,4 to 8,Yes,No,,No,Moderately agree,6
2,Texas,West South Central,South,Male,55-64,Male 55-64,3,"At least $50,000 but less than $75,000",Yes,Yes,No,Yes,1.0,2 to 3,Yes,No,Yes,Yes,Moderately agree,5
3,New York,Middle Atlantic,Northeast,Female,35-44,Female 35-44,3,"At least $35,000 but less than $50,000",No,Yes,,No,,No credit cards,,No,No,Yes,Moderately disagree,6
4,California,Pacific,West,Male,65+,Male 65+,3,"At least $25,000 but less than $35,000",Yes,Yes,Yes,No,,No credit cards,,No,No,Yes,Moderately agree,4


In [101]:
# Total missing values in each column
cleaned_df2009.isna().sum().sort_values(ascending=False)

cleaned_df2009.to_csv("Cleaned 2009.csv", index=False)