# Initial Setup

In [None]:
""" 
Team 5 Analytics Challenge Group Assignment: Data Wrangling 

Authors: Andrew Bakker, Hutson Collins, Durrelle Maynard, Dario Santiago Lopez, and Chris Sawyer 
Collaborators: ChatGPT (OpenAI), which assisted in iterative troubleshooting, design 
decisions, and refining data-cleaning logic.

Beginning of Assignment: 
The first thing we want to do is import the libraries necessary for the given task 
In this case, we import pandas since we will be analyzing and creating dataframes, and any additional libraries 
that immediately comes to mind 
"""
# --- Step 1: Import libraries ---

import os 
from pathlib import Path
import pandas as pd 

In [None]:
""" 
Since this is a team assignment, we will be implementing a way for every teammate to access the CSV files without 
directly asking for the file path where the CSV files are located. Instead, if they are already in the same directory, 
they can just start running the code 
"""

# --- Step 2: Open/Analyze the CSV files & Create DF's ---
# Get the directory where the current notebook is located
notebook_dir = Path().resolve()

# Point to CSV in the same directory
ar_csv = notebook_dir / "application-records.csv" # "ar" is application-records.csv abbreviated, so we don't have to type out a long variable 
pr_csv = notebook_dir / "personal-records.csv"    # "pr" is personal-records.csv abbreviated for the same reason above 

# Create the DataFrame  
ar_df = pd.read_csv(ar_csv)
pr_df = pd.read_csv(pr_csv)

# Take a quick peek of the sizes/shapes 
print("Shape Preview")
print("----------------------")
print("application-records.csv size: ", ar_df.shape)
print("personal-records.csv size: ", pr_df.shape)
print("----------------------\n")

# Now take a look at the actual dataframes (do them in two different cell blocks)
print("Now Previewing DataFrames")
print("application-records.csv")
ar_df


In [None]:
# Take a peek at the other DataFrame
print("personal-records.csv")
pr_df

In [None]:
# Check to see if they have any columns in common 
print("Column Names")
print("application-records.csv columns:")
print(list(ar_df.columns))
print("\npersonal-records.csv columns:")
print(list(pr_df.columns))

In [None]:
"""
So, the only columns the two datasets share is record_id. However, personal-records.csv has significantly more rows than 
application-records.csv. So, my (Dario's) thought process is to see if they have any duplicates in either of the datasets 
since I would consider cleaning pr_df (since it has 6 rows) and then consider merging them into one dataset. 
""" 
# Check for duplicates in both 'record_id' columns in each of the datasets 
print("AR duplicate IDs:", ar_df['record_id'].duplicated().sum())
print("PR duplicate IDs:", pr_df['record_id'].duplicated().sum())

# Also check for shared, extra, and total length of the column 
ar_ids = set(ar_df['record_id'])
pr_ids = set(pr_df['record_id'])

print("\nAR IDs:", len(ar_ids))
print("PR IDs:", len(pr_ids))
print("Shared IDs:", len(ar_ids & pr_ids))
print("Extra in PR:", len(pr_ids - ar_ids))
print("Extra in AR:", len(ar_ids - pr_ids))

In [None]:
"""Analyze the duplicates to help with how we will move forward. Specifically, look at the values in the duplicates""" 
# Look at a few duplicate examples in AR
ar_dupes = ar_df[ar_df['record_id'].duplicated(keep=False)]
print("AR duplicate sample:")
display(ar_dupes.sort_values('record_id').head(10))

# Look at a few duplicate examples in PR
pr_dupes = pr_df[pr_df['record_id'].duplicated(keep=False)]
print("PR duplicate sample:")
display(pr_dupes.sort_values('record_id').head(10))


In [None]:
"""
So, based on the results above we can see that the 'duplicates' are not actually duplicates, but rather just NaNs. 
So, I (Dario) think it would be best to go on and drop them since they are missing their unique identifier. 
From there, down the road, the additional 'record_id' values that aren't found in application-records.csv 
will be dropped during an inner join since they are unmatched. I think this would be the best first step in 
the data wrangling/cleaning process since we would just be removing a bunch of noisy rows. 
"""
# --- Step 3: Remove rows with missing record_id ---

# Create variables to help show the shape of both DFs (before dropping duplicates) 
before_ar = ar_df.shape[0]
before_pr = pr_df.shape[0]

# Drop duplicates
ar_df = ar_df.dropna(subset=['record_id']).copy()
pr_df = pr_df.dropna(subset=['record_id']).copy()

print(f"AR rows removed: {before_ar - ar_df.shape[0]}")
print(f"PR rows removed: {before_pr - pr_df.shape[0]}")

# Confirm that there are no more duplicates 
print("AR duplicate IDs after drop:", ar_df['record_id'].duplicated().sum())
print("PR duplicate IDs after drop:", pr_df['record_id'].duplicated().sum())

In [None]:
# Take a quick peek at both df shapes again and ensure we actually removed the duplicates
print("AR Shape: ", ar_df.shape)
print("PR Shape: ", pr_df.shape)
print("\nUnique AR ID's:", ar_df['record_id'].nunique())
print("Unique PR ID's:", pr_df['record_id'].nunique())

In [None]:
#Inner join of both dataframes
df_records = pd.merge(ar_df, pr_df, on='record_id',how = 'inner')
df_records.to_csv("df_records.csv", index=False)

In [None]:
#import Ebay csv file
df_records = pd.read_csv("df_records.csv")
#standardize column names
df_records.columns = df_records.columns.str.strip().str.lower()

In [None]:
df_records.head(20)
df_records.dtypes

In [None]:
#view all initial columns
for col in df_records.columns:
    print(col)

In [None]:
# check for missingness in data (as percentages)
missing_percentages = (df_records.isnull().sum() / len(df_records)) * 100

# filter only columns with more than 1% (adjust as needed)
missing_percentages = missing_percentages[missing_percentages > 1]

# sort in descending order
missing_percentages = missing_percentages.sort_values(ascending=False)

print(missing_percentages)

# Clean Income Records

In [None]:
#Initial View of the Income Tab
income = df_records['income'].tolist()
income



In [None]:
#For standardization of the income column, we are limitiing all values to 2 decimal points. 
#Apply change to df_records
df_records['income'] = df_records['income'].round(2)
income = df_records['income'].tolist()
income

In [None]:
# Count how many instances of each unique value
income_counts = df_records['income'].value_counts()

print(income_counts)

In [None]:
#Now we are checking empty values inn our income column
income_nan_count = df_records['income'].isnull().sum()
print(income_nan_count)

In [None]:
import matplotlib.pyplot as plt

# --- Distribution of income (quantile variable) ---
plt.figure(figsize=(6,4))
df_records['income'].hist(bins=50, edgecolor='black')
plt.title("Distribution of income (quantiles)")
plt.xlabel("income (0–1 scale)")
plt.ylabel("frequency")
plt.show()

# --- Distribution of salary (raw dollars) ---
plt.figure(figsize=(6,4))
df_records['salary'].hist(bins=50, edgecolor='black')
plt.title("Distribution of salary (USD)")
plt.xlabel("salary")
plt.ylabel("frequency")
plt.show()

# --- Relationship between income (quantile) and salary ---
plt.figure(figsize=(6,4))
plt.scatter(df_records['income'], df_records['salary'], alpha=0.2, s=10)
plt.title("income (quantile) vs salary (USD)")
plt.xlabel("income (0–1)")
plt.ylabel("salary (USD)")
plt.show()

In [None]:
income_nan = df_records[df_records['income'].isnull()]
income_nan.sample(10)

# Clean name_email_similarity column

In [None]:
#Initial View of the Column
name_email_similarity = df_records['name_email_similarity'].tolist()
name_email_similarity

In [None]:
#For standardization of the name_email_similarity column, we are limitiing all values to 2 decimal points. 
#Apply change to df_records
df_records['name_email_similarity'] = df_records['name_email_similarity'].round(2)
name_email_similarity = df_records['name_email_similarity'].unique()
name_email_similarity

# Clean Salary Column

In [None]:
#Initial View of the Column
salary = df_records['salary'].tolist()
salary

In [None]:
# The intial view of the salary column does not show any discrepancies
# For standardization of the salary column, we are ensuring there are no values that have cents included in the submission
#Apply change to df_records
df_records['salary'] = df_records['salary'].round(0)

# Clean prev_address_months_count Column

In [None]:
#Initial View of the Column
prev_address_months_count = df_records['prev_address_months_count'].tolist()
prev_address_months_count

In [None]:
#Check Missingness in the table
count_neg_one_missing = (df_records['prev_address_months_count'] == -1).sum()
missing_percentages = (count_neg_one_missing / len(df_records)) * 100
print(f"Number of -1 values:", count_neg_one_missing)
print(f"Percentage of -1 values: {missing_percentages:.2f}%")
# check for missingness in data (as percentages)


The column has about 71% of values missing; however, this is an optional tab, filled out by the applicants, as indicated by Martin. We will keep this column in. It probably is not a strong indicator on its own, but it may point to other red flags in the dataset.

# Clean current_address_months_count column

In [None]:
#Initial View of the Column
current_address_months_count = df_records['current_address_months_count'].unique()
current_address_months_count

In [None]:
#Check Missingness in the table
count_neg_one_missing = (df_records['current_address_months_count'] == -1).sum()
missing_percentages = (count_neg_one_missing / len(df_records)) * 100
print(f"Number of -1 values:", count_neg_one_missing)
print(f"Percentage of -1 values: {missing_percentages:.2f}%")
# check for missingness in data (as percentages)

There are not many missing values in this values. We can possibly impute these and create flag variables for those. 

# Clean customer_age column

In [None]:
#Initial View of the Column
customer_age = df_records['customer_age'].unique()
customer_age

In [None]:
#Check Missingness in the table
count_missing = (df_records['customer_age'].isnull()).sum()
missing_percentages = (count_neg_one_missing / len(df_records)) * 100
print(f"Number of missing values:", count_neg_one_missing)
print(f"Percentage of missing values: {missing_percentages:.2f}%")
# check for missingness in data (as percentages)

There is nothing missing in this column. There are also no values that do not represent the age bins.

# Clean date_of_birth column

In [134]:
birth_values = df_records[['date_of_birth']]

# Save to CSV
birth_values.to_csv("birth_values.csv", index=False)

In [136]:
import pandas as pd
import numpy as np
import re

def standardize_date_column(df, col, output_fmt="%d-%m-%Y", dayfirst_preferred=True,
                            two_digit_year_pivot=2025, keep_raw=True, report=True):
    """
    Clean & standardize a messy date column into a single string format (default dd-mm-yyyy).

    Handles:
      - Mixed delimiters: 18-12-2003, 18/09/94
      - Month names: 21-Jun-76, 5 Dec 72
      - U.S. style leftovers: 1/26/87
      - Excel serials: 45123
      - Compact 8-digit numbers: 20240531 or 31052024
      - Two-digit years with pivot correction (e.g., 87 -> 1987 if parsed as 2087)

    Args:
      df: DataFrame
      col: column name containing messy dates (string-like)
      output_fmt: final strftime format
      dayfirst_preferred: first try day-first parsing (common outside U.S.)
      two_digit_year_pivot: years > pivot after parsing (from 2-digit inputs) will be shifted -100 years
      keep_raw: if True, adds `{col}_raw` with original text
      report: if True, prints a quick parsing summary

    Returns:
      Series of strings in output_fmt with NaN for unparsed.
    """
    s_raw = df[col].astype(str).str.strip()
    s_raw = s_raw.replace({"": np.nan, "nan": np.nan, "None": np.nan})

    if keep_raw:
        raw_col = f"{col}_raw"
        if raw_col not in df.columns:
            df[raw_col] = s_raw

    # Working datetime series
    out = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

    def try_parse(mask, parser, **kwargs):
        nonlocal out
        idx = out[mask].index
        if len(idx) == 0:
            return
        parsed = parser(s_raw.loc[idx], errors="coerce", **kwargs)
        out.loc[idx] = out.loc[idx].fillna(parsed)

    # PASS 1: broad parse with preferred day-first
    try_parse(out.isna(), pd.to_datetime, dayfirst=dayfirst_preferred)

    # PASS 2: named-month formats (explicit)
    named_formats = ["%d-%b-%y", "%d-%b-%Y", "%d %b %y", "%d %b %Y",
                     "%d-%B-%y", "%d-%B-%Y", "%d %B %y", "%d %B %Y"]
    for fmt in named_formats:
        mask = out.isna()
        try_parse(mask, lambda x, **k: pd.to_datetime(x, format=fmt, **k))

    # PASS 3: explicit D/M/Y (two- and four-digit years)
    dmy_formats = ["%d/%m/%y", "%d/%m/%Y", "%d-%m-%y", "%d-%m-%Y"]
    for fmt in dmy_formats:
        mask = out.isna()
        try_parse(mask, lambda x, **k: pd.to_datetime(x, format=fmt, **k))

    # PASS 4: explicit M/D/Y for leftovers (U.S.-style)
    mdy_formats = ["%m/%d/%y", "%m/%d/%Y", "%m-%d-%y", "%m-%d-%Y"]
    for fmt in mdy_formats:
        mask = out.isna()
        try_parse(mask, lambda x, **k: pd.to_datetime(x, format=fmt, **k))

    # PASS 5: compact 8-digit numbers (YYYYMMDD then DDMMYYYY)
    mask = out.isna()
    compact = s_raw.str.fullmatch(r"\d{8}", na=False) & mask
    if compact.any():
        # Try YYYYMMDD
        idx = compact[compact].index
        parsed = pd.to_datetime(s_raw.loc[idx], format="%Y%m%d", errors="coerce")
        # Fill successes
        out.loc[idx] = out.loc[idx].fillna(parsed)
        # For any still NaT, try DDMMYYYY
        still = out.loc[idx].isna()
        if still.any():
            idx2 = out.loc[idx][still].index
            parsed2 = pd.to_datetime(s_raw.loc[idx2], format="%d%m%Y", errors="coerce")
            out.loc[idx2] = out.loc[idx2].fillna(parsed2)

    # PASS 6: Excel serials (simple guard: 4–6 digits; adjust if needed)
    mask = out.isna()
    serial_like = s_raw.str.fullmatch(r"\d{4,6}", na=False) & mask
    if serial_like.any():
        idx = serial_like[serial_like].index
        serial_vals = pd.to_numeric(s_raw.loc[idx], errors="coerce")
        parsed = pd.to_datetime(serial_vals, unit="D", origin="1899-12-30", errors="coerce")
        out.loc[idx] = out.loc[idx].fillna(parsed)

    # --- Two-digit year pivot correction ---
    # Detect rows whose raw value had a 2-digit year
    two_digit_pattern = re.compile(r"(^|[^0-9])\d{1,2}[-/\s](?:[A-Za-z]{3,}|[0-9]{1,2})[-/\s]\d{2}($|[^0-9])")
    has_two_digit_year = s_raw.fillna("").apply(lambda x: bool(two_digit_pattern.search(x)))
    # If parsed and year > pivot, subtract 100 years (only for two-digit-year inputs)
    mask = out.notna() & has_two_digit_year & (out.dt.year > two_digit_year_pivot)
    if mask.any():
        out.loc[mask] = out.loc[mask] - pd.offsets.DateOffset(years=100)

    # Final string formatting
    result = out.dt.strftime(output_fmt)

    # Optional report
    if report:
        total = len(df)
        parsed = out.notna().sum()
        print(f"Parsed: {parsed}/{total} ({parsed/total*100:.2f}%)")
        if parsed < total:
            unparsed = df.loc[out.isna(), col].head(10).tolist()
            print("Sample unparsed values (first 10):", unparsed)

    return result

# ----------------------
# Usage
# ----------------------
df_records['date_of_birth'] = standardize_date_column(df_records, 'date_of_birth', output_fmt='%d-%m-%Y')


Parsed: 823551/823551 (100.00%)


In [138]:
#Initial View of the Column
birth_values = df_records['date_of_birth'].tolist()
birth_values

['18-12-2003',
 '28-05-1993',
 '21-06-1976',
 '18-09-1994',
 '05-12-1981',
 '05-02-1982',
 '12-08-1988',
 '21-09-1995',
 '05-12-1972',
 '15-08-1984',
 '15-03-2004',
 '26-01-1987',
 '24-06-1993',
 '30-04-1982',
 '18-03-1997',
 '27-11-1990',
 '24-09-2002',
 '16-11-2002',
 '05-06-2000',
 '15-12-1990',
 '04-05-1968',
 '13-05-1984',
 '30-04-1986',
 '25-12-1977',
 '21-08-1981',
 '24-01-1996',
 '12-02-1979',
 '24-04-1987',
 '17-09-1978',
 '10-02-1982',
 '09-06-1980',
 '13-01-2001',
 '09-02-2000',
 '27-07-1998',
 '17-02-2000',
 '03-09-1988',
 '01-02-1964',
 '26-07-1975',
 '27-07-1975',
 '24-10-2001',
 '05-11-1995',
 '02-01-1970',
 '21-01-2003',
 '30-05-1996',
 '22-08-1978',
 '14-12-1991',
 '20-03-1987',
 '09-04-2001',
 '10-12-2003',
 '05-05-1988',
 '27-04-2001',
 '21-04-1969',
 '23-04-1982',
 '28-06-1990',
 '09-02-1994',
 '16-10-1970',
 '01-10-1996',
 '07-01-1979',
 '04-04-1995',
 '30-08-1999',
 '26-09-1993',
 '07-06-2003',
 '02-06-1999',
 '10-01-2003',
 '08-02-1972',
 '10-12-1993',
 '04-05-19

In [139]:
# Count NaN values
count_missing = df_records['date_of_birth'].isna().sum()
missing_percentages = (count_missing / len(df_records)) * 100

print(f"Number of missing (NaN) values: {count_missing}")
print(f"Percentage of missing values: {missing_percentages:.2f}%")

Number of missing (NaN) values: 0
Percentage of missing values: 0.00%


# Clean days_since_request Column


In [None]:
#Initial View of the Column
days_since_request_values = df_records['days_since_request'].unique
days_since_request_values

For this column, there were many entries with decimal places, indicating less than a day had passed. For this, we decided to round these down to zero and create a new flag column to reference all those accounts with same day requests. 

In [None]:
# Convert to whole days
df_records['days_since_request'] = df_records['days_since_request'].round().astype('Int64')

# Add a same-day flag
df_records['same_day'] = df_records['days_since_request'] < 1


In [None]:
#Initial View of the Column
days_since_request_values = df_records['days_since_request'].unique()
days_since_request_values

# Clean intended_balcon_amount Column

In [None]:
#Initial View of the Column
intended_balcon_amount_values = df_records['intended_balcon_amount'].tolist()
intended_balcon_amount_values

For this column, we will treat all values less than 0 as missing entries. We also rounded all values to the nearest dollar amount. 

In [None]:
# Round to 0 decimals and replace negatives with -1
df_records['intended_balcon_amount'] = df_records['intended_balcon_amount'].round(0)
df_records['intended_balcon_amount'] = df_records['intended_balcon_amount'].where(df_records['intended_balcon_amount'] >= 0, -1)


In [None]:
#Initial View of the Column
intended_balcon_amount_values = df_records['intended_balcon_amount'].unique()
intended_balcon_amount_values

In [None]:
# Replace -0.0 with 0.0
df_records['intended_balcon_amount'] = df_records['intended_balcon_amount'].replace(-0.0, 0.0)

# Clean payment_type Column

In [None]:
#Initial View of the Column
payment_type_values = df_records['payment_type'].unique()
payment_type_values

I asked Martin, the data scientist, about the mapping for this column. This is what I was provided with:
From what I recall of the mapping table:

AA → ACH / Direct Deposit

AB → Debit or Credit Card

AC → Check

AD → Cash (branch funding)

AE → Other / Unknown

In [None]:
# Define mapping table
payment_map = {
    "AA": "ACH / Direct Deposit",
    "AB": "Debit or Credit Card",
    "AC": "Check",
    "AD": "Cash (Branch Funding)",
    "AE": "Other / Unknown"
}

# Apply mapping, default to "Unknown" if unmapped
df_records['payment_type'] = df_records['payment_type'].map(payment_map).fillna("Unknown")


In [None]:
#Initial View of the Column
payment_type_values = df_records['payment_type'].unique()
payment_type_values

# Clean zip_count_4w Column


In [None]:
#Initial View of the Column
zip_count_4w_values = df_records['zip_count_4w'].tolist()
zip_count_4w_values

In [None]:
type_None =  df_records['zip_count_4w'].isna()
type_None.sum()

There does not seem to be anything missing or wrong with this column

# Clean velocity_6h column


In [None]:
#Initial View of the Column
velocity_6h_values = df_records['velocity_6h'].tolist()
velocity_6h_values

In [None]:
# Round to 0 decimals
df_records['velocity_6h'] = df_records['velocity_6h'].round(0)

# Replace negatives with -1
df_records['velocity_6h'] = df_records['velocity_6h'].where(df_records['velocity_6h'] >= 0, -1)

# Force clean integers (no -0.0, no float artifacts)
df_records['velocity_6h'] = df_records['velocity_6h'].astype(int)

Since we are looking at applications, I removed all decimals to ensure we are representing all applications as a whole. You cannot have a .5 application. Furthemore, for all negative values, I changed the values to -1 to represent missing values or errors in the data. We can impute these in the future.

In [None]:
#Initial View of the Column
velocity_6h_values = df_records['velocity_6h']
velocity_6h_values

# Clean velocity_24h column


In [None]:
#Initial View of the Column
velocity_24h_values = df_records['velocity_24h'].tolist()
velocity_24h_values

I am repeating the same process for this column as the previous

In [None]:
# Round to 0 decimals
df_records['velocity_24h'] = df_records['velocity_24h'].round(0)

# Replace negatives with -1
df_records['velocity_24h'] = df_records['velocity_24h'].where(df_records['velocity_24h'] >= 0, -1)

# Force clean integers (no -0.0, no float artifacts)
df_records['velocity_24h'] = df_records['velocity_24h'].astype(int)

In [None]:
#Initial View of the Column
velocity_24h_values = df_records['velocity_24h'].tolist()
velocity_24h_values

# Clean velocity_4w Column


In [None]:
#Initial View of the Column
velocity_4w_values = df_records['velocity_4w'].tolist()
velocity_4w_values

I am repeating the same process for this column as the previous

In [122]:
# Round to 0 decimals
df_records['velocity_4w'] = df_records['velocity_4w'].round(0)

# Replace negatives with -1
df_records['velocity_4w'] = df_records['velocity_4w'].where(df_records['velocity_4w'] >= 0, -1)

# Force clean integers (no -0.0, no float artifacts)
df_records['velocity_4w'] = df_records['velocity_4w'].astype(int)

In [None]:
#Initial View of the Column
velocity_4w_values = df_records['velocity_4w'].tolist()
velocity_4w_values

# Clean bank_branch_count_8w Column


In [None]:
bank_branch_count_8w_values = df_records['bank_branch_count_8w'].tolist()
bank_branch_count_8w_values

I'm changing this to an integer with no decimal places. There cannot be a .5 application. 

In [128]:
df_records['bank_branch_count_8w'] = df_records['bank_branch_count_8w'].astype(int)
bank_branch_count_8w_values = df_records['bank_branch_count_8w']
bank_branch_count_8w_values

0           3
1          11
2           1
3         705
4          28
         ... 
823546      9
823547    744
823548      0
823549      7
823550     14
Name: bank_branch_count_8w, Length: 823551, dtype: int64

# Clean date_of_birth_distinct_emails_4w Column

In [129]:
date_of_birth_distinct_emails_4w_values = df_records['date_of_birth_distinct_emails_4w']
date_of_birth_distinct_emails_4w_values.unique()

array([18., 13.,  6.,  5.,  8.,  7., 10., 20.,  9., 22., 15., 14., 16.,
        3., 11.,  2., 17.,  0., 12.,  4., 19.,  1., 23., 21., 28., 29.,
       26., 25., 24., 27., 32., 30., 33., 31., 34., 36., 35., 37., 39.,
       38.])

Chaning this cell to an integer data type, since these are adding whole applications together

In [130]:
df_records['date_of_birth_distinct_emails_4w'] = df_records['date_of_birth_distinct_emails_4w'].astype(int)

# Clean Employment Status

In [131]:
employment_status_values = df_records['employment_status']
employment_status_values.unique()

array(['CA', 'CB', 'CC', 'CF', 'CD', 'CE', 'CG'], dtype=object)

Based on what Martin said:

I wasn’t able to find a formal data dictionary with the mapping inside the uploaded DFS documents. But based on how the employment codes were set up in the old intake system, here’s the mapping we used internally:

CA → Employed full-time

CB → Employed part-time

CC → Self-employed

CD → Student

CE → Unemployed

CF → Retired

CG → Other / Not specified

Those codes slipped into the dataset when raw system values weren’t translated back to text labels. For modeling and reporting, we always remapped them to plain categories, and anything unusual (like a stray code outside A–G) was bucketed into “Unknown.”

In [132]:
# Define mapping dictionary
employment_map = {
    "CA": "Employed full-time",
    "CB": "Employed part-time",
    "CC": "Self-employed",
    "CD": "Student",
    "CE": "Unemployed",
    "CF": "Retired",
    "CG": "Other / Not specified"
}

# Apply mapping, default to "Other / Not specified" if unmapped
df_records['employment_status'] = df_records['employment_status'].map(employment_map)


In [133]:
employment_status_values = df_records['employment_status']
employment_status_values.unique()

array(['Employed full-time', 'Employed part-time', 'Self-employed',
       'Retired', 'Student', 'Unemployed', 'Other / Not specified'],
      dtype=object)