In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


In [2]:
# Read the processed census-income "learn" CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/processed/census_income_learn.csv')

| Column Name                                 | Data Type | Required | Mapping | Post Processing|
|---------------------------------------------|-----------|----------|---------|----------------|
| age                                         | int64     | y        |         |ok, only over 18    |
| class_of_worker                             | object    | y        | map     |ok, keep only 5 cats|
| detailed_industry_recode                    | object    | y        | map     |no, high corr with class_of_worker|
| detailed_occupation_recode                  | object    | y        | map     |no, high corr with class_of_worker|
| education                                   | object    | y        | map     |ok|
| wage_per_hour                               | int64     | y        |         |ok|
| enroll_in_edu_inst_last_wk                  | object    | y        |         |ok|
| marital_stat                                | object    | y        | map     |ok|
| major_industry_code                         | object    | y        |         |ok|
| major_occupation_code                       | object    | y        |         |no|
| race                                        | object    | y        |         |ok|
| hispanic_origin                             | object    | n        |         |not relevant|
| sex                                         | object    | y        |         |ok, might be sensitive|
| member_of_a_labor_union                     | object    | y        |         |ok, just keep 'Yes'|
| reason_for_unemployment                     | object    | y        |         |ok|
| full_or_part_time_employment_stat           | object    | y        | map     |ok|
| capital_gains                               | int64     | y        |         |ok|
| capital_losses                              | int64     | y        |         |ok|
| dividends_from_stocks                       | int64     | y        |         |ok|
| tax_filer_stat                              | object    | y        | map     |ok|
| region_of_previous_residence                | object    | y        |         |ok. just four cats|
| state_of_previous_residence                 | object    | n        |         |no, too many cats|
| detailed_household_and_family_stat          | object    | y        | map     |ok|
| detailed_household_summary_in_household     | object    | y        | map     |no, too many cats|
| instance_weight                             | float64   | n        |         |no, instructed|
| migration_code_change_in_msa                | object    | y        | map     |no, high corr with move_reg|
| migration_code_change_in_reg                | object    | y        |         |ok, only two cats|
| migration_code_move_within_reg              | object    | n        |         |no, high corr with move_reg|
| live_in_this_house_1_year_ago               | object    | y        |         |ok|
| migration_prev_res_in_sunbelt               | object    | y        |         |ok|
| num_persons_worked_for_employer             | int64     | y        |         |ok|
| family_members_under_18                     | object    | n        |         |ok|
| country_of_birth_father                     | object    | y        |         |no, high corr with citizenship|
| country_of_birth_mother                     | object    | y        |         |no, high corr with citizenship|
| country_of_birth_self                       | object    | y        |         |no, high corr with citizenship|
| citizenship                                 | object    | y        |         |ok|
| own_business_or_self_employed               | object    | y        | map     |ok|
| fill_inc_questionnaire_for_veterans_admin   | object    | y        |         |no, using veterans_benefits instead|
| veterans_benefits                           | object    | y        |         |ok|
| weeks_worked_in_year                        | int64     | y        |         |ok|
| year                                        | int64     | y        |         |not related|
| target                                      | object    | T        |         |Target|

In [3]:
# Columns marked "ok" in the review table, plus the label column.
# The order is significant: it fixes the column order of every frame
# derived from df_selected, so it is kept exactly as reviewed.
features_ok = [
    'age', 'class_of_worker', 'education', 'wage_per_hour',
    'enroll_in_edu_inst_last_wk', 'marital_stat', 'major_industry_code',
    'race', 'sex', 'member_of_a_labor_union', 'reason_for_unemployment',
    'full_or_part_time_employment_stat',
    'capital_gains', 'capital_losses', 'dividends_from_stocks',
    'tax_filer_stat', 'region_of_previous_residence',
    'detailed_household_and_family_stat', 'migration_code_change_in_reg',
    'live_in_this_house_1_year_ago', 'migration_prev_res_in_sunbelt',
    'num_persons_worked_for_employer', 'family_members_under_18',
    'citizenship', 'own_business_or_self_employed', 'veterans_benefits',
    'weeks_worked_in_year',
    'target',
]

# Keep every row, but only the reviewed columns.
df_selected = df.loc[:, features_ok]

In [4]:
# Split off the non-object (numeric) columns of the selected frame.
# Note: 'target' is numeric too, so it is part of this group.
df_numeric = df_selected.select_dtypes(exclude=['object'])
numeric_cols = df_numeric.columns

print(f"Number of numeric columns: {len(numeric_cols)}")
print("\nNumeric columns:")
print(numeric_cols.tolist())

Number of numeric columns: 8

Numeric columns:
['age', 'wage_per_hour', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'num_persons_worked_for_employer', 'weeks_worked_in_year', 'target']


In [5]:
# Object-typed columns are treated as the categorical features.
categorical_cols = df_selected.select_dtypes(include=['object']).columns

# One-hot encode every level; no reference level is dropped at this stage.
df_binary = pd.get_dummies(df_selected.loc[:, categorical_cols], drop_first=False)

# Normalise the indicator names: spaces become underscores, then lowercase.
df_binary.columns = df_binary.columns.str.replace(' ', '_').str.lower()

print(f"Original shape: {df_selected.shape}")
print(f"Binary shape: {df_binary.shape}")
print("\nBinary columns:")
print(df_binary.columns.tolist())

Original shape: (196294, 28)
Binary shape: (196294, 99)

Binary columns:
['class_of_worker_government', 'class_of_worker_not_employed', 'class_of_worker_not_in_universe', 'class_of_worker_private_sector', 'class_of_worker_self-employed', 'education_advanced_degree', 'education_below_high_school', 'education_children', 'education_college_graduate', 'education_high_school_graduate', 'education_some_college', 'enroll_in_edu_inst_last_wk__college_or_university', 'enroll_in_edu_inst_last_wk__high_school', 'enroll_in_edu_inst_last_wk__not_in_universe', 'marital_stat_divorced', 'marital_stat_married', 'marital_stat_married-spouse_absent', 'marital_stat_never_married', 'marital_stat_separated', 'marital_stat_widowed', 'major_industry_code__agriculture', 'major_industry_code__armed_forces', 'major_industry_code__business_and_repair_services', 'major_industry_code__communications', 'major_industry_code__construction', 'major_industry_code__education', 'major_industry_code__entertainment', 'major

In [6]:
# Indicator (one-hot) columns retained for modelling, in their original order.
# Each entry pairs a column-name prefix with the levels that are kept for it.
# The prefix includes its separator on purpose: some features use a single
# underscore and others a double one, and the names must match df_binary's
# column names exactly.
bin_cols_to_keep = [
    prefix + level
    for prefix, levels in (
        ('class_of_worker_', (
            'government', 'not_employed', 'private_sector', 'self-employed',
        )),
        ('education_', (
            'advanced_degree', 'below_high_school', 'college_graduate',
            'high_school_graduate', 'some_college',
        )),
        ('enroll_in_edu_inst_last_wk__', (
            'college_or_university', 'high_school',
        )),
        ('marital_stat_', (
            'divorced', 'married', 'never_married', 'separated', 'widowed',
        )),
        ('major_industry_code__', (
            'agriculture',
            'armed_forces',
            'business_and_repair_services',
            'communications',
            'construction',
            'education',
            'entertainment',
            'finance_insurance_and_real_estate',
            'forestry_and_fisheries',
            'hospital_services',
            'manufacturing-durable_goods',
            'manufacturing-nondurable_goods',
            'medical_except_hospital',
            'mining',
            'other_professional_services',
            'personal_services_except_private_hh',
            'private_household_services',
            'public_administration',
            'retail_trade',
            'social_services',
            'transportation',
            'utilities_and_sanitary_services',
            'wholesale_trade',
        )),
        ('race__', (
            'amer_indian_aleut_or_eskimo', 'asian_or_pacific_islander',
            'black', 'white',
        )),
        ('sex__', ('male',)),
        ('member_of_a_labor_union__', ('no', 'yes')),
        ('reason_for_unemployment_', (
            'job_leaver', 'job_loser', 'new_entrant', 're-entrant',
        )),
        ('full_or_part_time_employment_stat_', (
            'fte', 'not_employed', 'pte',
        )),
        ('tax_filer_stat_', ('individual_filer', 'non-filer')),
        ('region_of_previous_residence__', (
            'abroad', 'midwest', 'northeast', 'south', 'west',
        )),
        ('detailed_household_and_family_stat_', (
            'child', 'extended_family', 'primary_householder',
        )),
        ('migration_code_change_in_reg_', ('different_area', 'same_area')),
        ('live_in_this_house_1_year_ago__', ('no', 'yes')),
        ('migration_prev_res_in_sunbelt__', ('no', 'yes')),
        ('family_members_under_18__', (
            'both_parents_present', 'father_only_present',
            'mother_only_present', 'neither_parent_present',
        )),
        ('citizenship_', ('foreign', 'native')),
        ('own_business_or_self_employed_', ('no', 'yes')),
        ('veterans_benefits_', ('not_a_veteran', 'veteran')),
    )
    for level in levels
]


In [7]:
# Restrict the indicator frame to the columns listed in bin_cols_to_keep
# (same column order as that list).
df_binary = df_binary.loc[:, bin_cols_to_keep]


Unnamed: 0,age,wage_per_hour,capital_gains,capital_losses,dividends_from_stocks,num_persons_worked_for_employer,weeks_worked_in_year,target,class_of_worker_government,class_of_worker_not_employed,...,family_members_under_18__both_parents_present,family_members_under_18__father_only_present,family_members_under_18__mother_only_present,family_members_under_18__neither_parent_present,citizenship_foreign,citizenship_native,own_business_or_self_employed_no,own_business_or_self_employed_yes,veterans_benefits_not_a_veteran,veterans_benefits_veteran
0,73,0,0,0,0,0,0,1,False,False,...,False,False,False,False,False,True,False,False,True,False
1,58,0,0,0,0,1,52,1,False,False,...,False,False,False,False,False,True,False,False,True,False
2,18,0,0,0,0,0,0,1,False,False,...,False,False,False,False,True,False,False,False,True,False
3,9,0,0,0,0,0,0,1,False,False,...,True,False,False,False,False,True,False,False,False,False
4,10,0,0,0,0,0,0,1,False,False,...,True,False,False,False,False,True,False,False,False,False


In [9]:
# Sanity check: report the (rows, columns) of the assembled df_prepared frame.
print('final shape:', df_prepared.shape)

final shape: (196294, 87)


In [13]:
# Preview the first five rows, transposed so each feature is on its own line.
df_prepared.head(5).transpose()

Unnamed: 0,0,1,2,3,4
age,73,58,18,9,10
wage_per_hour,0,0,0,0,0
capital_gains,0,0,0,0,0
capital_losses,0,0,0,0,0
dividends_from_stocks,0,0,0,0,0
...,...,...,...,...,...
citizenship_native,True,True,False,True,True
own_business_or_self_employed_no,False,False,False,False,False
own_business_or_self_employed_yes,False,False,False,False,False
veterans_benefits_not_a_veteran,True,True,True,False,False


In [14]:
# Inspect the label column as a NumPy array.
df_prepared.loc[:, 'target'].to_numpy()

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

Train data: ###############################
Number of numeric columns: 8
Original shape: (196294, 28)
Binary shape: (196294, 99)
X_train shape: (196294, 86)

Test data: ###############################
Number of numeric columns: 8
Original shape: (98879, 28)
Binary shape: (98879, 99)
X_test shape: (98879, 86)


ModuleNotFoundError: No module named 'imblearn'

In [8]:
# Baseline run: fit and score via train_and_evaluate_rf (defined elsewhere in
# this notebook) on the prepared train/test splits.
results = train_and_evaluate_rf(X_train, X_test, y_train, y_test)

# Keep a handle on the fitted estimator for later cells.
model = results['model']

# Per-class precision / recall / F1 table.
print("Model Performance:")
print(results['metrics']['classification_report'])

# Chart the feature importances reported by the run.
plot_feature_importance(results['metrics']['feature_importance'])

Model Performance:
              precision    recall  f1-score   support

           0       0.66      0.43      0.52      6186
           1       0.96      0.99      0.97     92693

    accuracy                           0.95     98879
   macro avg       0.81      0.71      0.75     98879
weighted avg       0.94      0.95      0.95     98879



In [17]:

# Everything from position 7 onward; per the output these are the one-hot
# indicator columns (positions 0-6 appear to be the numeric features - confirm
# if the column layout of X_train changes).
X_train.columns[7:]

Index(['class_of_worker_government', 'class_of_worker_not_employed',
       'class_of_worker_private_sector', 'class_of_worker_self-employed',
       'education_advanced_degree', 'education_below_high_school',
       'education_college_graduate', 'education_high_school_graduate',
       'education_some_college',
       'enroll_in_edu_inst_last_wk__college_or_university',
       'enroll_in_edu_inst_last_wk__high_school', 'marital_stat_divorced',
       'marital_stat_married', 'marital_stat_never_married',
       'marital_stat_separated', 'marital_stat_widowed',
       'major_industry_code__agriculture', 'major_industry_code__armed_forces',
       'major_industry_code__business_and_repair_services',
       'major_industry_code__communications',
       'major_industry_code__construction', 'major_industry_code__education',
       'major_industry_code__entertainment',
       'major_industry_code__finance_insurance_and_real_estate',
       'major_industry_code__forestry_and_fisheries',
     

In [None]:
# SMOTE-NC needs the positions of the categorical columns within X_train.
# The earlier placeholder [0, 2, 5] referred to positions that precede the
# one-hot indicator block (which starts at position 7), so the wrong columns
# would have been flagged as categorical. Derive the positions from the
# indicator column names instead, so they stay correct if the layout changes.
categorical_features = np.flatnonzero(
    X_train.columns.isin(bin_cols_to_keep)
).tolist()

# Fail early with a clear message rather than handing SMOTE-NC an empty list.
if not categorical_features:
    raise ValueError(
        "None of the columns in bin_cols_to_keep were found in X_train; "
        "cannot determine categorical feature indices for SMOTE-NC."
    )

# Train the model with SMOTE-NC oversampling enabled.
# NOTE(review): this path presumably needs the `imblearn` package, whose
# import failed earlier in this notebook - install it before running this cell.
results = train_and_evaluate_rf(X_train, X_test, y_train, y_test,
                              use_smote=True,
                              categorical_features=categorical_features)

# Print the classification report
print("\nModel Performance:")
print(results['metrics']['classification_report'])

# Plot feature importance
plot_feature_importance(results['metrics']['feature_importance'])