## Dataiku ML Assignment

Goal: Identify the characteristics of individuals earning more or less than $50K.

## Import Packages

In [4]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For statistical data visualization
from sklearn.model_selection import train_test_split, RandomizedSearchCV  # For data splitting and hyperparameter tuning
from sklearn.ensemble import RandomForestClassifier  # For Random Forest classification
from sklearn.linear_model import LogisticRegression  # For Logistic Regression
from xgboost import XGBClassifier  # For XGBoost classification
from sklearn.preprocessing import MinMaxScaler, LabelEncoder  # For scaling and encoding data
from sklearn.metrics import (  # For model evaluation metrics
    classification_report,
    roc_auc_score,
    confusion_matrix,
    precision_recall_curve,
)
from sklearn.feature_selection import SelectKBest, chi2  # For feature selection
from imblearn.over_sampling import SMOTE  # For handling class imbalance through oversampling
from sklearn.metrics import f1_score # F-1 Score

## Load Data

In [6]:
def load_data(train_path, test_path):
    """
    Load and preprocess the training and test datasets.
    """
    # Define column names for the datasets based on provided metadata
    column_names = [
        'age', 'class_of_worker', 'detailed_industry_recode', 'detailed_occupation_recode', 'education', 
        'wage_per_hour', 'enroll_in_edu_inst_last_wk', 'marital_stat', 'major_industry_code', 
        'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_a_labor_union', 'reason_for_unemployment', 
        'full_or_part_time_employment_stat', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 
        'tax_filer_stat', 'region_of_previous_residence', 'state_of_previous_residence', 
        'detailed_household_and_family_stat', 'detailed_household_summary_in_household', 'instance_weight', 
        'migration_code_change_in_msa', 'migration_code_change_in_reg', 'migration_code_move_within_reg', 
        'live_in_this_house_1_year_ago', 'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer', 
        'family_members_under_18', 'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self', 
        'citizenship', 'own_business_or_self_employed', 'fill_inc_questionnaire_for_veterans_admin', 
        'veterans_benefits', 'weeks_worked_in_year', 'year', 'income'
    ]
    
    # Load the training dataset
    train_df = pd.read_csv(train_path, header=None)  # Load CSV without a header
    train_df.columns = column_names  # Assign column names to the DataFrame
    
    # Load the test dataset
    test_df = pd.read_csv(test_path, header=None)  # Load CSV without a header
    test_df.columns = column_names  # Assign column names

    # Return the raw loaded datasets for preprocessing
    return train_df, test_df

## Clean Data

In [8]:
def clean_data(train_df, test_df):
    """
    Handle missing values, filter rows, and drop duplicates.
    Returns cleaned and split training data, along with cleaned training and test datasets.
    Includes checks for income distribution before and after cleaning.
    """
    # Store original row counts
    original_train_rows = train_df.shape[0]
    original_test_rows = test_df.shape[0]

    # Log income distribution before cleaning
    print("\nIncome distribution (training data) before cleaning:")
    print(train_df['income'].value_counts())
    print("\nIncome distribution (test data) before cleaning:")
    print(test_df['income'].value_counts())

    # Clean training data
    train_df.replace(['?', '', ' '], np.nan, inplace=True)  # Replace placeholders with NaN
    train_df.dropna(inplace=True)  # Drop rows with missing values
    train_df.drop_duplicates(inplace=True)  # Drop duplicate rows
    train_df = train_df[
        (train_df['age'] >= 18)].copy()  # Explicit copy to avoid warnings
    if 'instance_weight' in train_df.columns:  # Drop 'instance_weight' column if it exists
        train_df.drop('instance_weight', axis=1, inplace=True)

    # Clean test data
    test_df.replace(['?', '', ' '], np.nan, inplace=True)  # Replace placeholders with NaN
    test_df.dropna(inplace=True)  # Drop rows with missing values
    test_df.drop_duplicates(inplace=True)  # Drop duplicate rows
    test_df = test_df[
        (test_df['age'] >= 18)].copy()  # Explicit copy to avoid warnings
    if 'instance_weight' in test_df.columns:  # Drop 'instance_weight' column if it exists
        test_df.drop('instance_weight', axis=1, inplace=True)

    # Count rows after cleaning
    cleaned_train_rows = train_df.shape[0]
    cleaned_test_rows = test_df.shape[0]

    # Log the number of rows before and after cleaning
    print(f"\nTraining Data: {original_train_rows} rows before cleaning, {cleaned_train_rows} rows after cleaning.")
    print(f"Test Data: {original_test_rows} rows before cleaning, {cleaned_test_rows} rows after cleaning.")

    # Log income distribution after cleaning
    print("\nIncome distribution (training data) after cleaning:")
    print(train_df['income'].value_counts())
    print("\nIncome distribution (test data) after cleaning:")
    print(test_df['income'].value_counts())

    # Split the training data into features and target variable
    X_train = train_df.drop('income', axis=1)  # Features
    y_train = train_df['income']  # Target variable

    # Split training data into training and validation sets
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Return splits, cleaned test data, and cleaned training data
    return train_df, (X_train_split, X_val_split, y_train_split, y_val_split), test_df

## Data Analysis 

In [10]:
def perform_eda(train_df):
    """
    Perform Exploratory Data Analysis (EDA) on the training dataset.
    This function generates various visualizations and saves them as image files 
    to analyze the distribution and relationships of key features in the dataset.
    """
    # Basic statistics summary
    print("Basic Statistics of Train Data:")
    print(train_df.describe())

    # Plot: Distribution of Income
    print("\nDistribution of Income:")
    sns.countplot(data=train_df, x='income', palette="viridis")
    plt.title('Income Distribution')
    plt.xlabel('Income (0 = Less than 50K, 1 = 50K+)')
    plt.ylabel('Count')
    plt.tight_layout()  # Ensure labels are not cut off
    plt.savefig('eda_income_distribution.png')
    plt.close()

    # Plot: Income vs Gender
    print("\nIncome vs Gender:")
    sns.countplot(data=train_df, x='sex', hue='income', palette="viridis")
    plt.title('Income by Gender')
    plt.xlabel('Gender (0 = Female, 1 = Male)')
    plt.ylabel('Count')
    plt.legend(title='Income', loc='upper right', labels=['Less than 50K', '50K+'])
    plt.tight_layout()
    plt.savefig('eda_income_by_gender.png')
    plt.close()

    # Plot: Income vs Education
    print("\nIncome vs Education:")
 
    sns.countplot(data=train_df, y='education', hue='income', palette="viridis",
                  order=train_df['education'].value_counts().index)
    plt.title('Income by Education Level')
    plt.xlabel('Count')
    plt.ylabel('Education Level')
    plt.legend(title='Income', loc='upper right', labels=['Less than 50K', '50K+'])
    plt.tight_layout()
    plt.savefig('eda_income_by_education.png')
    plt.close()

    # Plot: Income vs Race
    print("\nIncome vs Race:")
    sns.countplot(data=train_df, y='race', hue='income', palette="viridis",
                  order=train_df['race'].value_counts().index)
    plt.title('Income by Race')
    plt.xlabel('Count')
    plt.ylabel('Race')
    plt.legend(title='Income', loc='upper right', labels=['Less than 50K', '50K+'])
    plt.tight_layout()
    plt.savefig('eda_income_by_race.png')
    plt.close()

    # Plot: Age Distribution vs Income
    print("\nAge Distribution vs Income:")
    sns.histplot(data=train_df, x='age', hue='income', bins=20, kde=True, palette="viridis", multiple="stack")
    plt.title('Age Distribution by Income')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.legend(title='Income', loc='upper right', labels=['Less than 50K', '50K+'])
    plt.tight_layout()
    plt.savefig('eda_age_distribution.png')
    plt.close()

    # Plot: Marital Status Distribution vs Income
    plt.figure(figsize=(10, 6))
    sns.countplot(data=train_df, x='marital_stat', hue='income', palette="viridis")
    plt.title('Marital Status Distribution by Income')
    plt.xlabel('Marital Status')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.legend(title='Income', loc='upper right', labels=['Less than 50K', '50K+'])
    plt.tight_layout()
    plt.savefig('eda_marital_stat_income.png')
    plt.close()

    # Plot: Boxplots for Wage, Weeks Worked, and Capital Net Gain
    print("\nBoxplots for Wage, Weeks Worked, and Capital Net Gain:")
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    sns.boxplot(data=train_df, x='income', y='wage_per_hour', ax=axes[0], hue='income', palette="viridis")
    axes[0].set_title('Wage per Hour by Income')
    axes[0].set_xlabel('Income (0 = Less than 50K, 1 = 50K+)')
    axes[0].set_ylabel('Wage per Hour')

    sns.boxplot(data=train_df, x='income', y='weeks_worked_in_year', ax=axes[1], hue='income', palette="viridis")
    axes[1].set_title('Weeks Worked by Income')
    axes[1].set_xlabel('Income (0 = Less than 50K, 1 = 50K+)')
    axes[1].set_ylabel('Weeks Worked')

    sns.boxplot(data=train_df, x='income', y='capital_gains', ax=axes[2], hue='income', palette="viridis")
    axes[2].set_title('Capital Gains')
    axes[2].set_xlabel('Income (0 = Less than 50K, 1 = 50K+)')
    axes[2].set_ylabel('Capital Gains')
    
    plt.tight_layout()
    plt.savefig('eda_boxplots.png')
    plt.close()

## Process Data

In [12]:
def preprocess_data(train_df, splits, test_df):
    """
    Preprocess the data by performing feature engineering, label encoding, 
    and target variable transformation.
    """
    # Unpack splits
    X_train_split, X_val_split, y_train_split, y_val_split = splits

    # Feature engineering: Calculate net capital gain and drop redundant columns
    for dataset in [train_df, X_train_split, X_val_split]:
        dataset['capital_net_gain'] = dataset['capital_gains'] - dataset['capital_losses']  # Calculate net capital gain
        dataset.drop(columns=['capital_gains', 'capital_losses'], inplace=True)  # Drop original columns

    # Apply the same transformation to the test dataset
    test_df['capital_net_gain'] = test_df['capital_gains'] - test_df['capital_losses']  # Calculate net capital gain
    test_df.drop(columns=['capital_gains', 'capital_losses'], inplace=True)  # Drop original columns

    # Initialize a dictionary to store LabelEncoders for each categorical column
    label_encoders = {}

    # Encode categorical features using LabelEncoder
    for column in train_df.select_dtypes(include=['object']).columns:
        if column != 'income':  # Skip the target variable
            le = LabelEncoder()  # Initialize LabelEncoder
            train_df[column] = le.fit_transform(train_df[column])  # Fit and transform training data
            X_train_split[column] = le.transform(X_train_split[column])  # Transform training split
            X_val_split[column] = le.transform(X_val_split[column])  # Transform validation split
            test_df[column] = le.transform(test_df[column])  # Transform test data
            label_encoders[column] = le  # Store the encoder

    # Convert the target variable (`income`) to binary
    train_df['income'] = train_df['income'].apply(lambda x: 1 if x.strip() == '50000+.' else 0)
    test_df['income'] = test_df['income'].apply(lambda x: 1 if x.strip() == '50000+.' else 0)
    y_train_split = y_train_split.apply(lambda x: 1 if x.strip() == '50000+.' else 0)
    y_val_split = y_val_split.apply(lambda x: 1 if x.strip() == '50000+.' else 0)

    # Return the preprocessed datasets and the label encoders
    return train_df, (X_train_split, X_val_split, y_train_split, y_val_split), test_df, label_encoders

## Feature Engineering 

In [14]:
def feature_engineering(splits, test_df):
    """
    Perform feature engineering on training, validation, and test datasets.
    This function scales the data and retains all features for model training and evaluation.
    """
    X_train_split, X_val_split, y_train_split, y_val_split = splits

    # Scale data (fit scaler only on training data)
    scaler = MinMaxScaler()
    X_train_split_scaled = pd.DataFrame(scaler.fit_transform(X_train_split), columns=X_train_split.columns)
    X_val_split_scaled = pd.DataFrame(scaler.transform(X_val_split), columns=X_val_split.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(test_df.drop(columns=['income'])), columns=test_df.drop(columns=['income']).columns)

    # Retain all features
    selected_features = X_train_split.columns

    return (X_train_split_scaled, X_val_split_scaled, y_train_split, y_val_split), X_test_scaled, selected_features

## Plot Features 

In [16]:
def plot_feature_importance(model, features, model_name="Model"):
    """
    Visualizes the importance of features for models that support the feature_importances_ attribute (e.g., Random Forest, XGBoost).
    Saves the feature importance plot to a file.
    
    Args:
        model: The machine learning model.
        features (list): A list of feature names.
        model_name (str): The name of the model (used for saving the plot).
    """
    if hasattr(model, "feature_importances_"):
        importances = pd.DataFrame({
            'Feature': features,
            'Importance': model.feature_importances_
        }).sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=importances, x='Importance', y='Feature', dodge=False)
        plt.title(f"Feature Importance - {model_name}")
        plt.xlabel("Feature Importance Score")
        plt.ylabel("Features")
        
        # Adjust layout to prevent labels from getting cut off
        plt.tight_layout()
        
        # Save the plot
        filename = f"feature_importance_{model_name.replace(' ', '_').lower()}.png"
        plt.savefig(filename, bbox_inches='tight')
        print(f"Feature importance plot saved as {filename}")
        plt.close()
    else:
        print(f"{model_name} does not support feature importance.")

## Evaluate Models

In [18]:
def evaluate_on_validation(model, X_train_split, X_val_split, y_train_split, y_val_split):
    """
    Evaluates the performance of a trained model on the validation set.
    Outputs a classification report, AUC score, F1-Score, and visualizations.
    """
    model.fit(X_train_split, y_train_split)
    predictions = model.predict(X_val_split)
    probabilities = model.predict_proba(X_val_split)[:, 1]

    # Calculate AUC and F1-Score
    auc_score = roc_auc_score(y_val_split, probabilities)
    f1 = f1_score(y_val_split, predictions)

    print("\nValidation Set Evaluation:")
    print("Classification Report:")
    print(classification_report(y_val_split, predictions))
    print(f"AUC Score: {auc_score:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_val_split, probabilities)
    plt.plot(recall, precision, label=f'{type(model).__name__}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve (Validation)')
    plt.legend()
    filename_prc = f"precision_recall_curve_validation_{type(model).__name__.lower()}.png"
    plt.savefig(filename_prc)
    print(f"Precision-Recall Curve saved as {filename_prc}")
    plt.close()

    # Confusion Matrix
    cm = confusion_matrix(y_val_split, predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title("Confusion Matrix (Validation)")
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    filename_cm = f"confusion_matrix_validation_{type(model).__name__.lower()}.png"
    plt.savefig(filename_cm)
    print(f"Confusion Matrix saved as {filename_cm}")
    plt.close()

    return {'AUC': auc_score, 'F1-Score': f1}

## Imbalance & Train

In [20]:
def handle_imbalance(X_train_split, y_train_split):
    """
    Balances the training data using SMOTE (Synthetic Minority Oversampling Technique) to handle class imbalance.
    """
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_split, y_train_split)
    return X_resampled, y_resampled

def train_and_evaluate_models(X_train_split, X_val_split, y_train_split, y_val_split, selected_features):
    """
    Trains and evaluates three machine learning models (Random Forest, Logistic Regression, XGBoost) on the training and validation datasets. 
    Outputs performance metrics and feature importance plots.
    Calls evaluate_on_validation.
    Calls plot_feature_importance.
    """
    results = {}

    # Random Forest
    rf_model = RandomForestClassifier(random_state=42, n_estimators=100, class_weight="balanced")
    print("Random Forest:")
    rf_metrics = evaluate_on_validation(rf_model, X_train_split, X_val_split, y_train_split, y_val_split)
    plot_feature_importance(rf_model, selected_features, model_name="Random Forest")
    results['Random Forest'] = rf_metrics

    # Logistic Regression
    lr_model = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")
    print("Logistic Regression:")
    lr_metrics = evaluate_on_validation(lr_model, X_train_split, X_val_split, y_train_split, y_val_split)
    results['Logistic Regression'] = lr_metrics

    # XGBoost
    scale_pos_weight = (len(y_train_split) - sum(y_train_split)) / sum(y_train_split)
    print(f"Calculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")
    xgb_model = XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight, eval_metric='logloss')
    print("XGBoost:")
    xgb_metrics = evaluate_on_validation(xgb_model, X_train_split, X_val_split, y_train_split, y_val_split)
    plot_feature_importance(xgb_model, selected_features, model_name="XGBoost")
    results['XGBoost'] = xgb_metrics

    return results

## Model Comparison 

In [22]:
def model_comparison(results):
    """
    Compares the performance of machine learning models based on their AUC and F1 scores.
    Saves comparison plots for both metrics.
    
    Args:
        results (dict): A dictionary containing model names as keys and a dictionary of metrics (e.g., AUC, F1-Score) as values.
    """
    print("\n### Model Comparison Recap ###")
    model_names = list(results.keys())
    metrics = ['AUC', 'F1-Score']

    data = []
    for model, metrics_dict in results.items():
        row = [model] + [metrics_dict.get(metric, "N/A") for metric in metrics]
        data.append(row)

    comparison_df = pd.DataFrame(data, columns=['Model'] + metrics)
    print(comparison_df)

    # Plot AUC Comparison
    plt.figure(figsize=(10, 6))
    sns.barplot(x="AUC", y="Model", data=comparison_df, palette="viridis")
    plt.title("Model AUC Comparison")
    plt.xlabel("AUC Score")
    plt.ylabel("Model")
    plt.tight_layout()
    plt.savefig("model_auc_comparison.png", bbox_inches='tight')
    print("Model AUC Comparison saved as model_auc_comparison.png")
    plt.close()

    # Plot F1-Score Comparison
    plt.figure(figsize=(10, 6))
    sns.barplot(x="F1-Score", y="Model", data=comparison_df, palette="viridis")
    plt.title("Model F1-Score Comparison")
    plt.xlabel("F1-Score")
    plt.ylabel("Model")
    plt.tight_layout()
    plt.savefig("model_f1_score_comparison.png", bbox_inches='tight')
    print("Model F1-Score Comparison saved as model_f1_score_comparison.png")
    plt.close()

## Tune XGBoost

In [24]:
def hyperparameter_tuning(X_train_split, y_train_split, X_val_split, y_val_split):
    """
    Performs hyperparameter tuning on the XGBoost model using RandomizedSearchCV to optimize its parameters for better performance.
    """
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9]
    }

    xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

    xgb_random_search = RandomizedSearchCV(
        xgb_model,
        param_distributions=param_grid,
        n_iter=10,
        cv=3,
        scoring='roc_auc',
        random_state=42,
        n_jobs=-1
    )

    xgb_random_search.fit(X_train_split, y_train_split)

    best_params = xgb_random_search.best_params_
    print("Best Parameters for XGBoost:", best_params)

    xgb_optimized = XGBClassifier(**best_params, random_state=42, eval_metric='logloss')
    xgb_optimized.fit(X_train_split, y_train_split)

    return xgb_optimized

## Evaluate XGBoost 

In [26]:
def evaluate_on_test(model, X_test, y_test):
    """
    Evaluates the final model's performance on the test set.
    Outputs a classification report and AUC score.
    """
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)[:, 1]

    print("\nTest Set Evaluation:")
    print("Classification Report:")
    print(classification_report(y_test, predictions))

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, probabilities)
    plt.plot(recall, precision, label=f'{type(model).__name__}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve (Test)')
    plt.legend()
    filename_prc = f"precision_recall_curve_test_{type(model).__name__.lower()}.png"
    plt.savefig(filename_prc)
    print(f"Precision-Recall Curve saved as {filename_prc}")
    plt.close()

    # Confusion Matrix
    cm = confusion_matrix(y_test, predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title("Confusion Matrix (Test)")
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    filename_cm = f"confusion_matrix_test_{type(model).__name__.lower()}.png"
    plt.savefig(filename_cm)
    print(f"Confusion Matrix saved as {filename_cm}")
    plt.close()

    return roc_auc_score(y_test, probabilities)

## Main 

In [28]:
def main():
    train_path = '/Users/angieguerrero/Desktop/Dataiku Project/data/census_income_learn.csv'
    test_path = '/Users/angieguerrero/Desktop/Dataiku Project/data/census_income_test.csv'

    # Load and clean data
    train_df, test_df = load_data(train_path, test_path)

    # Clean data 
    train_df, splits, test_df = clean_data(train_df, test_df)

    # Perform initial EDA
    perform_eda(train_df)

    # Preprocessing
    train_df, splits, test_df, encoders = preprocess_data(train_df, splits, test_df)

    # Feature engineering
    splits, X_test_scaled, selected_features = feature_engineering(splits, test_df)

    # Handle class imbalance on training data
    X_train_split, X_val_split, y_train_split, y_val_split = splits
    X_train_split, y_train_split = handle_imbalance(X_train_split, y_train_split)

    # Train and evaluate models on validation set
    results = train_and_evaluate_models(X_train_split, X_val_split, y_train_split, y_val_split, selected_features)

    # Compare models
    model_comparison(results)

    # Hyperparameter tuning for the best model (XGBoost in this case)
    xgb_optimized = hyperparameter_tuning(X_train_split, y_train_split, X_val_split, y_val_split)

    # Evaluate the optimized model on the test set
    y_test = test_df['income']
    test_auc = evaluate_on_test(xgb_optimized, X_test_scaled, y_test)

if __name__ == "__main__":
    main()


Income distribution (training data) before cleaning:
income
 - 50000.    187141
 50000+.      12382
Name: count, dtype: int64

Income distribution (test data) before cleaning:
income
 - 50000.    93576
 50000+.      6186
Name: count, dtype: int64

Training Data: 199523 rows before cleaning, 143468 rows after cleaning.
Test Data: 99762 rows before cleaning, 72023 rows after cleaning.

Income distribution (training data) after cleaning:
income
 - 50000.    131088
 50000+.      12380
Name: count, dtype: int64

Income distribution (test data) after cleaning:
income
 - 50000.    65837
 50000+.      6186
Name: count, dtype: int64
Basic Statistics of Train Data:
                 age  detailed_industry_recode  detailed_occupation_recode  \
count  143468.000000             143468.000000               143468.000000   
mean       44.687484                 20.792177                   15.222781   
std        17.646275                 18.157176                   14.862288   
min        18.000000   


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=train_df, x='income', palette="viridis")



Income vs Gender:

Income vs Education:

Income vs Race:

Age Distribution vs Income:

Boxplots for Wage, Weeks Worked, and Capital Net Gain:
Random Forest:

Validation Set Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     26218
           1       0.57      0.55      0.56      2476

    accuracy                           0.92     28694
   macro avg       0.76      0.76      0.76     28694
weighted avg       0.92      0.92      0.92     28694

AUC Score: 0.9127
F1-Score: 0.5595
Precision-Recall Curve saved as precision_recall_curve_validation_randomforestclassifier.png
Confusion Matrix saved as confusion_matrix_validation_randomforestclassifier.png
Feature importance plot saved as feature_importance_random_forest.png
Logistic Regression:

Validation Set Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88     26218
    


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="AUC", y="Model", data=comparison_df, palette="viridis")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="F1-Score", y="Model", data=comparison_df, palette="viridis")


Best Parameters for XGBoost: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.2}

Test Set Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     65837
           1       0.66      0.52      0.58      6186

    accuracy                           0.94     72023
   macro avg       0.81      0.75      0.77     72023
weighted avg       0.93      0.94      0.93     72023

Precision-Recall Curve saved as precision_recall_curve_test_xgbclassifier.png
Confusion Matrix saved as confusion_matrix_test_xgbclassifier.png
