In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Load Data ---
try:
    df = pd.read_csv('./data/high_salary.csv')
    print("Successfully loaded 'high_salary.csv'")
    print("-" * 30)
except FileNotFoundError:
    print("Error: 'high_salary.csv' not found. Please make sure it's in the same directory.")
    exit()

# --- 2. Define Features (X) and Target (y) ---

# Drop identifier columns and redundant columns
# 'education' is dropped in favor of 'education-num'
# 'native-country-code' is dropped in favor of 'native-country'
try:
    columns_to_drop = ['id', 'social-security-number', 'house-number', 'education', 'native-country-code']
    X = df.drop(columns=columns_to_drop + ['label'])
    y = df['label']
    
    # Check for class balance
    print("Target variable 'label' distribution:")
    print(y.value_counts(normalize=True))
    print("-" * 30)

except KeyError as e:
    print(f"Error: A required column is missing. {e}")
    exit()

# --- 3. Identify Feature Types ---
# Identify numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")
print("-" * 30)

# --- 4. Split Data ---
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print("-" * 30)

# --- 5. Create Preprocessing Pipelines ---

# Pipeline for numerical features:
# 1. Impute missing values with the median
# 2. Scale features
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Pipeline for categorical features:
# 1. Impute missing values with the most frequent value
# 2. One-hot encode the categories
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# --- 6. Combine Pipelines with ColumnTransformer ---
# Create a preprocessor that applies the correct pipeline to each column type
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

# --- 7. Create and Train the Final Model ---
# Use a RandomForestClassifier as the model
model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Create the full pipeline: preprocess, then model
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Train the model
print("Training the model...")
full_pipeline.fit(X_train, y_train)
print("Model training complete.")
print("-" * 30)

# --- 8. Evaluate the Model ---
print("Evaluating model on the test set...")
# Make predictions on the test set
y_pred = full_pipeline.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")

# Print a detailed classification report
# Note: '1.0' likely means 'high salary' and '0.0' means 'low salary'
report = classification_report(y_test, y_pred)
print(report)

print("-" * 30)
print("Process finished.")

Successfully loaded 'high_salary.csv'
------------------------------
Target variable 'label' distribution:
label
0.0    0.580622
1.0    0.419378
Name: proportion, dtype: float64
------------------------------
Numerical features: ['age-group', 'fnlwgt', 'education-num', 'capitalgain', 'capitalloss', 'hoursperweek']
Categorical features: ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
------------------------------
Training set size: 16720 samples
Testing set size: 4180 samples
------------------------------
Training the model...
Model training complete.
------------------------------
Evaluating model on the test set...
Model Accuracy: 0.8031

Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      2427
         1.0       0.77      0.76      0.77      1753

    accuracy                           0.80      4180
   macro avg       0.80      0.80      0.80      4180
weighted

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score

print("Libraries imported successfully.")

# --- 1. Load Data ---
try:
    df = pd.read_csv('high_salary.csv')
    print("Successfully loaded 'high_salary.csv'")
    print("-" * 30)
except FileNotFoundError:
    print("Error: 'high_salary.csv' not found. Please make sure it's in the same directory.")
    
if 'df' in locals():
    
    # --- 2. Define Features (X) and Target (y) ---
    try:
        # Drop identifier columns and redundant/leaky columns
        columns_to_drop = ['id', 'social-security-number', 'house-number', 'education', 'native-country-code']
        X = df.drop(columns=columns_to_drop + ['label'])
        y = df['label']
        
        print("Target variable 'label' distribution (before split):")
        print(y.value_counts(normalize=True))
        print("-" * 30)

    except KeyError as e:
        print(f"Error: A required column is missing. {e}")

    # --- 3. Identify Feature Types ---
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()

    print(f"Numerical features: {numerical_features}")
    print(f"Categorical features: {categorical_features}")
    print("-" * 30)

    # --- 4. Split Data ---
    # Stratify=y ensures both train and test sets have a similar class distribution
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"Training set size: {X_train.shape[0]} samples")
    print(f"Testing set size: {X_test.shape[0]} samples")
    print("-" * 30)

    # --- 5. Create Preprocessing Pipelines ---
    
    # Pipeline for numerical features:
    # 1. Impute missing values with the median
    # 2. Scale features
    num_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Pipeline for categorical features:
    # 1. Impute missing values with the most frequent value
    # 2. One-hot encode the categories
    cat_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # --- 6. Combine Pipelines with ColumnTransformer ---
    # Create a preprocessor that applies the correct pipeline to each column type
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, numerical_features),
            ('cat', cat_pipeline, categorical_features)
        ],
        remainder='passthrough'
    )

    # --- 7. Create Model Pipeline ---
    # This pipeline will first preprocess the data, then train the RF model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(random_state=42, n_jobs=-1))
    ])

    # --- 8. Define Hyperparameter Grid for Tuning ---
    # This grid tells GridSearchCV which parameters to test
    param_grid = {
        'model__n_estimators': [100, 200],      # Number of trees
        'model__max_depth': [10, 20, 30],       # Max depth of trees
        'model__min_samples_leaf': [2, 4],    # Min samples at a leaf node
        'model__class_weight': ['balanced', None] # Key parameter to fight imbalance
    }

    # We will score based on 'f1_macro' to balance both classes' F1-scores
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=3, # 3-fold cross-validation
        scoring='f1_macro', 
        n_jobs=-1, # Use all available cores
        verbose=2  # Show progress
    )

    # --- 9. Train the Grid Search ---
    print("Starting hyperparameter tuning with GridSearchCV...")
    print("This may take several minutes...")
    grid_search.fit(X_train, y_train)
    print("Hyperparameter tuning complete.")
    print("-" * 30)

    # --- 10. Show Best Parameters and Evaluate ---
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best macro-F1 score during cross-validation: {grid_search.best_score_:.4f}")
    print("-" * 30)

    print("Evaluating the best model on the unseen test set...")
    # Get the best model found by the grid search
    best_model = grid_search.best_estimator_

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)

    # --- 11. Display Final Report ---
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Final Model Accuracy: {accuracy:.4f}")
    
    print("\nFinal Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

    # Print the specific F1-score for the high-salary class
    f1_class_1 = f1_score(y_test, y_pred, pos_label=1.0)
    print(f"\n---> F1-Score for class '1.0' (high-salary): {f1_class_1:.4f} <---")
    print("-" * 30)
    print("Process finished.")

Libraries imported successfully.
Error: 'high_salary.csv' not found. Please make sure it's in the same directory.
Target variable 'label' distribution (before split):
label
0.0    0.580622
1.0    0.419378
Name: proportion, dtype: float64
------------------------------
Numerical features: ['age-group', 'fnlwgt', 'education-num', 'capitalgain', 'capitalloss', 'hoursperweek']
Categorical features: ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
------------------------------
Training set size: 16720 samples
Testing set size: 4180 samples
------------------------------
Starting hyperparameter tuning with GridSearchCV...
This may take several minutes...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Hyperparameter tuning complete.
------------------------------
Best parameters found: {'model__class_weight': None, 'model__max_depth': 20, 'model__min_samples_leaf': 2, 'model__n_estimators': 100}
Best macro-F1 score during cross-vali