In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Step 1: Load and clean data
from ucimlrepo import fetch_ucirepo

# Fetch UCI repository dataset id=2; features and targets come back as
# separate frames and are joined column-wise into one table.
adult = fetch_ucirepo(id=2)

# Build the cleaned frame in a single chain (no in-place mutation):
#   * income labels: trim surrounding whitespace and remove every literal '.'
#     (presumably some labels carry a trailing dot -- confirm against raw data)
#   * '?' placeholders become NaN
#   * any row that still has a missing value is dropped
df = (
    pd.concat([adult.data.features, adult.data.targets], axis=1)
    .assign(
        income=lambda frame: frame['income']
        .str.strip()
        .str.replace('.', '', regex=False)
    )
    .replace('?', np.nan)
    .dropna()
)

# Step 2: Split features and target
# First five entries are numeric columns, the remaining five are categorical
# (the dtype-based separation happens in the next cell).
selected_features = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'education-num',
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
]
X = df.loc[:, selected_features]
y = df.loc[:, 'income']

In [3]:
# Step 3: Separate numeric and categorical columns
# Column dtypes describe the schema only (no fitted statistics), so looking at
# the full feature frame here does not leak information from the test rows.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Step 4: Train-test split on the RAW features
# The split must happen BEFORE any transformer is fitted. Fitting the scaler /
# encoder on all rows would let test-set means, variances and category lists
# influence preprocessing (data leakage) and make the test score optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 5: Preprocess numeric columns (fit on training rows only)
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_raw[num_cols])
X_test_num = scaler.transform(X_test_raw[num_cols])

# Step 6: Preprocess categorical columns (fit on training rows only)
# drop='first' removes one indicator column per feature; with
# handle_unknown='ignore' a category that was not seen during fit is encoded
# as all zeros for that feature instead of raising.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train_raw[cat_cols])
X_test_cat = encoder.transform(X_test_raw[cat_cols])

# Step 7: Combine numeric and categorical data
X_train = np.hstack((X_train_num, X_train_cat))
X_test = np.hstack((X_test_num, X_test_cat))

# Sanity checks: both matrices share one feature layout and keep their rows.
assert X_train.shape[1] == X_test.shape[1], "train/test feature width mismatch"
assert X_train.shape[0] == len(y_train) and X_test.shape[0] == len(y_test)

# Step 8: Random Forest with basic hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],             # Number of trees
    'max_depth': [None, 10, 20],            # Allow full depth and a couple restricted options
    'min_samples_split': [2, 5],            # Regular vs. slightly conservative
    'min_samples_leaf': [1, 2],             # Prevent very small leaves
    'class_weight': [None, 'balanced']      # Handle class imbalance
}

rf = RandomForestClassifier(random_state=42)
# 3-fold CV over the grid, selecting by accuracy; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 9: Evaluate best model
# best_estimator_ is the winning configuration refit on the whole training set;
# the held-out test rows are touched only here.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.5s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.5s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.5s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.6s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.6s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=2,

In [4]:
# Persist everything inference needs: the tuned forest plus the fitted
# scaler and encoder that produced its input features.
# (joblib is already imported in the notebook's import cell.)
for fitted_object, target_file in (
    (best_rf, 'rf_model.pkl'),
    (scaler, 'scaler.pkl'),
):
    joblib.dump(fitted_object, target_file)

# Kept as the cell's final expression so the written path is still echoed.
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']

In [5]:
# Reload ALL persisted artifacts, not just the model, so this cell exercises
# the exact objects an inference process would get from disk instead of
# silently relying on `scaler` / `encoder` still living in the kernel.
# SECURITY NOTE: joblib.load unpickles arbitrary Python objects -- only load
# files that you (or a trusted pipeline) produced.
pkl_model = joblib.load('rf_model.pkl')
pkl_scaler = joblib.load('scaler.pkl')
pkl_encoder = joblib.load('encoder.pkl')

# Both transformers were fitted on DataFrame slices, so they record the column
# names (and order) they expect; read them back rather than depending on
# notebook variables from the training cell.
sample_num_cols = list(pkl_scaler.feature_names_in_)
sample_cat_cols = list(pkl_encoder.feature_names_in_)

sample = {
    'age': 37,
    'capital-gain': 0,
    'capital-loss': 0,
    'hours-per-week': 40,
    'education-num': 11,
    'workclass': 'Private',
    'marital-status': 'Married-civ-spouse',
    'occupation': 'Exec-managerial',
    'relationship': 'Husband',
    'sex': 'Male'
}

# Step 1: Convert to DataFrame (one row)
sample_df = pd.DataFrame([sample])

# Fail early with a readable message if the sample lacks a required column.
missing_cols = [
    col for col in sample_num_cols + sample_cat_cols if col not in sample_df.columns
]
assert not missing_cols, f"sample is missing expected columns: {missing_cols}"

# Step 2: Preprocess
# Scale numeric columns with the statistics learned at training time
sample_num = pkl_scaler.transform(sample_df[sample_num_cols])

# Encode categorical columns with the categories learned at training time
sample_cat = pkl_encoder.transform(sample_df[sample_cat_cols])

# Step 3: Combine (numeric block first, then one-hot block -- same layout as training)
sample_processed = np.hstack((sample_num, sample_cat))

# Step 4: Predict
prediction = pkl_model.predict(sample_processed)
print("Prediction:", prediction[0])

Prediction: >50K
