In [10]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Step 1: Load and clean data
from ucimlrepo import fetch_ucirepo
adult = fetch_ucirepo(id=2)
df = pd.concat([adult.data.features, adult.data.targets], axis=1)

df['income'] = df['income'].str.strip().str.replace('.', '', regex=False)
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Step 2: Split features and target
selected_features = [
    'age', 'capital-gain', 'capital-loss', 'hours-per-week', 'education-num',
    'workclass', 'marital-status', 'occupation', 'relationship', 'sex'
]
X = df[selected_features]
y = df['income']

In [4]:
# Step 3: Separate numeric and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Step 4: Preprocess numeric columns
scaler = StandardScaler()
X_num = scaler.fit_transform(X[num_cols])

# Step 5: Preprocess categorical columns
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
X_cat = encoder.fit_transform(X[cat_cols])

# Step 6: Combine numeric and categorical data
X_processed = np.hstack((X_num, X_cat))

In [12]:
# Step 7: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, stratify=y, random_state=42)

# Step 8: Logistic Regression with basic hyperparameter tuning
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

log_reg = LogisticRegression(random_state=42)
grid_search = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 9: Evaluate best model
best_log_reg = grid_search.best_estimator_
y_pred = best_log_reg.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] END C=0.01, class_weight=None, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END C=0.01, class_weight=None, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END C=0.01, class_weight=None, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END C=0.01, class_weight=None, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END C=0.01, class_weight=None, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END C=0.01, class_weight=None, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END .C=0.01, class_weight=None, penalty=l1, solver=saga; total time=   0.2s
[CV] END .C=0.01, class_weight=None, penalty=l1, solver=saga; total time=   0.2s
[CV] END .C=0.01, class_weight=None, penalty=l1, solver=saga; total time=   0.2s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END



[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=saga; total time=   0.6s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=saga; total time=   1.3s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.7s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ....C=1, class_weight=None, penalty=l1, solver=saga; total time=   1.2s
[CV] END ....C=1, class_weight=None, penalty=l2, solver=saga; total time=   1.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.8s




[CV] END ....C=1, class_weight=None, penalty=l2, solver=saga; total time=   1.1s
[CV] END C=1, class_weight=None, penalty=l1, solver=liblinear; total time=   0.8s




[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=   1.3s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.9s
[CV] END ....C=1, class_weight=None, penalty=l1, solver=saga; total time=   1.2s
[CV] END C=10, class_weight=None, penalty=l2, solver=liblinear; total time=   0.2s
[CV] END C=10, class_weight=None, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ....C=1, class_weight=None, penalty=l2, solver=saga; total time=   1.1s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   1.3s
[CV] END C=10, class_weight=None, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   1.2s
[CV] END C=10, class_weight=None, penalty=l1, solver=liblinear; total time=   1.0s




[CV] END ....C=1, class_weight=None, penalty=l1, solver=saga; total time=   1.2s
[CV] END C=10, class_weight=None, penalty=l1, solver=liblinear; total time=   0.9s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=   1.3s




[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=   1.4s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.9s
[CV] END ...C=10, class_weight=None, penalty=l2, solver=saga; total time=   1.0s
[CV] END ...C=10, class_weight=None, penalty=l1, solver=saga; total time=   1.5s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.9s
[CV] END ...C=10, class_weight=None, penalty=l2, solver=saga; total time=   1.1s




[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   1.1s
[CV] END C=10, class_weight=None, penalty=l1, solver=liblinear; total time=   1.1s
[CV] END ...C=10, class_weight=None, penalty=l2, solver=saga; total time=   1.1s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.2s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.1s




[CV] END ...C=10, class_weight=None, penalty=l1, solver=saga; total time=   1.4s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.9s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=   1.1s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   1.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=   1.2s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   1.0s
[CV] END ...C=10, class_weight=None, penalty=l1, solver=saga; total time=   1.2s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   1.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=   1.2s
Best Parameters: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Test Accuracy: 0.8425649530127142
              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90      6803
        >50K       0.73      0.58      0.65      2242

    accuracy                           0.84      9045
   macro avg       0.80      0.76      0.77      9045
weighted avg       0.84      0.

In [None]:
joblib.dump(best_log_reg, 'pkl/log_reg_model.pkl')
joblib.dump(scaler, 'pkl/scaler.pkl')
joblib.dump(encoder, 'pkl/encoder.pkl')

['pkl/encoder.pkl']