In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Step 1: Load and clean data
from ucimlrepo import fetch_ucirepo

adult = fetch_ucirepo(id=2)
adult_raw = pd.concat([adult.data.features, adult.data.targets], axis=1)

# Normalise the label text: trim surrounding whitespace and remove every
# literal '.' character from the income strings.
income_clean = adult_raw['income'].str.strip().str.replace('.', '', regex=False)

# Build the cleaned frame without mutating `adult_raw`: swap in the cleaned
# label, turn '?' placeholders into NaN, then drop rows containing any NaN.
# NOTE(review): dropna() considers every column of the frame, not only the
# ones listed in `selected_features` below - confirm that is intended.
df = (
    adult_raw
    .assign(income=income_clean)
    .replace('?', np.nan)
    .dropna()
)

# Step 2: Split features and target
selected_features = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'education-num',
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
]
X = df.loc[:, selected_features]
y = df.loc[:, 'income']

In [4]:
# Step 3: Separate numeric and categorical columns.
# Columns are selected by kind ('number' vs. everything else) rather than by
# exact dtype name, so the routing does not depend on integer width or on
# which string dtype pandas uses.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# Step 4: Train-test split on the RAW features.
# The split happens before any transformer is fitted so that the test rows
# never contribute to the scaler's mean/variance or to the encoder's category
# list. Fitting on the full data first would leak test information into
# training and bias the reported metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 5: Preprocess numeric columns (fit on train only, apply to test)
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_raw[num_cols])
X_test_num = scaler.transform(X_test_raw[num_cols])

# Step 6: Preprocess categorical columns (fit on train only, apply to test)
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train_raw[cat_cols])
X_test_cat = encoder.transform(X_test_raw[cat_cols])

# Step 7: Combine numeric and categorical data
X_train = np.hstack((X_train_num, X_train_cat))
X_test = np.hstack((X_test_num, X_test_cat))

# Full feature matrix, produced with the train-fitted transformers. Kept under
# its original name in case another cell refers to it.
X_processed = np.hstack((scaler.transform(X[num_cols]), encoder.transform(X[cat_cols])))

# Step 8: Train logistic regression without hyperparameter tuning
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Step 9: Evaluate on the held-out partition
y_pred = log_reg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8432283029297954
              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90      6803
        >50K       0.73      0.59      0.65      2242

    accuracy                           0.84      9045
   macro avg       0.80      0.76      0.77      9045
weighted avg       0.84      0.84      0.84      9045



In [6]:
import joblib

# Persist the fitted classifier. Assumes `log_reg` has already been trained
# in an earlier cell.
joblib.dump(log_reg, 'log_reg_model.pkl')

# The classifier was trained on scaled / one-hot-encoded inputs, so on its own
# it cannot score raw records. Save the fitted transformers and the column
# routing alongside it so inference can be reproduced in a fresh kernel.
joblib.dump(
    {
        'scaler': scaler,
        'encoder': encoder,
        'num_cols': num_cols,
        'cat_cols': cat_cols,
    },
    'log_reg_preprocessing.pkl',
)

['log_reg_model.pkl']

In [None]:
# Reload the classifier from disk. Only the model comes from the pickle; the
# `scaler`, `encoder`, `num_cols` and `cat_cols` used below are taken from the
# current kernel state (they must have been created by an earlier cell).
pkl_model = joblib.load('log_reg_model.pkl')

# A single raw record, keyed by the same feature names used for training.
sample = {
    'age': 37,
    'capital-gain': 0,
    'capital-loss': 0,
    'hours-per-week': 40,
    'education-num': 13,
    'workclass': 'Private',
    'marital-status': 'Married-civ-spouse',
    'occupation': 'Exec-managerial',
    'relationship': 'Husband',
    'sex': 'Male',
}

# Step 1: Wrap the record in a one-row DataFrame so columns can be picked by name
sample_df = pd.DataFrame(sample, index=[0])

# Step 2: Apply the already-fitted preprocessing (transform only, no refit)
sample_num = scaler.transform(sample_df[num_cols])   # scaled numeric block
sample_cat = encoder.transform(sample_df[cat_cols])  # one-hot categorical block

# Step 3: Join the two blocks column-wise, numeric first, matching the layout
# the model was trained on
sample_processed = np.concatenate([sample_num, sample_cat], axis=1)

# Step 4: Predict and report the label for the single row
prediction = pkl_model.predict(sample_processed)
predicted_label = prediction[0]
print("Prediction:", predicted_label)

Prediction: <=50K
