In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import joblib

In [47]:
# Load & clean the data
# Missing entries are marked with a "?" token. Depending on how the CSV was
# exported the token may or may not carry padding whitespace (" ?" vs "?"),
# so it is matched with a whitespace-tolerant regex rather than one exact
# spelling; an exact match on ' ?' would silently keep every "?" row when the
# file has no padding.
df = pd.read_csv("adult 3.csv")
df = df.replace(r"^\s*\?\s*$", np.nan, regex=True)
# Drop every row that still has at least one missing value. Reassigning (rather
# than mutating in place) keeps the cell safe to re-run.
df = df.dropna()

In [48]:
# Features and target
X = df.drop(columns="income")
# Binary target: 1 for the ">50K" class, 0 for everything else. The raw label is
# normalised first (surrounding whitespace and a trailing "." removed) so that
# spellings such as " >50K" or ">50K." are not silently mapped to 0.
y = (
    df["income"]
    .astype(str)
    .str.strip()
    .str.rstrip(".")
    .eq(">50K")
    .astype("int64")
)
# A single-class target means the label parsing above did not match the file's
# spelling (or the data is unusable); fail loudly instead of training on it.
if y.nunique() < 2:
    raise ValueError(
        "Target 'income' collapsed to a single class; "
        f"raw labels seen: {sorted(df['income'].astype(str).unique())}"
    )


In [49]:
# Identify categorical columns
# Every feature column stored with the generic "object" dtype is treated as
# categorical; the result is a plain list of column names.
categorical_cols = list(X.select_dtypes(include="object").columns)

In [50]:
# Preprocessing pipeline
# One-hot encode the categorical columns and pass every remaining column
# through unchanged. handle_unknown="ignore" lets the encoder accept category
# values at predict time that were not present during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",
)

In [51]:
# Create pipeline
# Chain the preprocessor with a 100-tree random forest (fixed seed). The steps
# are auto-named by make_pipeline; later cells look the preprocessor up under
# the name "columntransformer".
model = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=42, n_estimators=100),
)

In [52]:
# Train-test split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the pipeline
model.fit(X_train, y_train)

# Evaluate on the held-out split. It is created by the split cell but was never
# used, so the model would otherwise be saved without any check of its quality.
# For a classifier pipeline, score() reports mean accuracy.
test_accuracy = model.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f}")

In [None]:
# Save the trained model
# Persists the whole fitted pipeline (preprocessor + classifier) in one file.
# NOTE(review): the file name mentions LightGBM, but the pipeline built in this
# notebook wraps a RandomForestClassifier. The name is kept unchanged because
# other code may load the model by this path — confirm before renaming.
joblib.dump(model, filename="model_lightgbm_v2.pkl")

In [None]:
# ✅ Save feature names from the preprocessor
# Pull the fitted one-hot encoder out of the pipeline's column transformer and
# ask it for the names of the columns it produces for the categorical inputs.
fitted_preprocessor = model.named_steps["columntransformer"]
encoder = fitted_preprocessor.named_transformers_["cat"]
encoded_feature_names = encoder.get_feature_names_out(input_features=categorical_cols)


In [None]:
# Get all final feature names (encoded + numeric)
# Order: the one-hot encoded names first, then the columns that were not
# encoded, kept in their original X column order.
non_cat_cols = [name for name in X.columns if name not in categorical_cols]
final_feature_names = [*encoded_feature_names, *non_cat_cols]

In [None]:
# Save feature names
# Store the ordered list of post-preprocessing feature names in its own file.
joblib.dump(final_feature_names, filename="model_features.pkl")

print("✅ Model and feature names saved successfully.")
