https://archive.ics.uci.edu/dataset/2/adult

Predictive Analytics: The Power to Predict Who Will Click, Buy, Lie, or Die
by Eric Siegel

In [318]:
import pandas as pd
import numpy as np

import requests  # for my helper function file
from pathlib import Path  # for my helper function file

# import the ensemble assignment
from sklearn.ensemble import (
    BaggingClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
    VotingClassifier,
)

# import models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


# try unsupervised learning CANNOT USE BOOSTING OR VOTING ON UNSUPERVISED
from sklearn.cluster import (
    KMeans,
)  # remmeber to use KMeans++ - bagging and stacking allowed
from sklearn.cluster import DBSCAN  # totally just extra - bagging and stacking allowed

# import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# import pipeline
from sklearn.pipeline import Pipeline

# import ability to visualize data
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# import cross validation
from sklearn.model_selection import cross_val_score

In [319]:
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

In [320]:
# Load the 'adult.data' file with named columns
data_df = pd.read_csv("adult.data", names=column_names)

# Load the 'adult.test' file with named columns
test_df = pd.read_csv("adult.test", names=column_names)
# Combine the two DataFrames
df = pd.concat([data_df, test_df], ignore_index=True)

# Save the combined DataFrame to a new CSV file (optional)
df.to_csv("adult_combined.csv", index=False)

print("Combined dataset with named columns created successfully!")

Combined dataset with named columns created successfully!


In [321]:
columns_to_encode = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    "income",
]

label_encoders = {}

for column in columns_to_encode:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [322]:
# df["income"].value_counts()

In [323]:
# df['income'] = df['income'].isin(["<=50K", "<=50K.", ">50K", ">50K."]).astype(int)
# df['income_bin'] = df['income'].map(lambda x: 0 if x in ["<=50K", "<=50K."] else 1)
# df['income'] = df['income'].astype(str).str.strip().str.lower().map(lambda x: 0 if x == "<=50K" else 1)

# df['income'] = df['income'].astype(str).str.replace(r'[^\w\s<>=]', '', regex=True).str.strip()  # Remove punctuation and extra spaces
# df['income_bin'] = np.where(df['income'].isin(['<=50K', '<=50K.']), 0, 1)

In [324]:
# clean data
# df_filtered = df[df['native-country'] == "United-States"]
# df_filtered = df[lambda x: x['native-country'] == "United-States"]
# # df["native-country"].value_counts()
# df.head()

In [325]:
# df_filtered.count

In [326]:
# df.dtypes

In [327]:
# df_corr = df.drop(columns= ["workclass", "education", "marital-status", "occupation","relationship", "race", "sex", "native-country", "income"])

In [328]:
# correlation_matrix = df_corr.corr()
# correlation_matrix

In [329]:
# df_dummies = pd.get_dummies(df, columns=["race", "marital-status"], drop_first=True, dummy_na=True, dtype=bool)

# married_statuses = [
#     'Married-civ-spouse', 'Married-AF-spouse', 'Married-spouse-absent'
# ]
# df['marital_status_bin'] = df['marital-status'].apply(lambda x: 1 if x in married_statuses else 0)
# df_dummies.head()

In [330]:
# Identify boolean columns
# bool_cols = df.select_dtypes(include='bool').columns

# # Convert boolean columns to integers (0s and 1s)
# df[bool_cols] = df[bool_cols].astype(int)

# df.head()

In [331]:
# # Get categorical columns
# categorical_cols = df.select_dtypes(include=['object']).columns

# # Create dummy variables for all categorical columns
# df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# df.head()

In [335]:
# train_test_split
minMax_Scaled = MinMaxScaler()

X = df.drop(
    columns=[
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native-country",
    ]
)
y = df["income"]
X = minMax_Scaled.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [336]:
# init models -- Suggest Hyperparameter tuning
lr_model = LogisticRegression()  # dan
nb_model = GaussianNB()  # julio
svc_model = LinearSVC()  # julio
rfc_model = RandomForestClassifier()
dtc_model = DecisionTreeClassifier()
# KNN dan
minMax_Scaled = MinMaxScaler()

In [339]:
# Bagging
bagging_model = BaggingClassifier(estimator=rfc_model, n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred = bagging_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

bagging_model = BaggingClassifier(estimator=dtc_model, n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred = bagging_model.predict(X_test)
accuracy2 = accuracy_score(y_test, y_pred)

In [344]:
print(f"rfc_model: {accuracy}")
print(f"dtc_model: {accuracy2}")

rfc_model: 1.0
dtc_model: 1.0


In [None]:
# Boosting
boosting_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, random_state=42
)
boosting_model.fit(X_train, y_train)
y_pred = boosting_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Boosting Model Accuracy: {accuracy:.2f}")

In [None]:
# Stacking
# We use multiple models so add the ones you wantto use in an array first - save the final estimator for the end
level1_models = []
# Define the final estimator (meta-learner) for the second level
final_estimator = (
    LogisticRegression()
)  # for example. Can use anything else - maybe try some hyperparameter tuning first?

stacking_model = StackingClassifier(
    estimators=level1_models, final_estimator=final_estimator, cv=5
)
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Stacking Model Accuracy: {accuracy:.2f}")

In [None]:
# Voting / Majority Method
voting_model = VotingClassifier(
    estimators=level1_models, voting="hard"
)  # Hard voting for classification - SOFT is regression
voting_model.fit(X_train, y_train)
y_pred = voting_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Majority Voting Model Accuracy: {accuracy:.2f}")