All the imports and libraries used below

In [33]:
# UCI ML repo imports
from ucimlrepo import fetch_ucirepo

# sk-learn imports
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

# Pandas and Numpy
import pandas as pd
import numpy as np

Preprocessing Data Section

In [43]:
# Preprocessing Data

adult = fetch_ucirepo(id=2)

X = adult.data['features'] #imports the features as pd dataframes
y = adult.data['targets'] # imports the targets as pd dataframes
                                      # 0     1         2      3             4               5              6            7           8    9     10              11            12               13
# print(X.features.head()) # Features: age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country
# print(y.features.head()) # does the person make more than $50k per year
# print(X.dtypes) # Assess the data types

categorical_cols = X.select_dtypes(include='object').columns

# Viewing the Data Set
# print(categorical_cols)

# for col in categorical_cols: # shows the unique values in each column
#     print(f"Column: {col}") # column name
#     print(X[col].unique()) #prints unique values in the column
#     print("-" * 40) # seperator

for col in categorical_cols:
  # Clean the Data
  X.loc[:, col] = X[col].replace('?', np.nan) # Replace '?' with NaN using .loc
  X.loc[:, col] = X[col].fillna(X[col].mode()[0]) # Fill missing values with mode using .loc
  y = y.loc[X.index]  # Ensure target matches

X_encoded = pd.get_dummies(X, columns=categorical_cols) # encodes the categorical columns
#print(X_encoded)
# print(X_encoded.dtypes)

scale = StandardScaler() # use normalization to better maintain outliers, which we care about in classification
X_scaled = scale.fit_transform(X_encoded) # scales the data

# Select 3 Features
age_class_education = X_scaled[:, [0, 4, 5, 6, 7, 9, 10, 12 ]] # selects the features of age class and education

Using Logistic Regression to Train the Initial Models

In [44]:
# Initial Model Training Using 3 Variables

# All Features
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
logistic_model = LogisticRegression().fit(X_train, y_train) # calls logistic regression model on X_train
y_pred = logistic_model.predict(X_test)

# 3 Select Features
X_train_ace, X_test_ace, y_train_ace, y_test_ace = train_test_split(age_class_education, y, test_size=0.2, random_state=42)

logistic_model_ace = LogisticRegression().fit(X_train_ace, y_train_ace) # calls logistic regression model on X_train_ace
y_pred_ace = logistic_model_ace.predict(X_test_ace)

acc = accuracy_score(y_test, y_pred) # calculates the accuracy of the whole model
acc_ace = accuracy_score(y_test_ace, y_pred_ace) # calculates the accuracy of the 3 feature models


print(f"Accuracy for all features: {acc:.4f}")
print(f"Accuracy for age, class, and education: {acc_ace:.4f}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy for all features: 0.5645
Accuracy for age, class, and education: 0.5104


Using K-Fold to check Accuracy of Our Model

In [47]:
# Training Using K-Fold

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# All Features
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

# Evaluate using cross_val_score
scores = cross_val_score(pipeline, X_encoded, y, cv=kfold, scoring='accuracy')
scores_ace = cross_val_score(pipeline, age_class_education, y, cv=kfold, scoring='accuracy')

# Report results
print(f"\nMean accuracy across folds: {scores.mean():.4f}")
print(f"Fold scores: {scores}")

print(f"\nMean accuracy across folds: {scores_ace.mean():.4f}")
print(f"Fold scores: {scores_ace}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



Mean accuracy across folds: 0.5670
Fold scores: [0.56710001 0.56822602 0.56439394 0.56552007 0.56981982]

Mean accuracy across folds: 0.5109
Fold scores: [0.51161838 0.50793326 0.51269451 0.50972563 0.51238739]


Model Testing

In [None]:
# Results and Analysis

