In [1]:
'''
Perform the following operations using Python on census bureau databset(Adult data sets)  
m. Data cleaning(Remove NA, ?, Negative values etc.)  
n. Error correcting(Outlier detection and removal)  
o. Data transformation  
p. Build Data model using regression and Naïve Bayes methods for prediction of 
income category (>=50k or <=50k) and compare accuracy Prediction.  
'''

'\nPerform the following operations using Python on census bureau databset(Adult data sets)  \nm. Data cleaning(Remove NA, ?, Negative values etc.)  \nn. Error correcting(Outlier detection and removal)  \no. Data transformation  \np. Build Data model using regression and Naïve Bayes methods for prediction of \nincome category (>=50k or <=50k) and compare accuracy Prediction.  \n'

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import zscore

In [9]:
data = pd.read_csv(r'DSBDALExam DataSets\Adult\adult.csv')

In [10]:
data.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [12]:
# Define column names as per the dataset description
data.columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

In [14]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [15]:
# Data Cleaning

# Drop rows with missing values
data.dropna(inplace=True)

# Convert numerical columns to correct datatype
numeric_cols = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric)

# Remove negative values (not expected, but safe to check)
data = data[(data[numeric_cols] >= 0).all(axis=1)]

In [16]:
# Outlier Detection using Z-score

z_scores = np.abs(zscore(data[numeric_cols]))
data = data[(z_scores < 3).all(axis=1)]  # Keep rows where Z-score is < 3 (within 3 std deviations)


In [19]:
# Data Transformation

label_encoders = {}
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Separate features and target
X = data.drop('income', axis=1)
y = data['income']  # 0: <=50K, 1: >50K (LabelEncoded)

# Standardize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [20]:
# Model Building

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Logistic Regression Model
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

# Naive Bayes Model
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

In [21]:
print("\nLogistic Regression Accuracy: {:.2f}%".format(acc_lr * 100))
print("Naive Bayes Accuracy: {:.2f}%".format(acc_nb * 100))

print("\nLogistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Logistic Regression Accuracy: 82.87%
Naive Bayes Accuracy: 78.96%

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.95      0.90      4639
           1       0.70      0.40      0.51      1327

    accuracy                           0.83      5966
   macro avg       0.77      0.68      0.70      5966
weighted avg       0.81      0.83      0.81      5966


Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.81      0.86      4639
           1       0.52      0.73      0.61      1327

    accuracy                           0.79      5966
   macro avg       0.72      0.77      0.73      5966
weighted avg       0.83      0.79      0.80      5966

